File Coverage

lib/Bio/Roary/SplitGroups.pm

Criterion	Covered	Total	%
statement	183	196	93.3
branch	32	34	94.1
condition	3	3	100.0
subroutine	23	24	95.8
pod	0	1	0.0
total	241	258	93.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Roary::SplitGroups;
2							$Bio::Roary::SplitGroups::VERSION = '3.11.0';
3							# ABSTRACT: split groups
4
5
6	2			2		81994	use Moose;
	2					375658
	2					12
7	2			2		12033	use Bio::Roary::AnalyseGroups;
	2					6
	2					85
8	2			2		26	use File::Path qw(make_path remove_tree);
	2					4
	2					146
9	2			2		438	use File::Copy qw(move);
	2					1785
	2					98
10	2			2		427	use File::Temp;
	2					13556
	2					152
11	2			2		14	use File::Basename;
	2					3
	2					113
12	2			2		506	use File::Slurper 'read_lines';
	2					22374
	2					95
13	2			2		13	use Cwd;
	2					3
	2					2938
14
15
16							has 'groupfile' => ( is => 'ro', isa => 'Str', required => 1 );
17							has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
18							has 'outfile' => ( is => 'ro', isa => 'Str', required => 1 );
19							has 'iterations' => ( is => 'ro', isa => 'Int', default => 5 );
20							has 'dont_delete' => ( is => 'ro', isa => 'Bool', default => 0 );
21
22							has '_neighbourhood_size' => ( is => 'ro', isa => 'Int', default => 5 );
23
24							has '_group_filelist' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
25							has '_tmp_dir_object' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
26							has '_tmp_dir' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__tmp_dir' );
27
28							has '_analyse_groups_obj' => ( is => 'ro', lazy_build => 1 );
29							has '_genes_to_files' => ( is => 'ro', lazy_build => 1 );
30							has '_genes_to_groups' => ( is => 'rw', isa => 'HashRef' );
31
32							has '_first_gene_of_group_which_doesnt_have_paralogs' => ( is => 'rw', isa => 'HashRef', default => sub {{}} );
33
34							has '_genes_to_neighbourhood' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_neighbourhood' );
35
36
37							has '_gene_files_temp_dir_obj' =>
38							( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
39
40
41							has '_do_sorting' => ( is => 'rw', isa => 'Bool', default => 0 ); # set to 1 for testing only
42
43							sub _build__tmp_dir {
44	4			4		8	my ($self) = @_;
45	4					76	return $self->_tmp_dir_object->dirname();
46							}
47
48							sub _build__analyse_groups_obj {
49	4			4		7	my ( $self ) = @_;
50
51	4					72	return Bio::Roary::AnalyseGroups->new(
52							fasta_files => $self->fasta_files,
53							groups_filename => $self->groupfile
54							);
55							}
56
57							sub _build__genes_to_files {
58	4			4		8	my ( $self ) = @_;
59	4					83	return $self->_analyse_groups_obj->_genes_to_file;
60							}
61
62							sub _build__group_filelist {
63	4			4		11	my ( $self ) = @_;
64	4					82	my $tmp = $self->_tmp_dir;
65
66	4					80	my @filelist = ( $self->groupfile );
67	4					65	for my $i ( 1..($self->iterations - 1) ){
68	16					33	push( @filelist, "$tmp/group_$i" );
69							}
70	4					70	push( @filelist, $self->outfile );
71
72	4					70	return \@filelist;
73							}
74
75							sub _build__genes_to_neighbourhood
76							{
77	4			4		10	my ( $self ) = @_;
78	4					4	my %genes_to_neighbourhood;
79	4					5	for my $fasta_file( @{$self->fasta_files})
	4					79
80							{
81	9					425	my ( $filename, $directories, $suffix ) = fileparse( $fasta_file, qr/\.[^.]*/ );
82	9					235	system('grep \> '.$fasta_file.'\| sed \'s/>//\' >'.$self->_gene_files_temp_dir_obj."/".$filename.$suffix ) ;
83
84	9					30938	my @genes = read_lines($self->_gene_files_temp_dir_obj."/".$filename.$suffix );
85
86	9					1262	for(my $i =0; $i< @genes; $i++)
87							{
88	225					4015	for(my $offset = 1; $offset <= $self->_neighbourhood_size; $offset++)
89							{
90	1125	100				1444	if($i -$offset >= 0)
91							{
92	990					858	push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i - $offset ]);
	990					2452
93							}
94	1125	100				4067	if($i +$offset <@genes)
95							{
96	990					833	push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i + $offset ]);
	990					18675
97							}
98							}
99							}
100							}
101	4					99	return \%genes_to_neighbourhood;
102							}
103
104							sub split_groups {
105	4			4	0	1411	my ( $self ) = @_;
106
107							# iteratively
108	4					124	for my $x ( 0..($self->iterations - 1) ){
109	10					37	my ( $in_groups, $out_groups ) = $self->_get_files_for_iteration( $x );
110
111							# read in groups, check paralogs and split
112	10					11	my @newgroups;
113	10					13	my $any_paralogs = 0;
114	10					26	$self->_set_genes_to_groups( $in_groups );
115	10					178	open( my $group_handle, '<', $in_groups );
116	10					72	while( my $line = <$group_handle> ){
117	139					485	my @group = split( /\s+/, $line );
118
119	139	100				3053	if($self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]})
		100
		100
120							{
121	75					202	push( @newgroups, \@group );
122							}
123							elsif(@group == 1)
124							{
125	1					26	$self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
126	1					4	push( @newgroups, \@group );
127							}
128							elsif( $self->_contains_paralogs( \@group ) ){
129	6					7	my @true_orthologs = @{ $self->_true_orthologs( \@group ) };
	6					25
130	6					12	push( @newgroups, @true_orthologs);
131	6					65	$any_paralogs = 1;
132							}
133							else {
134	57					1288	$self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
135	57					247	push( @newgroups, \@group );
136							}
137							}
138	10					51	close( $group_handle );
139
140							# check if next iteration required, move output if not
141	10	100				22	unless ($any_paralogs){
142	4					88	move $in_groups, $self->outfile; # input file will be the same as new output file if no splitting has been performed
143	4					346	last;
144							}
145
146							# write split groups to file
147	6					314	open( my $outfile_handle, '>', $out_groups );
148	6					19	for my $g ( @newgroups ) {
149	88					86	my $group_str = join( "\t", @{ $g } ) . "\n";
	88					127
150	88					131	print $outfile_handle $group_str;
151							}
152	6					176	close( $outfile_handle );
153							}
154							}
155
156							sub _set_genes_to_groups {
157	10			10		13	my ( $self, $groupfile ) = @_;
158
159	10					12	my %genes2groups;
160	10					9	my $c = 0;
161	10					228	open( my $gfh, '<', $groupfile );
162	10					110	while( my $line = <$gfh> ){
163	139					178	chomp $line;
164	139					321	my @genes = split( /\s+/, $line );
165	139					174	for my $g ( @genes ){
166	311					432	$genes2groups{$g} = $c;
167							}
168	139					305	$c++;
169							}
170	10					52	close($gfh);
171	10					262	$self->_genes_to_groups( \%genes2groups );
172							}
173
174							sub _update_genes_to_groups {
175	0			0		0	my ( $self, $groups ) = @_;
176
177	0					0	my %genes2groups = %{ $self->_genes_to_groups };
	0					0
178	0					0	my $c = 1;
179	0					0	for my $g ( @{ $groups } ){
	0					0
180	0					0	for my $h ( @{ $g } ){
	0					0
181	0					0	$genes2groups{$h} .= ".$c";
182							}
183	0					0	$c++;
184							}
185
186	0					0	$self->_genes_to_groups( \%genes2groups );
187							}
188
189							sub _get_files_for_iteration {
190	10			10		23	my ( $self, $n ) = @_;
191	10					14	my @filelist = @{ $self->_group_filelist };
	10					255
192	10					31	return ( $filelist[$n], $filelist[$n+1] );
193							}
194
195							sub _contains_paralogs {
196	63			63		90	my ( $self, $group ) = @_;
197
198	63	100				79	return 1 if defined $self->_find_paralogs( $group );
199	57					93	return 0;
200							}
201
202							sub _find_paralogs {
203	69			69		76	my ( $self, $group ) = @_;
204
205	69					62	my %occ;
206	69					61	for my $gene ( @{ $group } ){
	69					102
207	182					3214	my $gene_file = $self->_genes_to_files->{ $gene };
208	182					186	push( @{ $occ{$gene_file} }, $gene );
	182					423
209							}
210
211							# pick the smallest number of paralogs
212	69					91	my $smallest_number = 1000000;
213	69					88	my $smallest_group;
214	69					126	for my $v ( values %occ ){
215	148					139	my $v_len = scalar( @{$v} );
	148					142
216	148	100	100			392	if ( $v_len < $smallest_number && $v_len > 1 ){
217	14					17	$smallest_number = $v_len;
218	14					15	$smallest_group = $v;
219							}
220							}
221	69	100				131	return $smallest_group if ( defined $smallest_group );
222
223	57					137	return undef;
224							}
225
226							sub _true_orthologs {
227	6			6		14	my ( $self, $group ) = @_;
228
229							# first, create CGN hash for group
230	6					10	my %cgns;
231	6					6	for my $g ( @{ $group } ){
	6					13
232	31					55	$cgns{$g} = $self->_parse_gene_neighbourhood( $g );
233							}
234
235							# finding paralogs in the group
236	6					12	my @paralogs = @{ $self->_find_paralogs( $group ) };
	6					16
237	6					8	my @paralog_cgns_groups;
238	6					15	for my $p ( @paralogs ){
239	12					21	my %paralog_groups ;
240	12					11	for my $paralog_gene (@{$cgns{$p}})
	12					20
241							{
242	112					1996	my $gene_paralog_group = $self->_genes_to_groups->{$paralog_gene};
243	112	100				189	next unless( defined($gene_paralog_group));
244	74					1286	$paralog_groups{$self->_genes_to_groups->{$paralog_gene}}++;
245							}
246	12					66	push( @paralog_cgns_groups, \%paralog_groups );
247							}
248
249							# create data structure to hold new groups
250	6					7	my @new_groups;
251	6					11	for my $p ( @paralogs ){
252	12					27	push( @new_groups, [ $p ] );
253							}
254	6					9	push( @new_groups, [] ); # extra "leftovers" array to gather genes that don't share CGN with anything
255
256							# cluster other members of the group to their closest match
257	6					11	for my $g ( @{ $group } ){
	6					11
258	31	100				55	next if ( grep {$_ eq $g} @paralogs );
	62					143
259	19					51	my $closest = $self->_closest_cgn( $cgns{$g}, \@paralog_cgns_groups );
260	19					30	push( @{ $new_groups[$closest] }, $g );
	19					51
261							}
262
263							# check for "leftovers", remove if absent
264	6					9	my $last = pop @new_groups;
265	6	100				16	push( @new_groups, $last ) if ( @$last > 0 );
266
267							# sort
268	6	50				124	if ( $self->_do_sorting ){
269	6					9	my @sorted_new_groups;
270	6					11	for my $gr ( @new_groups ){
271	13					12	my @s_gr = sort @{ $gr };
	13					47
272	13					21	push( @sorted_new_groups, \@s_gr );
273							}
274	6					38	return \@sorted_new_groups;
275							}
276
277	0					0	return \@new_groups;
278							}
279
280							sub _closest_cgn {
281	19			19		34	my ( $self, $cgn, $p_cgns ) = @_;
282
283	19					20	my @paralog_cgns = @{ $p_cgns };
	19					29
284	19					19	my $best_score = 0;
285	19					25	my $bs_index = -1; # return -1 to add to "leftovers" array if no better score is found
286	19					39	for my $i ( 0..$#paralog_cgns ){
287	38					45	my $p_cgn = $paralog_cgns[$i];
288	38					70	my $score = $self->_shared_cgn_score( $cgn, $p_cgn );
289	38	100				78	if ( $score > $best_score ){
290	21					29	$best_score = $score;
291	21					30	$bs_index = $i;
292							}
293							}
294	19					38	return $bs_index;
295							}
296
297							sub _shared_cgn_score {
298	38			38		49	my ( $self, $cgn1, $cgn2 ) = @_;
299
300	38					37	my $total_shared = 0;
301	38					37	for my $i ( @{ $cgn1 } ){
	38					58
302	356					6192	my $input_group = $self->_genes_to_groups->{$i};
303	356	100				524	next unless(defined($input_group));
304	246	100				410	$total_shared++ if($cgn2->{$input_group});
305							}
306	38	50				44	if( (scalar @{ $cgn1 }) == 0)
	38					58
307							{
308	0					0	return 0;
309							}
310	38					45	my $score = $total_shared/scalar @{ $cgn1 };
	38					53
311	38					57	return $score;
312							}
313
314							sub _parse_gene_neighbourhood {
315	31			31		49	my ( $self, $gene_id ) = @_;
316
317	31					606	return $self->_genes_to_neighbourhood->{$gene_id };
318
319							}
320
321	2			2		15	no Moose;
	2					4
	2					12
322							__PACKAGE__->meta->make_immutable;
323							1;
324
325							__END__
326
327							=pod
328
329							=encoding UTF-8
330
331							=head1 NAME
332
333							Bio::Roary::SplitGroups - split groups
334
335							=head1 VERSION
336
337							version 3.11.0
338
339							=head1 SYNOPSIS
340
341							use Bio::Roary::SplitGroups;
342
343							=head1 AUTHOR
344
345							Andrew J. Page <ap13@sanger.ac.uk>
346
347							=head1 COPYRIGHT AND LICENSE
348
349							This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
350
351							This is free software, licensed under:
352
353							The GNU General Public License, Version 3, June 2007
354
355							=cut