File Coverage

lib/Bio/Roary/SplitGroups.pm

Criterion	Covered	Total	%
statement	183	196	93.3
branch	32	34	94.1
condition	3	3	100.0
subroutine	23	24	95.8
pod	0	1	0.0
total	241	258	93.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Roary::SplitGroups;
2							$Bio::Roary::SplitGroups::VERSION = '3.10.1';
3							# ABSTRACT: split groups
4
5
6	2			2		87499	use Moose;
	2					381634
	2					16
7	2			2		13559	use Bio::Roary::AnalyseGroups;
	2					5
	2					96
8	2			2		16	use File::Path qw(make_path remove_tree);
	2					3
	2					176
9	2			2		446	use File::Copy qw(move);
	2					1823
	2					97
10	2			2		433	use File::Temp;
	2					14146
	2					134
11	2			2		12	use File::Basename;
	2					3
	2					119
12	2			2		882	use File::Slurper 'read_lines';
	2					26002
	2					123
13	2			2		17	use Cwd;
	2					4
	2					3412
14
15
16							has 'groupfile' => ( is => 'ro', isa => 'Str', required => 1 );
17							has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
18							has 'outfile' => ( is => 'ro', isa => 'Str', required => 1 );
19							has 'iterations' => ( is => 'ro', isa => 'Int', default => 5 );
20							has 'dont_delete' => ( is => 'ro', isa => 'Bool', default => 0 );
21
22							has '_neighbourhood_size' => ( is => 'ro', isa => 'Int', default => 5 );
23
24							has '_group_filelist' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
25							has '_tmp_dir_object' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
26							has '_tmp_dir' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__tmp_dir' );
27
28							has '_analyse_groups_obj' => ( is => 'ro', lazy_build => 1 );
29							has '_genes_to_files' => ( is => 'ro', lazy_build => 1 );
30							has '_genes_to_groups' => ( is => 'rw', isa => 'HashRef' );
31
32							has '_first_gene_of_group_which_doesnt_have_paralogs' => ( is => 'rw', isa => 'HashRef', default => sub {{}} );
33
34							has '_genes_to_neighbourhood' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_neighbourhood' );
35
36
37							has '_gene_files_temp_dir_obj' =>
38							( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
39
40
41							has '_do_sorting' => ( is => 'rw', isa => 'Bool', default => 0 ); # set to 1 for testing only
42
43							sub _build__tmp_dir {
44	4			4		8	my ($self) = @_;
45	4					99	return $self->_tmp_dir_object->dirname();
46							}
47
48							sub _build__analyse_groups_obj {
49	4			4		9	my ( $self ) = @_;
50
51	4					89	return Bio::Roary::AnalyseGroups->new(
52							fasta_files => $self->fasta_files,
53							groups_filename => $self->groupfile
54							);
55							}
56
57							sub _build__genes_to_files {
58	4			4		7	my ( $self ) = @_;
59	4					88	return $self->_analyse_groups_obj->_genes_to_file;
60							}
61
62							sub _build__group_filelist {
63	4			4		8	my ( $self ) = @_;
64	4					106	my $tmp = $self->_tmp_dir;
65
66	4					90	my @filelist = ( $self->groupfile );
67	4					76	for my $i ( 1..($self->iterations - 1) ){
68	16					41	push( @filelist, "$tmp/group_$i" );
69							}
70	4					81	push( @filelist, $self->outfile );
71
72	4					98	return \@filelist;
73							}
74
75							sub _build__genes_to_neighbourhood
76							{
77	4			4		11	my ( $self ) = @_;
78	4					7	my %genes_to_neighbourhood;
79	4					7	for my $fasta_file( @{$self->fasta_files})
	4					115
80							{
81	9					468	my ( $filename, $directories, $suffix ) = fileparse( $fasta_file, qr/\.[^.]*/ );
82	9					233	system('grep \> '.$fasta_file.'\| sed \'s/>//\' >'.$self->_gene_files_temp_dir_obj."/".$filename.$suffix ) ;
83
84	9					43109	my @genes = read_lines($self->_gene_files_temp_dir_obj."/".$filename.$suffix );
85
86	9					1445	for(my $i =0; $i< @genes; $i++)
87							{
88	225					4500	for(my $offset = 1; $offset <= $self->_neighbourhood_size; $offset++)
89							{
90	1125	100				1645	if($i -$offset >= 0)
91							{
92	990					904	push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i - $offset ]);
	990					2626
93							}
94	1125	100				4381	if($i +$offset <@genes)
95							{
96	990					897	push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i + $offset ]);
	990					19601
97							}
98							}
99							}
100							}
101	4					107	return \%genes_to_neighbourhood;
102							}
103
104							sub split_groups {
105	4			4	0	1495	my ( $self ) = @_;
106
107							# iteratively
108	4					130	for my $x ( 0..($self->iterations - 1) ){
109	10					38	my ( $in_groups, $out_groups ) = $self->_get_files_for_iteration( $x );
110
111							# read in groups, check paralogs and split
112	10					19	my @newgroups;
113	10					14	my $any_paralogs = 0;
114	10					24	$self->_set_genes_to_groups( $in_groups );
115	10					237	open( my $group_handle, '<', $in_groups );
116	10					80	while( my $line = <$group_handle> ){
117	139					570	my @group = split( /\s+/, $line );
118
119	139	100				3412	if($self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]})
		100
		100
120							{
121	75					237	push( @newgroups, \@group );
122							}
123							elsif(@group == 1)
124							{
125	1					25	$self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
126	1					6	push( @newgroups, \@group );
127							}
128							elsif( $self->_contains_paralogs( \@group ) ){
129	6					6	my @true_orthologs = @{ $self->_true_orthologs( \@group ) };
	6					19
130	6					14	push( @newgroups, @true_orthologs);
131	6					79	$any_paralogs = 1;
132							}
133							else {
134	57					1370	$self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
135	57					263	push( @newgroups, \@group );
136							}
137							}
138	10					60	close( $group_handle );
139
140							# check if next iteration required, move output if not
141	10	100				22	unless ($any_paralogs){
142	4					114	move $in_groups, $self->outfile; # input file will be the same as new output file if no splitting has been performed
143	4					360	last;
144							}
145
146							# write split groups to file
147	6					457	open( my $outfile_handle, '>', $out_groups );
148	6					23	for my $g ( @newgroups ) {
149	88					84	my $group_str = join( "\t", @{ $g } ) . "\n";
	88					150
150	88					155	print $outfile_handle $group_str;
151							}
152	6					235	close( $outfile_handle );
153							}
154							}
155
156							sub _set_genes_to_groups {
157	10			10		18	my ( $self, $groupfile ) = @_;
158
159	10					14	my %genes2groups;
160	10					11	my $c = 0;
161	10					243	open( my $gfh, '<', $groupfile );
162	10					111	while( my $line = <$gfh> ){
163	139					178	chomp $line;
164	139					357	my @genes = split( /\s+/, $line );
165	139					193	for my $g ( @genes ){
166	311					509	$genes2groups{$g} = $c;
167							}
168	139					353	$c++;
169							}
170	10					45	close($gfh);
171	10					272	$self->_genes_to_groups( \%genes2groups );
172							}
173
174							sub _update_genes_to_groups {
175	0			0		0	my ( $self, $groups ) = @_;
176
177	0					0	my %genes2groups = %{ $self->_genes_to_groups };
	0					0
178	0					0	my $c = 1;
179	0					0	for my $g ( @{ $groups } ){
	0					0
180	0					0	for my $h ( @{ $g } ){
	0					0
181	0					0	$genes2groups{$h} .= ".$c";
182							}
183	0					0	$c++;
184							}
185
186	0					0	$self->_genes_to_groups( \%genes2groups );
187							}
188
189							sub _get_files_for_iteration {
190	10			10		24	my ( $self, $n ) = @_;
191	10					15	my @filelist = @{ $self->_group_filelist };
	10					273
192	10					37	return ( $filelist[$n], $filelist[$n+1] );
193							}
194
195							sub _contains_paralogs {
196	63			63		105	my ( $self, $group ) = @_;
197
198	63	100				87	return 1 if defined $self->_find_paralogs( $group );
199	57					102	return 0;
200							}
201
202							sub _find_paralogs {
203	69			69		83	my ( $self, $group ) = @_;
204
205	69					70	my %occ;
206	69					64	for my $gene ( @{ $group } ){
	69					141
207	182					3249	my $gene_file = $self->_genes_to_files->{ $gene };
208	182					197	push( @{ $occ{$gene_file} }, $gene );
	182					429
209							}
210
211							# pick the smallest number of paralogs
212	69					87	my $smallest_number = 1000000;
213	69					63	my $smallest_group;
214	69					113	for my $v ( values %occ ){
215	148					136	my $v_len = scalar( @{$v} );
	148					156
216	148	100	100			421	if ( $v_len < $smallest_number && $v_len > 1 ){
217	14					17	$smallest_number = $v_len;
218	14					24	$smallest_group = $v;
219							}
220							}
221	69	100				146	return $smallest_group if ( defined $smallest_group );
222
223	57					144	return undef;
224							}
225
226							sub _true_orthologs {
227	6			6		13	my ( $self, $group ) = @_;
228
229							# first, create CGN hash for group
230	6					8	my %cgns;
231	6					9	for my $g ( @{ $group } ){
	6					15
232	31					68	$cgns{$g} = $self->_parse_gene_neighbourhood( $g );
233							}
234
235							# finding paralogs in the group
236	6					12	my @paralogs = @{ $self->_find_paralogs( $group ) };
	6					22
237	6					10	my @paralog_cgns_groups;
238	6					13	for my $p ( @paralogs ){
239	12					13	my %paralog_groups ;
240	12					14	for my $paralog_gene (@{$cgns{$p}})
	12					22
241							{
242	112					2107	my $gene_paralog_group = $self->_genes_to_groups->{$paralog_gene};
243	112	100				188	next unless( defined($gene_paralog_group));
244	74					1411	$paralog_groups{$self->_genes_to_groups->{$paralog_gene}}++;
245							}
246	12					30	push( @paralog_cgns_groups, \%paralog_groups );
247							}
248
249							# create data structure to hold new groups
250	6					8	my @new_groups;
251	6					10	for my $p ( @paralogs ){
252	12					23	push( @new_groups, [ $p ] );
253							}
254	6					10	push( @new_groups, [] ); # extra "leftovers" array to gather genes that don't share CGN with anything
255
256							# cluster other members of the group to their closest match
257	6					15	for my $g ( @{ $group } ){
	6					10
258	31	100				52	next if ( grep {$_ eq $g} @paralogs );
	62					126
259	19					53	my $closest = $self->_closest_cgn( $cgns{$g}, \@paralog_cgns_groups );
260	19					22	push( @{ $new_groups[$closest] }, $g );
	19					50
261							}
262
263							# check for "leftovers", remove if absent
264	6					8	my $last = pop @new_groups;
265	6	100				16	push( @new_groups, $last ) if ( @$last > 0 );
266
267							# sort
268	6	50				136	if ( $self->_do_sorting ){
269	6					8	my @sorted_new_groups;
270	6					13	for my $gr ( @new_groups ){
271	13					16	my @s_gr = sort @{ $gr };
	13					49
272	13					29	push( @sorted_new_groups, \@s_gr );
273							}
274	6					42	return \@sorted_new_groups;
275							}
276
277	0					0	return \@new_groups;
278							}
279
280							sub _closest_cgn {
281	19			19		36	my ( $self, $cgn, $p_cgns ) = @_;
282
283	19					21	my @paralog_cgns = @{ $p_cgns };
	19					31
284	19					27	my $best_score = 0;
285	19					22	my $bs_index = -1; # return -1 to add to "leftovers" array if no better score is found
286	19					44	for my $i ( 0..$#paralog_cgns ){
287	38					45	my $p_cgn = $paralog_cgns[$i];
288	38					71	my $score = $self->_shared_cgn_score( $cgn, $p_cgn );
289	38	100				80	if ( $score > $best_score ){
290	21					25	$best_score = $score;
291	21					34	$bs_index = $i;
292							}
293							}
294	19					42	return $bs_index;
295							}
296
297							sub _shared_cgn_score {
298	38			38		60	my ( $self, $cgn1, $cgn2 ) = @_;
299
300	38					38	my $total_shared = 0;
301	38					38	for my $i ( @{ $cgn1 } ){
	38					62
302	356					6930	my $input_group = $self->_genes_to_groups->{$i};
303	356	100				582	next unless(defined($input_group));
304	246	100				442	$total_shared++ if($cgn2->{$input_group});
305							}
306	38	50				61	if( (scalar @{ $cgn1 }) == 0)
	38					74
307							{
308	0					0	return 0;
309							}
310	38					43	my $score = $total_shared/scalar @{ $cgn1 };
	38					59
311	38					60	return $score;
312							}
313
314							sub _parse_gene_neighbourhood {
315	31			31		130	my ( $self, $gene_id ) = @_;
316
317	31					646	return $self->_genes_to_neighbourhood->{$gene_id };
318
319							}
320
321	2			2		17	no Moose;
	2					8
	2					22
322							__PACKAGE__->meta->make_immutable;
323							1;
324
325							__END__
326
327							=pod
328
329							=encoding UTF-8
330
331							=head1 NAME
332
333							Bio::Roary::SplitGroups - split groups
334
335							=head1 VERSION
336
337							version 3.10.1
338
339							=head1 SYNOPSIS
340
341							use Bio::Roary::SplitGroups;
342
343							=head1 AUTHOR
344
345							Andrew J. Page <ap13@sanger.ac.uk>
346
347							=head1 COPYRIGHT AND LICENSE
348
349							This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
350
351							This is free software, licensed under:
352
353							The GNU General Public License, Version 3, June 2007
354
355							=cut