File Coverage

lib/Bio/Roary/OrderGenes.pm
Criterion Covered Total %
statement 227 245 92.6
branch 28 36 77.7
condition 5 12 41.6
subroutine 18 19 94.7
pod n/a
total 278 312 89.1


line stmt bran cond sub pod time code
1             package Bio::Roary::OrderGenes;
2             $Bio::Roary::OrderGenes::VERSION = '3.10.1';
3             # ABSTRACT: Take in GFF files and create a matrix of what genes are beside what other genes
4              
5              
6 3     3   935 use Moose;
  3         6  
  3         22  
7 3     3   19381 use Bio::Roary::Exceptions;
  3         9  
  3         90  
8 3     3   19 use Bio::Roary::AnalyseGroups;
  3         7  
  3         88  
9 3     3   1176 use Bio::Roary::ContigsToGeneIDsFromGFF;
  3         13  
  3         127  
10 3     3   1935 use Graph;
  3         281517  
  3         126  
11 3     3   1142 use Graph::Writer::Dot;
  3         7634  
  3         97  
12 3     3   22 use File::Basename;
  3         5  
  3         5550  
13              
14             has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
15             has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 );
16             has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
17             has 'pan_graph_filename' => ( is => 'ro', isa => 'Str', default => 'core_accessory_graph.dot' );
18             has 'accessory_graph_filename' => ( is => 'ro', isa => 'Str', default => 'accessory_graph.dot' );
19             has 'sample_weights' => ( is => 'ro', isa => 'Maybe[HashRef]' );
20             has 'samples_to_clusters' => ( is => 'ro', isa => 'Maybe[HashRef]' );
21             has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order' );
22             has 'groups_to_sample_names' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
23             has 'group_graphs' => ( is => 'ro', isa => 'Graph', lazy => 1, builder => '_build_group_graphs' );
24             has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs' );
25             has '_groups_to_file_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_file_contigs' );
26             has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups' );
27             has 'number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_files' );
28             has '_groups_qc' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
29             has '_percentage_of_largest_weak_threshold' => ( is => 'ro', isa => 'Num', default => 0.9 );
30              
31             sub _build_number_of_files {
32 15     15   43 my ($self) = @_;
33 15         29 return @{ $self->gff_files };
  15         468  
34             }
35              
36             sub _build_groups {
37 0     0   0 my ($self) = @_;
38 0         0 my %groups;
39 0         0 for my $group_name ( @{ $self->analyse_groups_obj->_groups } ) {
  0         0  
40 0         0 $groups{$group_name}++;
41             }
42 0         0 return \%groups;
43             }
44              
45             sub _build__groups_to_file_contigs {
46 36     36   107 my ($self) = @_;
47              
48 36         122 my @overlapping_hypothetical_gene_ids;
49             my %samples_to_groups_contigs;
50              
51             # Open each GFF file
52 36         61 for my $filename ( @{ $self->gff_files } ) {
  36         1193  
53 108         379 my @groups_to_contigs;
54 108         5643 my $contigs_to_ids_obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( gff_file => $filename );
55              
56 108         6015 my ( $sample_name, $directories, $suffix ) = fileparse($filename);
57 108         1092 $sample_name =~ s/\.gff//gi;
58              
59             # Loop over each contig in the GFF file
60 108         261 for my $contig_name ( keys %{ $contigs_to_ids_obj->contig_to_ids } ) {
  108         4470  
61 189         592 my @groups_on_contig;
62              
63             # loop over each gene in each contig in the GFF file
64 189         311 for my $gene_id ( @{ $contigs_to_ids_obj->contig_to_ids->{$contig_name} } ) {
  189         6893  
65              
66             # convert to group name
67 999         26684 my $group_name = $self->analyse_groups_obj->_genes_to_groups->{$gene_id};
68 999 100       3171 next unless ( defined($group_name) );
69              
70 176 50       7384 if ( $contigs_to_ids_obj->overlapping_hypothetical_protein_ids->{$gene_id} ) {
71 0         0 $self->_groups_qc->{$group_name} =
72             'Hypothetical protein with no hits to refseq/uniprot/clusters/cdd/tigrfams/pfam overlapping another protein with hits';
73             }
74 176         581 push( @groups_on_contig, $group_name );
75             }
76 189         616 push( @groups_to_contigs, \@groups_on_contig );
77             }
78 108         6040 $samples_to_groups_contigs{$sample_name} = \@groups_to_contigs;
79             }
80              
81 36         1486 return \%samples_to_groups_contigs;
82              
83             }
84              
85             sub _build_group_order {
86 36     36   116 my ($self) = @_;
87 36         110 my %group_order;
88              
89             my %groups_to_sample_names;
90 36         65 for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
  36         1274  
91 108         3646 my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
92 108         347 for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
  108         324  
93 189         302 for ( my $i = 1 ; $i < @{$groups_on_contig} ; $i++ ) {
  320         661  
94 131         229 my $group_from = $groups_on_contig->[ $i - 1 ];
95 131         213 my $group_to = $groups_on_contig->[$i];
96              
97 131 100 66     3260 if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
98 11         220 $group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
99 11         13 push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
  11         24  
100             }
101             else {
102 120         433 $group_order{$group_from}{$group_to}++;
103             }
104             }
105 189 100       255 if ( @{$groups_on_contig} == 1 ) {
  189         465  
106 6         21 my $group_from = $groups_on_contig->[0];
107 6         19 my $group_to = $groups_on_contig->[0];
108 6 50 33     220 if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
109 0         0 $group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
110 0         0 push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
  0         0  
111             }
112             else {
113 6         35 $group_order{$group_from}{$group_to}++;
114             }
115             }
116             }
117             }
118              
119 36         1592 $self->groups_to_sample_names( \%groups_to_sample_names );
120 36         1221 return \%group_order;
121             }
122              
123             sub _build_group_graphs {
124 36     36   111 my ($self) = @_;
125 36         487 return Graph->new( undirected => 1 );
126             }
127              
128             sub _save_graph_to_file {
129 72     72   228 my ( $self, $graph, $output_filename ) = @_;
130 72         945 my $writer = Graph::Writer::Dot->new();
131 72         2780 $writer->write_graph( $graph, $output_filename );
132 72         95720 return 1;
133             }
134              
135             sub _add_groups_to_graph {
136 36     36   112 my ($self) = @_;
137              
138 36         79 for my $current_group ( keys %{ $self->group_order() } ) {
  36         1304  
139 65         22888 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  65         2023  
140 71         3841 my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
141 71         1822 $self->group_graphs->add_weighted_edge( $current_group, $group_to, $weight );
142             }
143             }
144              
145             }
146              
147             sub _reorder_connected_components {
148 72     72   260 my ( $self, $graph_groups ) = @_;
149 72         187 my @ordered_graph_groups;
150             my @paths_and_weights;
151              
152 72         138 for my $graph_group ( @{$graph_groups} ) {
  72         255  
153 29         157 my %groups;
154 29         56 $groups{$_}++ for ( @{$graph_group} );
  29         145  
155 29         75 my $edge_sum = 0;
156              
157 29         146 for my $current_group ( keys %groups ) {
158 107         163 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  107         2823  
159 97 100       327 next unless defined( $groups{$group_to} );
160 86         1965 $edge_sum += $self->group_order->{$current_group}->{$group_to};
161             }
162             }
163              
164 29         67 my %samples_in_graph;
165 29         85 for my $current_group ( keys %groups ) {
166 107         2406 my $sample_names = $self->groups_to_sample_names->{$current_group};
167 107 100       266 if ( defined($sample_names) ) {
168 11         16 for my $sample_name ( @{$sample_names} ) {
  11         20  
169 15         23 $samples_in_graph{$sample_name}++;
170             }
171             }
172             }
173 29         106 my @sample_names = sort keys %samples_in_graph;
174              
175 29 100       49 if ( @{$graph_group} == 1 ) {
  29         99  
176              
177 6         81 push(
178             @paths_and_weights,
179             {
180             path => $graph_group,
181             average_weight => $edge_sum,
182             sample_names => \@sample_names
183             }
184             );
185             }
186             else {
187 23         105 my $graph = Graph->new( undirected => 1 );
188 23         5761 for my $current_group ( keys %groups ) {
189 101         23480 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  101         2889  
190 91 100       2650 if ( $groups{$group_to} ) {
191 84         1826 my $weight = 1 / $self->group_order->{$current_group}->{$group_to};
192 84         288 $graph->add_weighted_edge( $current_group, $group_to, $weight );
193             }
194             }
195             }
196 23         5312 my $minimum_spanning_tree = $graph->minimum_spanning_tree;
197 23         70301 my $dfs_obj = Graph::Traversal::DFS->new($minimum_spanning_tree);
198 23         9543 my @reordered_dfs_groups = $dfs_obj->dfs;
199 23         48599 push(
200             @paths_and_weights,
201             {
202             path => \@reordered_dfs_groups,
203             average_weight => $edge_sum,
204             sample_names => \@sample_names
205             }
206             );
207             }
208              
209             }
210              
211 72         368 return $self->_order_by_samples_and_weights( \@paths_and_weights );
212             }
213              
214             sub _order_by_samples_and_weights {
215 73     73   229 my ( $self, $paths_and_weights ) = @_;
216              
217 73         150 my @ordered_graph_groups;
218 73 100       3428 if ( !defined( $self->samples_to_clusters ) ) {
219 72         166 my @ordered_paths_and_weights = sort { $a->{average_weight} <=> $b->{average_weight} } @{$paths_and_weights};
  5         46  
  72         302  
220 72         218 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  29         115  
221 72         406 return \@ordered_graph_groups;
222             }
223              
224             # Find the largest cluster in each graph and regroup
225 1         4 my %largest_cluster_to_paths_and_weights;
226 1         2 for my $graph_details ( @{$paths_and_weights} ) {
  1         6  
227 3         7 my %cluster_count;
228 3         3 for my $sample_name ( @{ $graph_details->{sample_names} } ) {
  3         7  
229 6 50       100 if ( defined( $self->samples_to_clusters->{$sample_name} ) ) {
230 6         98 $cluster_count{ $self->samples_to_clusters->{$sample_name} }++;
231             }
232             }
233 3 0       11 my $largest_cluster = ( sort { $cluster_count{$b} <=> $cluster_count{$a} || $a cmp $b} keys %cluster_count )[0];
  0         0  
234 3 50       7 if ( !defined($largest_cluster) ) {
235 0         0 my @ordered_paths_and_weights = sort { $b->{average_weight} <=> $a->{average_weight} } @{$paths_and_weights};
  0         0  
  0         0  
236 0         0 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  0         0  
237 0         0 return \@ordered_graph_groups;
238             }
239              
240 3         5 push( @{ $largest_cluster_to_paths_and_weights{$largest_cluster}{graph_details} }, $graph_details );
  3         7  
241 3         7 $largest_cluster_to_paths_and_weights{$largest_cluster}{largest_cluster_size} += $cluster_count{$largest_cluster};
242             }
243              
244             # go through each cluster group and order by weight
245 1         2 my @clustered_ordered_graph_groups;
246 1         34 for my $cluster_name (
247             sort {
248             $largest_cluster_to_paths_and_weights{$b}->{largest_cluster_size}
249             <=> $largest_cluster_to_paths_and_weights{$a}->{largest_cluster_size}
250 1         7 } keys %largest_cluster_to_paths_and_weights
251             )
252             {
253            
254             my @ordered_paths_and_weights =
255 2         7 sort { $b->{average_weight} <=> $a->{average_weight} } @{ $largest_cluster_to_paths_and_weights{$cluster_name}->{graph_details} };
  1         5  
  2         6  
256 2         5 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  3         8  
257              
258 2         4 for my $graph_group (@ordered_graph_groups) {
259 3         8 push( @clustered_ordered_graph_groups, $graph_group );
260             }
261             }
262 1         17 return \@clustered_ordered_graph_groups;
263             }
264              
265             sub _build_groups_to_contigs {
266 36     36   118 my ($self) = @_;
267 36         273 $self->_add_groups_to_graph;
268              
269 36         6056 my %groups_to_contigs;
270 36         162 my $counter = 1;
271 36         81 my $overall_counter = 1;
272 36         79 my $counter_filtered = 1;
273              
274             # Accessory
275 36         284 my $accessory_graph = $self->_create_accessory_graph;
276 36         396 my @group_graphs = $accessory_graph->connected_components();
277 36         45134 my $reordered_graphs = $self->_reorder_connected_components( \@group_graphs );
278              
279 36         1410 $self->_save_graph_to_file( $accessory_graph, $self->accessory_graph_filename );
280              
281 36         91 for my $contig_groups ( @{$reordered_graphs} ) {
  36         215  
282 14         38 my $order_counter = 1;
283              
284 14         31 for my $group_name ( @{$contig_groups} ) {
  14         43  
285 29         87 $groups_to_contigs{$group_name}{accessory_label} = $counter;
286 29         45 $groups_to_contigs{$group_name}{accessory_order} = $order_counter;
287 29         57 $groups_to_contigs{$group_name}{'accessory_overall_order'} = $overall_counter;
288 29         41 $order_counter++;
289 29         52 $overall_counter++;
290             }
291 14         32 $counter++;
292             }
293              
294             # Core + accessory
295 36         1616 my @group_graphs_all = $self->group_graphs->connected_components();
296 36         63808 my $reordered_graphs_all = $self->_reorder_connected_components( \@group_graphs_all );
297 36         1767 $self->_save_graph_to_file( $self->group_graphs, $self->pan_graph_filename );
298              
299 36         110 $overall_counter = 1;
300 36         78 $counter = 1;
301 36         105 $counter_filtered = 1;
302 36         74 for my $contig_groups ( @{$reordered_graphs_all} ) {
  36         165  
303 15         38 my $order_counter = 1;
304              
305 15         26 for my $group_name ( @{$contig_groups} ) {
  15         41  
306 78         235 $groups_to_contigs{$group_name}{label} = $counter;
307 78         159 $groups_to_contigs{$group_name}{comment} = '';
308 78         119 $groups_to_contigs{$group_name}{order} = $order_counter;
309 78         124 $groups_to_contigs{$group_name}{'core_accessory_overall_order'} = $overall_counter;
310              
311 78 100       113 if ( @{$contig_groups} <= 2 ) {
  78 50       1923  
312 2         10 $groups_to_contigs{$group_name}{comment} = 'Investigate';
313             }
314             elsif ( $self->_groups_qc->{$group_name} ) {
315 0         0 $groups_to_contigs{$group_name}{comment} = $self->_groups_qc->{$group_name};
316             }
317             else {
318 76         159 $groups_to_contigs{$group_name}{'core_accessory_overall_order_filtered'} = $counter_filtered;
319 76         109 $counter_filtered++;
320             }
321 78         109 $order_counter++;
322 78         115 $overall_counter++;
323             }
324 15         38 $counter++;
325             }
326              
327 36         76 $counter_filtered = 1;
328 36         85 for my $contig_groups ( @{$reordered_graphs} ) {
  36         100  
329 14         30 for my $group_name ( @{$contig_groups} ) {
  14         31  
330 29 50 33     219 if ( ( !defined( $groups_to_contigs{$group_name}{comment} ) )
      33        
331             || ( defined( $groups_to_contigs{$group_name}{comment} ) && $groups_to_contigs{$group_name}{comment} eq '' ) )
332             {
333 29         53 $groups_to_contigs{$group_name}{'accessory_overall_order_filtered'} = $counter_filtered;
334 29         45 $counter_filtered++;
335             }
336             }
337             }
338              
339 36         1740 return \%groups_to_contigs;
340             }
341              
342             sub _create_accessory_graph {
343 36     36   133 my ($self) = @_;
344 36         533 my $graph = Graph->new( undirected => 1 );
345              
346 36         14224 my %core_groups;
347             my %group_freq;
348              
349 36         90 for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
  36         1368  
350 108         3284 my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
351              
352 108         214 for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
  108         322  
353 189         343 for my $current_group ( @{$groups_on_contig} ) {
  189         427  
354 176         402 $group_freq{$current_group}++;
355             }
356             }
357             }
358              
359 36         191 for my $current_group ( keys %{ $self->group_order() } ) {
  36         1220  
360 65 100       8176 next if ( $group_freq{$current_group} >= ( $self->number_of_files * $self->core_definition ) );
361            
362 26         55 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  26         576  
363 26 100       571 if ( $group_freq{$group_to} >= ( $self->number_of_files * $self->core_definition ) ) {
364 11         51 $graph->add_vertex($current_group);
365             }
366             else {
367 15         328 my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
368 15         75 $graph->add_weighted_edge( $current_group, $group_to, $weight );
369             }
370             }
371             }
372              
373 36         465 return $graph;
374             }
375              
376 3     3   28 no Moose;
  3         5  
  3         35  
377             __PACKAGE__->meta->make_immutable;
378              
379             1;
380              
381             __END__
382              
383             =pod
384              
385             =encoding UTF-8
386              
387             =head1 NAME
388              
389             Bio::Roary::OrderGenes - Take in GFF files and create a matrix of what genes are beside what other genes
390              
391             =head1 VERSION
392              
393             version 3.10.1
394              
395             =head1 SYNOPSIS
396              
397             Take in the analyse groups and create a matrix of what genes are beside what other genes
398             use Bio::Roary::OrderGenes;
399              
400             my $obj = Bio::Roary::OrderGenes->new(
401             analyse_groups_obj => $analyse_groups_obj,
402             gff_files => ['file1.gff','file2.gff']
403             );
404             $obj->groups_to_contigs;
405              
406             =head1 AUTHOR
407              
408             Andrew J. Page <ap13@sanger.ac.uk>
409              
410             =head1 COPYRIGHT AND LICENSE
411              
412             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
413              
414             This is free software, licensed under:
415              
416             The GNU General Public License, Version 3, June 2007
417              
418             =cut