File Coverage

lib/Bio/Roary/OrderGenes.pm
Criterion Covered Total %
statement 227 245 92.6
branch 28 36 77.7
condition 5 12 41.6
subroutine 18 19 94.7
pod n/a
total 278 312 89.1


line stmt bran cond sub pod time code
1             package Bio::Roary::OrderGenes;
2             $Bio::Roary::OrderGenes::VERSION = '3.11.0';
3             # ABSTRACT: Take in GFF files and create a matrix of what genes are beside what other genes
4              
5              
6 3     3   676 use Moose;
  3         5  
  3         21  
7 3     3   17719 use Bio::Roary::Exceptions;
  3         6  
  3         66  
8 3     3   27 use Bio::Roary::AnalyseGroups;
  3         5  
  3         53  
9 3     3   858 use Bio::Roary::ContigsToGeneIDsFromGFF;
  3         12  
  3         127  
10 3     3   1873 use Graph;
  3         230754  
  3         141  
11 3     3   1078 use Graph::Writer::Dot;
  3         8714  
  3         100  
12 3     3   24 use File::Basename;
  3         8  
  3         6691  
13              
14             has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
15             has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 );
16             has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
17             has 'pan_graph_filename' => ( is => 'ro', isa => 'Str', default => 'core_accessory_graph.dot' );
18             has 'accessory_graph_filename' => ( is => 'ro', isa => 'Str', default => 'accessory_graph.dot' );
19             has 'sample_weights' => ( is => 'ro', isa => 'Maybe[HashRef]' );
20             has 'samples_to_clusters' => ( is => 'ro', isa => 'Maybe[HashRef]' );
21             has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order' );
22             has 'groups_to_sample_names' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
23             has 'group_graphs' => ( is => 'ro', isa => 'Graph', lazy => 1, builder => '_build_group_graphs' );
24             has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs' );
25             has '_groups_to_file_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_file_contigs' );
26             has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups' );
27             has 'number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_files' );
28             has '_groups_qc' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
29             has '_percentage_of_largest_weak_threshold' => ( is => 'ro', isa => 'Num', default => 0.9 );
30              
31             sub _build_number_of_files {
32 15     15   41 my ($self) = @_;
33 15         21 return @{ $self->gff_files };
  15         340  
34             }
35              
36             sub _build_groups {
37 0     0   0 my ($self) = @_;
38 0         0 my %groups;
39 0         0 for my $group_name ( @{ $self->analyse_groups_obj->_groups } ) {
  0         0  
40 0         0 $groups{$group_name}++;
41             }
42 0         0 return \%groups;
43             }
44              
45             sub _build__groups_to_file_contigs {
46 36     36   83 my ($self) = @_;
47              
48 36         93 my @overlapping_hypothetical_gene_ids;
49             my %samples_to_groups_contigs;
50              
51             # Open each GFF file
52 36         60 for my $filename ( @{ $self->gff_files } ) {
  36         999  
53 108         679 my @groups_to_contigs;
54 108         4285 my $contigs_to_ids_obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( gff_file => $filename );
55              
56 108         4909 my ( $sample_name, $directories, $suffix ) = fileparse($filename);
57 108         715 $sample_name =~ s/\.gff//gi;
58              
59             # Loop over each contig in the GFF file
60 108         204 for my $contig_name ( keys %{ $contigs_to_ids_obj->contig_to_ids } ) {
  108         3607  
61 189         272 my @groups_on_contig;
62              
63             # loop over each gene in each contig in the GFF file
64 189         219 for my $gene_id ( @{ $contigs_to_ids_obj->contig_to_ids->{$contig_name} } ) {
  189         4947  
65              
66             # convert to group name
67 999         20430 my $group_name = $self->analyse_groups_obj->_genes_to_groups->{$gene_id};
68 999 100       2102 next unless ( defined($group_name) );
69              
70 176 50       5038 if ( $contigs_to_ids_obj->overlapping_hypothetical_protein_ids->{$gene_id} ) {
71 0         0 $self->_groups_qc->{$group_name} =
72             'Hypothetical protein with no hits to refseq/uniprot/clusters/cdd/tigrfams/pfam overlapping another protein with hits';
73             }
74 176         441 push( @groups_on_contig, $group_name );
75             }
76 189         565 push( @groups_to_contigs, \@groups_on_contig );
77             }
78 108         4641 $samples_to_groups_contigs{$sample_name} = \@groups_to_contigs;
79             }
80              
81 36         1199 return \%samples_to_groups_contigs;
82              
83             }
84              
85             sub _build_group_order {
86 36     36   115 my ($self) = @_;
87 36         100 my %group_order;
88              
89             my %groups_to_sample_names;
90 36         69 for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
  36         1055  
91 108         2780 my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
92 108         229 for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
  108         337  
93 189         257 for ( my $i = 1 ; $i < @{$groups_on_contig} ; $i++ ) {
  320         520  
94 131         205 my $group_from = $groups_on_contig->[ $i - 1 ];
95 131         174 my $group_to = $groups_on_contig->[$i];
96              
97 131 100 66     2595 if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
98 11         243 $group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
99 11         16 push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
  11         39  
100             }
101             else {
102 120         314 $group_order{$group_from}{$group_to}++;
103             }
104             }
105 189 100       192 if ( @{$groups_on_contig} == 1 ) {
  189         364  
106 6         14 my $group_from = $groups_on_contig->[0];
107 6         10 my $group_to = $groups_on_contig->[0];
108 6 50 33     124 if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
109 0         0 $group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
110 0         0 push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
  0         0  
111             }
112             else {
113 6         16 $group_order{$group_from}{$group_to}++;
114             }
115             }
116             }
117             }
118              
119 36         1199 $self->groups_to_sample_names( \%groups_to_sample_names );
120 36         899 return \%group_order;
121             }
122              
123             sub _build_group_graphs {
124 36     36   107 my ($self) = @_;
125 36         438 return Graph->new( undirected => 1 );
126             }
127              
128             sub _save_graph_to_file {
129 72     72   182 my ( $self, $graph, $output_filename ) = @_;
130 72         924 my $writer = Graph::Writer::Dot->new();
131 72         2335 $writer->write_graph( $graph, $output_filename );
132 72         77840 return 1;
133             }
134              
135             sub _add_groups_to_graph {
136 36     36   109 my ($self) = @_;
137              
138 36         59 for my $current_group ( keys %{ $self->group_order() } ) {
  36         1023  
139 65         19614 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  65         1915  
140 71         2864 my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
141 71         1755 $self->group_graphs->add_weighted_edge( $current_group, $group_to, $weight );
142             }
143             }
144              
145             }
146              
147             sub _reorder_connected_components {
148 72     72   177 my ( $self, $graph_groups ) = @_;
149 72         162 my @ordered_graph_groups;
150             my @paths_and_weights;
151              
152 72         108 for my $graph_group ( @{$graph_groups} ) {
  72         232  
153 29         57 my %groups;
154 29         41 $groups{$_}++ for ( @{$graph_group} );
  29         120  
155 29         48 my $edge_sum = 0;
156              
157 29         74 for my $current_group ( keys %groups ) {
158 107         112 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  107         2475  
159 97 100       212 next unless defined( $groups{$group_to} );
160 86         1506 $edge_sum += $self->group_order->{$current_group}->{$group_to};
161             }
162             }
163              
164 29         69 my %samples_in_graph;
165 29         76 for my $current_group ( keys %groups ) {
166 107         2046 my $sample_names = $self->groups_to_sample_names->{$current_group};
167 107 100       210 if ( defined($sample_names) ) {
168 11         15 for my $sample_name ( @{$sample_names} ) {
  11         16  
169 15         25 $samples_in_graph{$sample_name}++;
170             }
171             }
172             }
173 29         160 my @sample_names = sort keys %samples_in_graph;
174              
175 29 100       49 if ( @{$graph_group} == 1 ) {
  29         83  
176              
177 6         52 push(
178             @paths_and_weights,
179             {
180             path => $graph_group,
181             average_weight => $edge_sum,
182             sample_names => \@sample_names
183             }
184             );
185             }
186             else {
187 23         101 my $graph = Graph->new( undirected => 1 );
188 23         4908 for my $current_group ( keys %groups ) {
189 101         21859 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  101         2896  
190 91 100       1698 if ( $groups{$group_to} ) {
191 84         1554 my $weight = 1 / $self->group_order->{$current_group}->{$group_to};
192 84         249 $graph->add_weighted_edge( $current_group, $group_to, $weight );
193             }
194             }
195             }
196 23         3080 my $minimum_spanning_tree = $graph->minimum_spanning_tree;
197 23         60280 my $dfs_obj = Graph::Traversal::DFS->new($minimum_spanning_tree);
198 23         8006 my @reordered_dfs_groups = $dfs_obj->dfs;
199 23         39241 push(
200             @paths_and_weights,
201             {
202             path => \@reordered_dfs_groups,
203             average_weight => $edge_sum,
204             sample_names => \@sample_names
205             }
206             );
207             }
208              
209             }
210              
211 72         481 return $self->_order_by_samples_and_weights( \@paths_and_weights );
212             }
213              
214             sub _order_by_samples_and_weights {
215 73     73   236 my ( $self, $paths_and_weights ) = @_;
216              
217 73         140 my @ordered_graph_groups;
218 73 100       2513 if ( !defined( $self->samples_to_clusters ) ) {
219 72         153 my @ordered_paths_and_weights = sort { $a->{average_weight} <=> $b->{average_weight} } @{$paths_and_weights};
  5         31  
  72         271  
220 72         152 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  29         93  
221 72         324 return \@ordered_graph_groups;
222             }
223              
224             # Find the largest cluster in each graph and regroup
225 1         4 my %largest_cluster_to_paths_and_weights;
226 1         5 for my $graph_details ( @{$paths_and_weights} ) {
  1         11  
227 3         6 my %cluster_count;
228 3         3 for my $sample_name ( @{ $graph_details->{sample_names} } ) {
  3         6  
229 6 50       112 if ( defined( $self->samples_to_clusters->{$sample_name} ) ) {
230 6         102 $cluster_count{ $self->samples_to_clusters->{$sample_name} }++;
231             }
232             }
233 3 0       12 my $largest_cluster = ( sort { $cluster_count{$b} <=> $cluster_count{$a} || $a cmp $b} keys %cluster_count )[0];
  0         0  
234 3 50       7 if ( !defined($largest_cluster) ) {
235 0         0 my @ordered_paths_and_weights = sort { $b->{average_weight} <=> $a->{average_weight} } @{$paths_and_weights};
  0         0  
  0         0  
236 0         0 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  0         0  
237 0         0 return \@ordered_graph_groups;
238             }
239              
240 3         7 push( @{ $largest_cluster_to_paths_and_weights{$largest_cluster}{graph_details} }, $graph_details );
  3         8  
241 3         8 $largest_cluster_to_paths_and_weights{$largest_cluster}{largest_cluster_size} += $cluster_count{$largest_cluster};
242             }
243              
244             # go through each cluster group and order by weight
245 1         1 my @clustered_ordered_graph_groups;
246 1         8 for my $cluster_name (
247             sort {
248             $largest_cluster_to_paths_and_weights{$b}->{largest_cluster_size}
249             <=> $largest_cluster_to_paths_and_weights{$a}->{largest_cluster_size}
250 1         6 } keys %largest_cluster_to_paths_and_weights
251             )
252             {
253            
254             my @ordered_paths_and_weights =
255 2         4 sort { $b->{average_weight} <=> $a->{average_weight} } @{ $largest_cluster_to_paths_and_weights{$cluster_name}->{graph_details} };
  1         4  
  2         6  
256 2         3 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  3         6  
257              
258 2         4 for my $graph_group (@ordered_graph_groups) {
259 3         6 push( @clustered_ordered_graph_groups, $graph_group );
260             }
261             }
262 1         9 return \@clustered_ordered_graph_groups;
263             }
264              
265             sub _build_groups_to_contigs {
266 36     36   108 my ($self) = @_;
267 36         230 $self->_add_groups_to_graph;
268              
269 36         4497 my %groups_to_contigs;
270 36         114 my $counter = 1;
271 36         67 my $overall_counter = 1;
272 36         58 my $counter_filtered = 1;
273              
274             # Accessory
275 36         205 my $accessory_graph = $self->_create_accessory_graph;
276 36         345 my @group_graphs = $accessory_graph->connected_components();
277 36         37905 my $reordered_graphs = $self->_reorder_connected_components( \@group_graphs );
278              
279 36         1159 $self->_save_graph_to_file( $accessory_graph, $self->accessory_graph_filename );
280              
281 36         113 for my $contig_groups ( @{$reordered_graphs} ) {
  36         180  
282 14         27 my $order_counter = 1;
283              
284 14         32 for my $group_name ( @{$contig_groups} ) {
  14         29  
285 29         78 $groups_to_contigs{$group_name}{accessory_label} = $counter;
286 29         53 $groups_to_contigs{$group_name}{accessory_order} = $order_counter;
287 29         53 $groups_to_contigs{$group_name}{'accessory_overall_order'} = $overall_counter;
288 29         33 $order_counter++;
289 29         38 $overall_counter++;
290             }
291 14         23 $counter++;
292             }
293              
294             # Core + accessory
295 36         1223 my @group_graphs_all = $self->group_graphs->connected_components();
296 36         52234 my $reordered_graphs_all = $self->_reorder_connected_components( \@group_graphs_all );
297 36         873 $self->_save_graph_to_file( $self->group_graphs, $self->pan_graph_filename );
298              
299 36         96 $overall_counter = 1;
300 36         98 $counter = 1;
301 36         81 $counter_filtered = 1;
302 36         77 for my $contig_groups ( @{$reordered_graphs_all} ) {
  36         119  
303 15         27 my $order_counter = 1;
304              
305 15         25 for my $group_name ( @{$contig_groups} ) {
  15         41  
306 78         196 $groups_to_contigs{$group_name}{label} = $counter;
307 78         160 $groups_to_contigs{$group_name}{comment} = '';
308 78         105 $groups_to_contigs{$group_name}{order} = $order_counter;
309 78         97 $groups_to_contigs{$group_name}{'core_accessory_overall_order'} = $overall_counter;
310              
311 78 100       83 if ( @{$contig_groups} <= 2 ) {
  78 50       1709  
312 2         7 $groups_to_contigs{$group_name}{comment} = 'Investigate';
313             }
314             elsif ( $self->_groups_qc->{$group_name} ) {
315 0         0 $groups_to_contigs{$group_name}{comment} = $self->_groups_qc->{$group_name};
316             }
317             else {
318 76         134 $groups_to_contigs{$group_name}{'core_accessory_overall_order_filtered'} = $counter_filtered;
319 76         87 $counter_filtered++;
320             }
321 78         90 $order_counter++;
322 78         99 $overall_counter++;
323             }
324 15         28 $counter++;
325             }
326              
327 36         82 $counter_filtered = 1;
328 36         70 for my $contig_groups ( @{$reordered_graphs} ) {
  36         103  
329 14         20 for my $group_name ( @{$contig_groups} ) {
  14         25  
330 29 50 33     199 if ( ( !defined( $groups_to_contigs{$group_name}{comment} ) )
      33        
331             || ( defined( $groups_to_contigs{$group_name}{comment} ) && $groups_to_contigs{$group_name}{comment} eq '' ) )
332             {
333 29         44 $groups_to_contigs{$group_name}{'accessory_overall_order_filtered'} = $counter_filtered;
334 29         39 $counter_filtered++;
335             }
336             }
337             }
338              
339 36         1421 return \%groups_to_contigs;
340             }
341              
342             sub _create_accessory_graph {
343 36     36   183 my ($self) = @_;
344 36         626 my $graph = Graph->new( undirected => 1 );
345              
346 36         11505 my %core_groups;
347             my %group_freq;
348              
349 36         73 for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
  36         1140  
350 108         2513 my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
351              
352 108         161 for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
  108         221  
353 189         170 for my $current_group ( @{$groups_on_contig} ) {
  189         250  
354 176         254 $group_freq{$current_group}++;
355             }
356             }
357             }
358              
359 36         86 for my $current_group ( keys %{ $self->group_order() } ) {
  36         727  
360 65 100       7296 next if ( $group_freq{$current_group} >= ( $self->number_of_files * $self->core_definition ) );
361            
362 26         59 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  26         481  
363 26 100       509 if ( $group_freq{$group_to} >= ( $self->number_of_files * $self->core_definition ) ) {
364 11         40 $graph->add_vertex($current_group);
365             }
366             else {
367 15         303 my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
368 15         49 $graph->add_weighted_edge( $current_group, $group_to, $weight );
369             }
370             }
371             }
372              
373 36         1584 return $graph;
374             }
375              
376 3     3   25 no Moose;
  3         8  
  3         30  
377             __PACKAGE__->meta->make_immutable;
378              
379             1;
380              
381             __END__
382              
383             =pod
384              
385             =encoding UTF-8
386              
387             =head1 NAME
388              
389             Bio::Roary::OrderGenes - Take in GFF files and create a matrix of what genes are beside what other genes
390              
391             =head1 VERSION
392              
393             version 3.11.0
394              
395             =head1 SYNOPSIS
396              
397             Take in the analyse groups and create a matrix of what genes are beside what other genes
398             use Bio::Roary::OrderGenes;
399              
400             my $obj = Bio::Roary::OrderGenes->new(
401             analyse_groups_obj => $analyse_groups_obj,
402             gff_files => ['file1.gff','file2.gff']
403             );
404             $obj->groups_to_contigs;
405              
406             =head1 AUTHOR
407              
408             Andrew J. Page <ap13@sanger.ac.uk>
409              
410             =head1 COPYRIGHT AND LICENSE
411              
412             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
413              
414             This is free software, licensed under:
415              
416             The GNU General Public License, Version 3, June 2007
417              
418             =cut