File Coverage

lib/Bio/Roary/AnnotateGroups.pm
Criterion Covered Total %
statement 202 220 91.8
branch 31 44 70.4
condition 5 9 55.5
subroutine 27 30 90.0
pod 0 5 0.0
total 265 308 86.0


line stmt bran cond sub pod time code
1             package Bio::Roary::AnnotateGroups;
2             $Bio::Roary::AnnotateGroups::VERSION = '3.10.1';
3             # ABSTRACT: Take in a group file and associated GFF files for the isolates and update the group name to the gene name
4              
5              
6 17     17   811798 use Moose;
  17         1177957  
  17         130  
7 17     17   118096 use Bio::Roary::Exceptions;
  17         65  
  17         506  
8 17     17   5543 use Bio::Roary::GeneNamesFromGFF;
  17         59  
  17         874  
9 17     17   7044 use Array::Utils qw(array_minus);
  17         6435  
  17         1350  
10 17     17   118 use List::Util qw(max min sum);
  17         34  
  17         1071  
11 17     17   4374 use File::Grep qw(fgrep);
  17         18407  
  17         35789  
12              
13             has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
14             has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'reannotated_groups_file' );
15             has 'groups_filename' => ( is => 'ro', isa => 'Str', required => 1 );
16             has '_ids_to_gene_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__ids_to_gene_names' );
17             has '_ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
18             has '_ids_to_gene_size' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
19             has 'group_nucleotide_lengths' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_nucleotide_lengths');
20              
21             has '_groups_to_id_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__groups_to_id_names' );
22             has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );
23             has '_groups_to_consensus_gene_names' =>
24             ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_consensus_gene_names' );
25             has '_filtered_gff_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__filtered_gff_files' );
26             has '_number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__number_of_files' );
27             has '_ids_to_groups' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_builder__ids_to_groups' );
28             has '_group_counter' => ( is => 'rw', isa => 'Int', lazy => 1, builder => '_builder__group_counter' );
29             has '_group_default_prefix' => ( is => 'rw', isa => 'Str', default => 'group_' );
30             has '_ids_to_verbose_stats' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );
31              
32             sub BUILD {
33 31     31 0 101 my ($self) = @_;
34 31         1316 $self->_ids_to_gene_names;
35             }
36              
37             sub _builder__group_counter {
38 1     1   2 my ($self) = @_;
39 1         20 my $prefix = $self->_group_default_prefix;
40 1         2 my $highest_group = 0;
41 1         1 for my $group ( @{ $self->_groups } ) {
  1         4  
42 6 50       35 if ( $group =~ /$prefix([\d]+)$/ ) {
43 6         11 my $group_id = $1;
44 6 100       10 if ( $group_id > $highest_group ) {
45 3         4 $highest_group = $group_id;
46             }
47             }
48             }
49 1         22 return $highest_group + 1;
50             }
51              
52             sub _generate__ids_to_groups {
53 25     25   86 my ($self) = @_;
54 25         43 my %ids_to_groups;
55              
56 25         56 for my $group ( keys %{ $self->_groups_to_id_names } ) {
  25         753  
57 70         99 for my $id_name ( @{ $self->_groups_to_id_names->{$group} } ) {
  70         1920  
58 122         383 $ids_to_groups{$id_name} = $group;
59             }
60             }
61 25         912 return \%ids_to_groups;
62             }
63              
64             sub _builder__ids_to_groups {
65 0     0   0 my ($self) = @_;
66 0         0 return $self->_generate__ids_to_groups;
67             }
68              
69             sub _build__output_fh {
70 25     25   128 my ($self) = @_;
71 25 50       1052 open( my $fh, '>', $self->output_filename )
72             or Bio::Roary::Exceptions::CouldntWriteToFile->throw(
73             error => "Couldnt write output file:" . $self->output_filename );
74 25         894 return $fh;
75             }
76              
77             sub _build__filtered_gff_files {
78 31     31   125 my ($self) = @_;
79 31         70 my @gff_files = grep( /\.gff$/, @{ $self->gff_files } );
  31         1000  
80 31         1039 return \@gff_files;
81             }
82              
83             sub _build__ids_to_gene_names {
84 31     31   100 my ($self) = @_;
85 31         125 my %ids_to_gene_names;
86             my %ids_to_product;
87 31         0 my %ids_to_gene_size;
88 31         61 for my $filename ( @{ $self->_filtered_gff_files } ) {
  31         1205  
89 39         1110 my $gene_names_from_gff = Bio::Roary::GeneNamesFromGFF->new( gff_file => $filename );
90 39         88 my %id_to_gene_lookup = %{ $gene_names_from_gff->ids_to_gene_name };
  39         864  
91 39         250 @ids_to_gene_names{ keys %id_to_gene_lookup } = values %id_to_gene_lookup;
92              
93 39         84 my %id_to_product_lookup = %{ $gene_names_from_gff->ids_to_product };
  39         980  
94 39         371 @ids_to_product{ keys %id_to_product_lookup } = values %id_to_product_lookup;
95            
96 39         89 my %ids_to_gene_size_lookup = %{ $gene_names_from_gff->ids_to_gene_size };
  39         892  
97 39         1279 @ids_to_gene_size{ keys %ids_to_gene_size_lookup } = values %ids_to_gene_size_lookup;
98             }
99 31         1432 $self->_ids_to_product( \%ids_to_product );
100 31         1145 $self->_ids_to_gene_size( \%ids_to_gene_size );
101              
102 31         1057 return \%ids_to_gene_names;
103             }
104              
105             sub _build__ids_to_verbose_stats {
106 1     1   4 my $self = shift;
107              
108 1     813   12 my @matches_hash = fgrep { /ID=/i } @{ $self->_filtered_gff_files };
  813         9819  
  1         86  
109 1         30 my @matches;
110 1         3 foreach my $m ( @matches_hash ){
111 3         5 push( @matches, values %{$m->{matches}} );
  3         18  
112             }
113             # chomp @matches;
114            
115 1         2 my %verbose;
116 1         3 foreach my $line ( @matches ){
117 48         49 my ( $id, $inf, $prod );
118 48 50       137 if( $line =~ m/ID=["']?([^;"']+)["']?;?/i ){
119 48         70 $id = $1;
120             }
121             else {
122 0         0 next;
123             }
124              
125 48 50       163 $inf = $1 if ( $line =~ m/inference=([^;]+);/ );
126 48 50       139 $prod = $1 if ( $line =~ m/product=([^;]+)[;\n]/ );
127              
128 48         112 my %info = ( 'inference' => $inf, 'product' => $prod );
129 48         100 $verbose{$id} = \%info;
130             }
131 1         43 return \%verbose;
132             }
133              
134              
135             sub consensus_product_for_id_names {
136 63     63 0 183 my ( $self, $id_names ) = @_;
137 63         126 my %product_freq;
138 63         272 for my $id_name ( @{$id_names} ) {
  63         329  
139 109 100       3521 next unless ( defined( $self->_ids_to_product->{$id_name} ) );
140 31         1264 $product_freq{ $self->_ids_to_product->{$id_name} }++;
141             }
142              
143 63         241 my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq);
  0         0  
144              
145 63 100       204 if ( @sorted_product_keys > 0 ) {
146 18         75 return $sorted_product_keys[0];
147             }
148             else {
149 45         247 return '';
150             }
151             }
152              
153             sub _builder__groups_to_id_names {
154 29     29   94 my ($self) = @_;
155 29         62 my %groups_to_id_names;
156              
157 29 50       1278 open( my $fh, $self->groups_filename )
158             or Bio::Roary::Exceptions::FileNotFound->throw( error => "Group file not found:" . $self->groups_filename );
159 29         769 while (<$fh>) {
160 98         218 chomp;
161 98         198 my $line = $_;
162 98 50       625 if ( $line =~ /^(.+): (.+)$/ ) {
163 98         427 my $group_name = $1;
164 98         207 my $genes = $2;
165 98         532 my @elements = split( /[\s\t]+/, $genes );
166 98         871 $groups_to_id_names{$group_name} = \@elements;
167             }
168             }
169            
170 29         1395 return \%groups_to_id_names;
171             }
172              
173             sub _groups {
174 47     47   133 my ($self) = @_;
175 47         112 my @groups = keys %{ $self->_groups_to_id_names };
  47         1697  
176 47         310 return \@groups;
177             }
178              
179             sub _ids_grouped_by_gene_name_for_group {
180 142     142   472 my ( $self, $group_name ) = @_;
181 142         235 my %gene_name_freq;
182 142         223 for my $id_name ( @{ $self->_groups_to_id_names->{$group_name} } ) {
  142         4098  
183 243 100 66     6960 if ( defined( $self->_ids_to_gene_names->{$id_name} ) && $self->_ids_to_gene_names->{$id_name} ne "" ) {
184 57         71 push( @{ $gene_name_freq{ $self->_ids_to_gene_names->{$id_name} } }, $id_name );
  57         1157  
185             }
186             }
187 142         425 return \%gene_name_freq;
188             }
189              
190             sub _consensus_gene_name_for_group {
191 142     142   392 my ( $self, $group_name ) = @_;
192 142         445 my $gene_name_freq = $self->_ids_grouped_by_gene_name_for_group($group_name);
193              
194 142         234 my @sorted_gene_names = sort { @{ $gene_name_freq->{$b} } <=> @{ $gene_name_freq->{$a} } } keys %{$gene_name_freq};
  6         12  
  6         16  
  6         27  
  142         495  
195 142 100       510 if ( @sorted_gene_names > 0 ) {
196 28         87 return shift(@sorted_gene_names);
197             }
198             else {
199 114         480 return $group_name;
200             }
201             }
202              
203             sub _build_group_nucleotide_lengths
204             {
205 21     21   63 my ($self) = @_;
206 21         48 my %group_nucleotide_lengths;
207 21         50 for my $group_name (keys %{ $self->_groups_to_id_names } )
  21         867  
208             {
209 63         121 my @gene_lengths;
210 63         100 for my $gene_id (@{$self->_groups_to_id_names->{$group_name}})
  63         1944  
211             {
212 109         3349 my $current_gene_size = $self->_ids_to_gene_size->{$gene_id};
213 109 100       309 next unless(defined($current_gene_size) );
214 31 50       68 next if($current_gene_size < 1);
215 31         124 push(@gene_lengths, $current_gene_size);
216             }
217            
218 63 100       207 next if(@gene_lengths == 0);
219 18   50     92 my $average_gene_size = (int((sum @gene_lengths)/@gene_lengths)) || 0;
220 18   50     63 my $min_gene_size = (min @gene_lengths) || 0;
221 18   50     54 my $max_gene_size = (max @gene_lengths) || 0;
222 18         114 $group_nucleotide_lengths{$group_name} = {'min' => $min_gene_size, 'max' =>$max_gene_size , 'average' => $average_gene_size};
223             }
224 21         777 return \%group_nucleotide_lengths;
225             }
226              
227             sub _generate_groups_to_consensus_gene_names {
228 28     28   65 my ($self) = @_;
229 28         71 my %groups_to_gene_names;
230             my %gene_name_freq;
231 28         1588 my $group_prefix = $self->_group_default_prefix;
232              
233             # These are already annotated
234 28         82 for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } }
  116         339  
  116         2856  
  116         3088  
235 28         1314 keys %{ $self->_groups_to_id_names } )
236             {
237 91 50       605 next if ( $group_name =~ /$group_prefix/ );
238 0         0 $groups_to_gene_names{$group_name} = $group_name;
239             }
240              
241 28         178 for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } }
  116         198  
  116         2825  
  116         2717  
242 28         848 keys %{ $self->_groups_to_id_names } )
243             {
244 91 50       495 next unless ( $group_name =~ /$group_prefix/ );
245 91         302 my $consensus_gene_name = $self->_consensus_gene_name_for_group($group_name);
246              
247 91 50       221 if ( defined( $gene_name_freq{$consensus_gene_name} ) ) {
248 0         0 $groups_to_gene_names{$group_name} = $group_name;
249             }
250             else {
251 91         203 $groups_to_gene_names{$group_name} = $consensus_gene_name;
252 91         284 $gene_name_freq{$consensus_gene_name}++;
253             }
254             }
255 28         1111 return \%groups_to_gene_names;
256             }
257              
258             sub _build__groups_to_consensus_gene_names {
259 3     3   8 my ($self) = @_;
260 3         13 return $self->_generate_groups_to_consensus_gene_names;
261             }
262              
263             sub _build__number_of_files {
264 0     0   0 my ($self) = @_;
265 0         0 return @{ $self->gff_files };
  0         0  
266             }
267              
268              
269             sub _split_groups {
270 25     25   64 my ($self) = @_;
271            
272 25         113 $self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names );
273 25         146 $self->_ids_to_groups( $self->_generate__ids_to_groups );
274             }
275              
276             sub _remove_ids_from_group {
277 0     0   0 my ( $self, $ids_to_remove, $group ) = @_;
278              
279 0         0 my @remaining_ids = array_minus( @{ $self->_groups_to_id_names->{$group} }, @{ $ids_to_remove } );
  0         0  
  0         0  
280 0         0 $self->_groups_to_id_names->{$group} = \@remaining_ids;
281 0 0       0 if ( @{ $self->_groups_to_id_names->{$group} } == 0 ) {
  0         0  
282 0         0 delete( $self->_groups_to_id_names->{$group} );
283             }
284             }
285              
286             sub reannotate {
287 25     25 0 91 my ($self) = @_;
288              
289 25         136 $self->_split_groups;
290              
291 25         68 my %groups_to_id_names = %{ $self->_groups_to_id_names };
  25         823  
292 25         162 for
293 75         141 my $group_name ( sort { @{ $groups_to_id_names{$b} } <=> @{ $groups_to_id_names{$a} } } keys %groups_to_id_names )
  75         170  
  75         181  
294             {
295 70         2092 my $consensus_gene_name = $self->_groups_to_consensus_gene_names->{$group_name};
296 70         1822 print { $self->_output_fh } $consensus_gene_name . ": "
297 70         114 . join( "\t", @{ $self->_groups_to_id_names->{$group_name} } ) . "\n";
  70         1887  
298             }
299 25         736 close( $self->_output_fh );
300 25         200 return $self;
301             }
302              
303             sub full_annotation {
304 7     7 0 17 my ( $self, $group ) = @_;
305              
306 7         10 my @id_names = @{ $self->_groups_to_id_names->{$group} };
  7         200  
307              
308 7         13 my %product_freq;
309 7         14 for my $id_name ( @id_names ) {
310 12 100       301 next unless ( defined( $self->_ids_to_verbose_stats->{$id_name}->{'product'} ) );
311 10         245 $product_freq{ $self->_ids_to_verbose_stats->{$id_name}->{'product'} }++;
312             }
313              
314 7         25 my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq);
  0         0  
315              
316 7 100       16 if ( @sorted_product_keys > 0 ) {
317 6         39 return join('; ', @sorted_product_keys);
318             }
319             else {
320 1         4 return '';
321             }
322            
323             }
324              
325             sub inference {
326 7     7 0 15 my ( $self, $group ) = @_;
327              
328 7         9 my @infs;
329 7         8 foreach my $g ( @{ $self->_groups_to_id_names->{$group} } ){
  7         176  
330 12 100       280 next unless ( defined $self->_ids_to_verbose_stats->{$g}->{'inference'} );
331 10         244 push( @infs, $self->_ids_to_verbose_stats->{$g}->{'inference'} );
332             }
333              
334             # maybe make a consensus in the future?
335              
336 7         25 return $infs[0];
337             }
338              
339 17     17   193 no Moose;
  17         45  
  17         201  
340             __PACKAGE__->meta->make_immutable;
341              
342             1;
343              
344             __END__
345              
346             =pod
347              
348             =encoding UTF-8
349              
350             =head1 NAME
351              
352             Bio::Roary::AnnotateGroups - Take in a group file and associated GFF files for the isolates and update the group name to the gene name
353              
354             =head1 VERSION
355              
356             version 3.10.1
357              
358             =head1 SYNOPSIS
359              
360             Take in a group file and associated GFF files for the isolates and update the group name to the gene name
361             use Bio::Roary::AnnotateGroups;
362              
363             my $obj = Bio::Roary::AnnotateGroups->new(
364             gff_files => ['abc.gff','efg.gff'],
365             output_filename => 'example_output.fa',
366             groups_filename => 'groupsfile',
367             );
368             $obj->reannotate;
369              
370             =head1 AUTHOR
371              
372             Andrew J. Page <ap13@sanger.ac.uk>
373              
374             =head1 COPYRIGHT AND LICENSE
375              
376             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
377              
378             This is free software, licensed under:
379              
380             The GNU General Public License, Version 3, June 2007
381              
382             =cut