File Coverage

lib/Bio/Roary/AnnotateGroups.pm
Criterion Covered Total %
statement 202 220 91.8
branch 31 44 70.4
condition 5 9 55.5
subroutine 27 30 90.0
pod 0 5 0.0
total 265 308 86.0


line stmt bran cond sub pod time code
1             package Bio::Roary::AnnotateGroups;
2             $Bio::Roary::AnnotateGroups::VERSION = '3.11.0';
3             # ABSTRACT: Take in a group file and associated GFF files for the isolates and update the group name to the gene name
4              
5              
6 17     17   821402 use Moose;
  17         1207380  
  17         117  
7 17     17   115183 use Bio::Roary::Exceptions;
  17         37  
  17         422  
8 17     17   4514 use Bio::Roary::GeneNamesFromGFF;
  17         62  
  17         956  
9 17     17   7127 use Array::Utils qw(array_minus);
  17         6249  
  17         1339  
10 17     17   119 use List::Util qw(max min sum);
  17         35  
  17         1158  
11 17     17   4622 use File::Grep qw(fgrep);
  17         18957  
  17         36436  
12              
13             has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
14             has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'reannotated_groups_file' );
15             has 'groups_filename' => ( is => 'ro', isa => 'Str', required => 1 );
16             has '_ids_to_gene_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__ids_to_gene_names' );
17             has '_ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
18             has '_ids_to_gene_size' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
19             has 'group_nucleotide_lengths' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_nucleotide_lengths');
20              
21             has '_groups_to_id_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__groups_to_id_names' );
22             has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );
23             has '_groups_to_consensus_gene_names' =>
24             ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_consensus_gene_names' );
25             has '_filtered_gff_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__filtered_gff_files' );
26             has '_number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__number_of_files' );
27             has '_ids_to_groups' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_builder__ids_to_groups' );
28             has '_group_counter' => ( is => 'rw', isa => 'Int', lazy => 1, builder => '_builder__group_counter' );
29             has '_group_default_prefix' => ( is => 'rw', isa => 'Str', default => 'group_' );
30             has '_ids_to_verbose_stats' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );
31              
32             sub BUILD {
33 31     31 0 131 my ($self) = @_;
34 31         1124 $self->_ids_to_gene_names;
35             }
36              
37             sub _builder__group_counter {
38 1     1   2 my ($self) = @_;
39 1         19 my $prefix = $self->_group_default_prefix;
40 1         2 my $highest_group = 0;
41 1         2 for my $group ( @{ $self->_groups } ) {
  1         3  
42 6 50       39 if ( $group =~ /$prefix([\d]+)$/ ) {
43 6         12 my $group_id = $1;
44 6 100       10 if ( $group_id > $highest_group ) {
45 3         7 $highest_group = $group_id;
46             }
47             }
48             }
49 1         21 return $highest_group + 1;
50             }
51              
52             sub _generate__ids_to_groups {
53 25     25   84 my ($self) = @_;
54 25         50 my %ids_to_groups;
55              
56 25         41 for my $group ( keys %{ $self->_groups_to_id_names } ) {
  25         657  
57 70         98 for my $id_name ( @{ $self->_groups_to_id_names->{$group} } ) {
  70         1660  
58 122         387 $ids_to_groups{$id_name} = $group;
59             }
60             }
61 25         798 return \%ids_to_groups;
62             }
63              
64             sub _builder__ids_to_groups {
65 0     0   0 my ($self) = @_;
66 0         0 return $self->_generate__ids_to_groups;
67             }
68              
69             sub _build__output_fh {
70 25     25   67 my ($self) = @_;
71 25 50       842 open( my $fh, '>', $self->output_filename )
72             or Bio::Roary::Exceptions::CouldntWriteToFile->throw(
73             error => "Couldnt write output file:" . $self->output_filename );
74 25         747 return $fh;
75             }
76              
77             sub _build__filtered_gff_files {
78 31     31   78 my ($self) = @_;
79 31         58 my @gff_files = grep( /\.gff$/, @{ $self->gff_files } );
  31         849  
80 31         822 return \@gff_files;
81             }
82              
83             sub _build__ids_to_gene_names {
84 31     31   102 my ($self) = @_;
85 31         139 my %ids_to_gene_names;
86             my %ids_to_product;
87 31         0 my %ids_to_gene_size;
88 31         62 for my $filename ( @{ $self->_filtered_gff_files } ) {
  31         1059  
89 39         1058 my $gene_names_from_gff = Bio::Roary::GeneNamesFromGFF->new( gff_file => $filename );
90 39         64 my %id_to_gene_lookup = %{ $gene_names_from_gff->ids_to_gene_name };
  39         840  
91 39         241 @ids_to_gene_names{ keys %id_to_gene_lookup } = values %id_to_gene_lookup;
92              
93 39         129 my %id_to_product_lookup = %{ $gene_names_from_gff->ids_to_product };
  39         931  
94 39         332 @ids_to_product{ keys %id_to_product_lookup } = values %id_to_product_lookup;
95            
96 39         85 my %ids_to_gene_size_lookup = %{ $gene_names_from_gff->ids_to_gene_size };
  39         863  
97 39         1246 @ids_to_gene_size{ keys %ids_to_gene_size_lookup } = values %ids_to_gene_size_lookup;
98             }
99 31         922 $self->_ids_to_product( \%ids_to_product );
100 31         907 $self->_ids_to_gene_size( \%ids_to_gene_size );
101              
102 31         845 return \%ids_to_gene_names;
103             }
104              
105             sub _build__ids_to_verbose_stats {
106 1     1   4 my $self = shift;
107              
108 1     813   10 my @matches_hash = fgrep { /ID=/i } @{ $self->_filtered_gff_files };
  813         8283  
  1         33  
109 1         43 my @matches;
110 1         4 foreach my $m ( @matches_hash ){
111 3         4 push( @matches, values %{$m->{matches}} );
  3         36  
112             }
113             # chomp @matches;
114            
115 1         3 my %verbose;
116 1         3 foreach my $line ( @matches ){
117 48         52 my ( $id, $inf, $prod );
118 48 50       112 if( $line =~ m/ID=["']?([^;"']+)["']?;?/i ){
119 48         72 $id = $1;
120             }
121             else {
122 0         0 next;
123             }
124              
125 48 50       166 $inf = $1 if ( $line =~ m/inference=([^;]+);/ );
126 48 50       163 $prod = $1 if ( $line =~ m/product=([^;]+)[;\n]/ );
127              
128 48         111 my %info = ( 'inference' => $inf, 'product' => $prod );
129 48         112 $verbose{$id} = \%info;
130             }
131 1         66 return \%verbose;
132             }
133              
134              
135             sub consensus_product_for_id_names {
136 63     63 0 128 my ( $self, $id_names ) = @_;
137 63         88 my %product_freq;
138 63         118 for my $id_name ( @{$id_names} ) {
  63         168  
139 109 100       2697 next unless ( defined( $self->_ids_to_product->{$id_name} ) );
140 31         652 $product_freq{ $self->_ids_to_product->{$id_name} }++;
141             }
142              
143 63         190 my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq);
  0         0  
144              
145 63 100       201 if ( @sorted_product_keys > 0 ) {
146 18         59 return $sorted_product_keys[0];
147             }
148             else {
149 45         202 return '';
150             }
151             }
152              
153             sub _builder__groups_to_id_names {
154 29     29   84 my ($self) = @_;
155 29         54 my %groups_to_id_names;
156              
157 29 50       1153 open( my $fh, $self->groups_filename )
158             or Bio::Roary::Exceptions::FileNotFound->throw( error => "Group file not found:" . $self->groups_filename );
159 29         442 while (<$fh>) {
160 98         168 chomp;
161 98         172 my $line = $_;
162 98 50       534 if ( $line =~ /^(.+): (.+)$/ ) {
163 98         281 my $group_name = $1;
164 98         185 my $genes = $2;
165 98         423 my @elements = split( /[\s\t]+/, $genes );
166 98         568 $groups_to_id_names{$group_name} = \@elements;
167             }
168             }
169            
170 29         1121 return \%groups_to_id_names;
171             }
172              
173             sub _groups {
174 47     47   108 my ($self) = @_;
175 47         74 my @groups = keys %{ $self->_groups_to_id_names };
  47         1302  
176 47         420 return \@groups;
177             }
178              
179             sub _ids_grouped_by_gene_name_for_group {
180 142     142   248 my ( $self, $group_name ) = @_;
181 142         245 my %gene_name_freq;
182 142         200 for my $id_name ( @{ $self->_groups_to_id_names->{$group_name} } ) {
  142         3359  
183 243 100 66     5670 if ( defined( $self->_ids_to_gene_names->{$id_name} ) && $self->_ids_to_gene_names->{$id_name} ne "" ) {
184 57         63 push( @{ $gene_name_freq{ $self->_ids_to_gene_names->{$id_name} } }, $id_name );
  57         1029  
185             }
186             }
187 142         353 return \%gene_name_freq;
188             }
189              
190             sub _consensus_gene_name_for_group {
191 142     142   279 my ( $self, $group_name ) = @_;
192 142         347 my $gene_name_freq = $self->_ids_grouped_by_gene_name_for_group($group_name);
193              
194 142         232 my @sorted_gene_names = sort { @{ $gene_name_freq->{$b} } <=> @{ $gene_name_freq->{$a} } } keys %{$gene_name_freq};
  6         11  
  6         11  
  6         27  
  142         420  
195 142 100       364 if ( @sorted_gene_names > 0 ) {
196 28         85 return shift(@sorted_gene_names);
197             }
198             else {
199 114         464 return $group_name;
200             }
201             }
202              
203             sub _build_group_nucleotide_lengths
204             {
205 21     21   50 my ($self) = @_;
206 21         45 my %group_nucleotide_lengths;
207 21         38 for my $group_name (keys %{ $self->_groups_to_id_names } )
  21         520  
208             {
209 63         98 my @gene_lengths;
210 63         73 for my $gene_id (@{$self->_groups_to_id_names->{$group_name}})
  63         1471  
211             {
212 109         2354 my $current_gene_size = $self->_ids_to_gene_size->{$gene_id};
213 109 100       257 next unless(defined($current_gene_size) );
214 31 50       44 next if($current_gene_size < 1);
215 31         46 push(@gene_lengths, $current_gene_size);
216             }
217            
218 63 100       161 next if(@gene_lengths == 0);
219 18   50     75 my $average_gene_size = (int((sum @gene_lengths)/@gene_lengths)) || 0;
220 18   50     48 my $min_gene_size = (min @gene_lengths) || 0;
221 18   50     38 my $max_gene_size = (max @gene_lengths) || 0;
222 18         65 $group_nucleotide_lengths{$group_name} = {'min' => $min_gene_size, 'max' =>$max_gene_size , 'average' => $average_gene_size};
223             }
224 21         499 return \%group_nucleotide_lengths;
225             }
226              
227             sub _generate_groups_to_consensus_gene_names {
228 28     28   73 my ($self) = @_;
229 28         77 my %groups_to_gene_names;
230             my %gene_name_freq;
231 28         943 my $group_prefix = $self->_group_default_prefix;
232              
233             # These are already annotated
234 28         55 for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } }
  118         177  
  118         2548  
  118         2398  
235 28         821 keys %{ $self->_groups_to_id_names } )
236             {
237 91 50       464 next if ( $group_name =~ /$group_prefix/ );
238 0         0 $groups_to_gene_names{$group_name} = $group_name;
239             }
240              
241 28         68 for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } }
  118         164  
  118         2392  
  118         2324  
242 28         703 keys %{ $self->_groups_to_id_names } )
243             {
244 91 50       382 next unless ( $group_name =~ /$group_prefix/ );
245 91         228 my $consensus_gene_name = $self->_consensus_gene_name_for_group($group_name);
246              
247 91 50       211 if ( defined( $gene_name_freq{$consensus_gene_name} ) ) {
248 0         0 $groups_to_gene_names{$group_name} = $group_name;
249             }
250             else {
251 91         196 $groups_to_gene_names{$group_name} = $consensus_gene_name;
252 91         221 $gene_name_freq{$consensus_gene_name}++;
253             }
254             }
255 28         902 return \%groups_to_gene_names;
256             }
257              
258             sub _build__groups_to_consensus_gene_names {
259 3     3   7 my ($self) = @_;
260 3         14 return $self->_generate_groups_to_consensus_gene_names;
261             }
262              
263             sub _build__number_of_files {
264 0     0   0 my ($self) = @_;
265 0         0 return @{ $self->gff_files };
  0         0  
266             }
267              
268              
269             sub _split_groups {
270 25     25   61 my ($self) = @_;
271            
272 25         141 $self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names );
273 25         139 $self->_ids_to_groups( $self->_generate__ids_to_groups );
274             }
275              
276             sub _remove_ids_from_group {
277 0     0   0 my ( $self, $ids_to_remove, $group ) = @_;
278              
279 0         0 my @remaining_ids = array_minus( @{ $self->_groups_to_id_names->{$group} }, @{ $ids_to_remove } );
  0         0  
  0         0  
280 0         0 $self->_groups_to_id_names->{$group} = \@remaining_ids;
281 0 0       0 if ( @{ $self->_groups_to_id_names->{$group} } == 0 ) {
  0         0  
282 0         0 delete( $self->_groups_to_id_names->{$group} );
283             }
284             }
285              
286             sub reannotate {
287 25     25 0 73 my ($self) = @_;
288              
289 25         111 $self->_split_groups;
290              
291 25         68 my %groups_to_id_names = %{ $self->_groups_to_id_names };
  25         623  
292 25         163 for
293 80         116 my $group_name ( sort { @{ $groups_to_id_names{$b} } <=> @{ $groups_to_id_names{$a} } } keys %groups_to_id_names )
  80         147  
  80         171  
294             {
295 70         1871 my $consensus_gene_name = $self->_groups_to_consensus_gene_names->{$group_name};
296 70         1555 print { $self->_output_fh } $consensus_gene_name . ": "
297 70         118 . join( "\t", @{ $self->_groups_to_id_names->{$group_name} } ) . "\n";
  70         1733  
298             }
299 25         585 close( $self->_output_fh );
300 25         190 return $self;
301             }
302              
303             sub full_annotation {
304 7     7 0 18 my ( $self, $group ) = @_;
305              
306 7         8 my @id_names = @{ $self->_groups_to_id_names->{$group} };
  7         150  
307              
308 7         11 my %product_freq;
309 7         10 for my $id_name ( @id_names ) {
310 12 100       228 next unless ( defined( $self->_ids_to_verbose_stats->{$id_name}->{'product'} ) );
311 10         180 $product_freq{ $self->_ids_to_verbose_stats->{$id_name}->{'product'} }++;
312             }
313              
314 7         25 my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq);
  0         0  
315              
316 7 100       15 if ( @sorted_product_keys > 0 ) {
317 6         34 return join('; ', @sorted_product_keys);
318             }
319             else {
320 1         4 return '';
321             }
322            
323             }
324              
325             sub inference {
326 7     7 0 12 my ( $self, $group ) = @_;
327              
328 7         9 my @infs;
329 7         8 foreach my $g ( @{ $self->_groups_to_id_names->{$group} } ){
  7         143  
330 12 100       259 next unless ( defined $self->_ids_to_verbose_stats->{$g}->{'inference'} );
331 10         230 push( @infs, $self->_ids_to_verbose_stats->{$g}->{'inference'} );
332             }
333              
334             # maybe make a consensus in the future?
335              
336 7         29 return $infs[0];
337             }
338              
339 17     17   157 no Moose;
  17         39  
  17         150  
340             __PACKAGE__->meta->make_immutable;
341              
342             1;
343              
344             __END__
345              
346             =pod
347              
348             =encoding UTF-8
349              
350             =head1 NAME
351              
352             Bio::Roary::AnnotateGroups - Take in a group file and associated GFF files for the isolates and update the group name to the gene name
353              
354             =head1 VERSION
355              
356             version 3.11.0
357              
358             =head1 SYNOPSIS
359              
360             Take in a group file and associated GFF files for the isolates and update the group name to the gene name
361             use Bio::Roary::AnnotateGroups;
362              
363             my $obj = Bio::Roary::AnnotateGroups->new(
364             gff_files => ['abc.gff','efg.gff'],
365             output_filename => 'example_output.fa',
366             groups_filename => 'groupsfile',
367             );
368             $obj->reannotate;
369              
370             =head1 AUTHOR
371              
372             Andrew J. Page <ap13@sanger.ac.uk>
373              
374             =head1 COPYRIGHT AND LICENSE
375              
376             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
377              
378             This is free software, licensed under:
379              
380             The GNU General Public License, Version 3, June 2007
381              
382             =cut