File Coverage

lib/Bio/Roary/AnnotateGroups.pm

Criterion	Covered	Total	%
statement	202	220	91.8
branch	31	44	70.4
condition	5	9	55.5
subroutine	27	30	90.0
pod	0	5	0.0
total	265	308	86.0

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Roary::AnnotateGroups;
2							$Bio::Roary::AnnotateGroups::VERSION = '3.10.1';
3							# ABSTRACT: Take in a group file and associated GFF files for the isolates and update the group name to the gene name
4
5
6	17			17		811798	use Moose;
	17					1177957
	17					130
7	17			17		118096	use Bio::Roary::Exceptions;
	17					65
	17					506
8	17			17		5543	use Bio::Roary::GeneNamesFromGFF;
	17					59
	17					874
9	17			17		7044	use Array::Utils qw(array_minus);
	17					6435
	17					1350
10	17			17		118	use List::Util qw(max min sum);
	17					34
	17					1071
11	17			17		4374	use File::Grep qw(fgrep);
	17					18407
	17					35789
12
13							has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
14							has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'reannotated_groups_file' );
15							has 'groups_filename' => ( is => 'ro', isa => 'Str', required => 1 );
16							has '_ids_to_gene_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__ids_to_gene_names' );
17							has '_ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
18							has '_ids_to_gene_size' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
19							has 'group_nucleotide_lengths' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_nucleotide_lengths');
20
21							has '_groups_to_id_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__groups_to_id_names' );
22							has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );
23							has '_groups_to_consensus_gene_names' =>
24							( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_consensus_gene_names' );
25							has '_filtered_gff_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__filtered_gff_files' );
26							has '_number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__number_of_files' );
27							has '_ids_to_groups' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_builder__ids_to_groups' );
28							has '_group_counter' => ( is => 'rw', isa => 'Int', lazy => 1, builder => '_builder__group_counter' );
29							has '_group_default_prefix' => ( is => 'rw', isa => 'Str', default => 'group_' );
30							has '_ids_to_verbose_stats' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );
31
32							sub BUILD {
33	31			31	0	101	my ($self) = @_;
34	31					1316	$self->_ids_to_gene_names;
35							}
36
37							sub _builder__group_counter {
38	1			1		2	my ($self) = @_;
39	1					20	my $prefix = $self->_group_default_prefix;
40	1					2	my $highest_group = 0;
41	1					1	for my $group ( @{ $self->_groups } ) {
	1					4
42	6	50				35	if ( $group =~ /$prefix([\d]+)$/ ) {
43	6					11	my $group_id = $1;
44	6	100				10	if ( $group_id > $highest_group ) {
45	3					4	$highest_group = $group_id;
46							}
47							}
48							}
49	1					22	return $highest_group + 1;
50							}
51
52							sub _generate__ids_to_groups {
53	25			25		86	my ($self) = @_;
54	25					43	my %ids_to_groups;
55
56	25					56	for my $group ( keys %{ $self->_groups_to_id_names } ) {
	25					753
57	70					99	for my $id_name ( @{ $self->_groups_to_id_names->{$group} } ) {
	70					1920
58	122					383	$ids_to_groups{$id_name} = $group;
59							}
60							}
61	25					912	return \%ids_to_groups;
62							}
63
64							sub _builder__ids_to_groups {
65	0			0		0	my ($self) = @_;
66	0					0	return $self->_generate__ids_to_groups;
67							}
68
69							sub _build__output_fh {
70	25			25		128	my ($self) = @_;
71	25	50				1052	open( my $fh, '>', $self->output_filename )
72							or Bio::Roary::Exceptions::CouldntWriteToFile->throw(
73							error => "Couldnt write output file:" . $self->output_filename );
74	25					894	return $fh;
75							}
76
77							sub _build__filtered_gff_files {
78	31			31		125	my ($self) = @_;
79	31					70	my @gff_files = grep( /\.gff$/, @{ $self->gff_files } );
	31					1000
80	31					1039	return \@gff_files;
81							}
82
83							sub _build__ids_to_gene_names {
84	31			31		100	my ($self) = @_;
85	31					125	my %ids_to_gene_names;
86							my %ids_to_product;
87	31					0	my %ids_to_gene_size;
88	31					61	for my $filename ( @{ $self->_filtered_gff_files } ) {
	31					1205
89	39					1110	my $gene_names_from_gff = Bio::Roary::GeneNamesFromGFF->new( gff_file => $filename );
90	39					88	my %id_to_gene_lookup = %{ $gene_names_from_gff->ids_to_gene_name };
	39					864
91	39					250	@ids_to_gene_names{ keys %id_to_gene_lookup } = values %id_to_gene_lookup;
92
93	39					84	my %id_to_product_lookup = %{ $gene_names_from_gff->ids_to_product };
	39					980
94	39					371	@ids_to_product{ keys %id_to_product_lookup } = values %id_to_product_lookup;
95
96	39					89	my %ids_to_gene_size_lookup = %{ $gene_names_from_gff->ids_to_gene_size };
	39					892
97	39					1279	@ids_to_gene_size{ keys %ids_to_gene_size_lookup } = values %ids_to_gene_size_lookup;
98							}
99	31					1432	$self->_ids_to_product( \%ids_to_product );
100	31					1145	$self->_ids_to_gene_size( \%ids_to_gene_size );
101
102	31					1057	return \%ids_to_gene_names;
103							}
104
105							sub _build__ids_to_verbose_stats {
106	1			1		4	my $self = shift;
107
108	1			813		12	my @matches_hash = fgrep { /ID=/i } @{ $self->_filtered_gff_files };
	813					9819
	1					86
109	1					30	my @matches;
110	1					3	foreach my $m ( @matches_hash ){
111	3					5	push( @matches, values %{$m->{matches}} );
	3					18
112							}
113							# chomp @matches;
114
115	1					2	my %verbose;
116	1					3	foreach my $line ( @matches ){
117	48					49	my ( $id, $inf, $prod );
118	48	50				137	if( $line =~ m/ID=["']?([^;"']+)["']?;?/i ){
119	48					70	$id = $1;
120							}
121							else {
122	0					0	next;
123							}
124
125	48	50				163	$inf = $1 if ( $line =~ m/inference=([^;]+);/ );
126	48	50				139	$prod = $1 if ( $line =~ m/product=([^;]+)[;\n]/ );
127
128	48					112	my %info = ( 'inference' => $inf, 'product' => $prod );
129	48					100	$verbose{$id} = \%info;
130							}
131	1					43	return \%verbose;
132							}
133
134
135							sub consensus_product_for_id_names {
136	63			63	0	183	my ( $self, $id_names ) = @_;
137	63					126	my %product_freq;
138	63					272	for my $id_name ( @{$id_names} ) {
	63					329
139	109	100				3521	next unless ( defined( $self->_ids_to_product->{$id_name} ) );
140	31					1264	$product_freq{ $self->_ids_to_product->{$id_name} }++;
141							}
142
143	63					241	my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq);
	0					0
144
145	63	100				204	if ( @sorted_product_keys > 0 ) {
146	18					75	return $sorted_product_keys[0];
147							}
148							else {
149	45					247	return '';
150							}
151							}
152
153							sub _builder__groups_to_id_names {
154	29			29		94	my ($self) = @_;
155	29					62	my %groups_to_id_names;
156
157	29	50				1278	open( my $fh, $self->groups_filename )
158							or Bio::Roary::Exceptions::FileNotFound->throw( error => "Group file not found:" . $self->groups_filename );
159	29					769	while (<$fh>) {
160	98					218	chomp;
161	98					198	my $line = $_;
162	98	50				625	if ( $line =~ /^(.+): (.+)$/ ) {
163	98					427	my $group_name = $1;
164	98					207	my $genes = $2;
165	98					532	my @elements = split( /[\s\t]+/, $genes );
166	98					871	$groups_to_id_names{$group_name} = \@elements;
167							}
168							}
169
170	29					1395	return \%groups_to_id_names;
171							}
172
173							sub _groups {
174	47			47		133	my ($self) = @_;
175	47					112	my @groups = keys %{ $self->_groups_to_id_names };
	47					1697
176	47					310	return \@groups;
177							}
178
179							sub _ids_grouped_by_gene_name_for_group {
180	142			142		472	my ( $self, $group_name ) = @_;
181	142					235	my %gene_name_freq;
182	142					223	for my $id_name ( @{ $self->_groups_to_id_names->{$group_name} } ) {
	142					4098
183	243	100	66			6960	if ( defined( $self->_ids_to_gene_names->{$id_name} ) && $self->_ids_to_gene_names->{$id_name} ne "" ) {
184	57					71	push( @{ $gene_name_freq{ $self->_ids_to_gene_names->{$id_name} } }, $id_name );
	57					1157
185							}
186							}
187	142					425	return \%gene_name_freq;
188							}
189
190							sub _consensus_gene_name_for_group {
191	142			142		392	my ( $self, $group_name ) = @_;
192	142					445	my $gene_name_freq = $self->_ids_grouped_by_gene_name_for_group($group_name);
193
194	142					234	my @sorted_gene_names = sort { @{ $gene_name_freq->{$b} } <=> @{ $gene_name_freq->{$a} } } keys %{$gene_name_freq};
	6					12
	6					16
	6					27
	142					495
195	142	100				510	if ( @sorted_gene_names > 0 ) {
196	28					87	return shift(@sorted_gene_names);
197							}
198							else {
199	114					480	return $group_name;
200							}
201							}
202
203							sub _build_group_nucleotide_lengths
204							{
205	21			21		63	my ($self) = @_;
206	21					48	my %group_nucleotide_lengths;
207	21					50	for my $group_name (keys %{ $self->_groups_to_id_names } )
	21					867
208							{
209	63					121	my @gene_lengths;
210	63					100	for my $gene_id (@{$self->_groups_to_id_names->{$group_name}})
	63					1944
211							{
212	109					3349	my $current_gene_size = $self->_ids_to_gene_size->{$gene_id};
213	109	100				309	next unless(defined($current_gene_size) );
214	31	50				68	next if($current_gene_size < 1);
215	31					124	push(@gene_lengths, $current_gene_size);
216							}
217
218	63	100				207	next if(@gene_lengths == 0);
219	18		50			92	my $average_gene_size = (int((sum @gene_lengths)/@gene_lengths)) \|\| 0;
220	18		50			63	my $min_gene_size = (min @gene_lengths) \|\| 0;
221	18		50			54	my $max_gene_size = (max @gene_lengths) \|\| 0;
222	18					114	$group_nucleotide_lengths{$group_name} = {'min' => $min_gene_size, 'max' =>$max_gene_size , 'average' => $average_gene_size};
223							}
224	21					777	return \%group_nucleotide_lengths;
225							}
226
227							sub _generate_groups_to_consensus_gene_names {
228	28			28		65	my ($self) = @_;
229	28					71	my %groups_to_gene_names;
230							my %gene_name_freq;
231	28					1588	my $group_prefix = $self->_group_default_prefix;
232
233							#Â These are already annotated
234	28					82	for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } }
	116					339
	116					2856
	116					3088
235	28					1314	keys %{ $self->_groups_to_id_names } )
236							{
237	91	50				605	next if ( $group_name =~ /$group_prefix/ );
238	0					0	$groups_to_gene_names{$group_name} = $group_name;
239							}
240
241	28					178	for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } }
	116					198
	116					2825
	116					2717
242	28					848	keys %{ $self->_groups_to_id_names } )
243							{
244	91	50				495	next unless ( $group_name =~ /$group_prefix/ );
245	91					302	my $consensus_gene_name = $self->_consensus_gene_name_for_group($group_name);
246
247	91	50				221	if ( defined( $gene_name_freq{$consensus_gene_name} ) ) {
248	0					0	$groups_to_gene_names{$group_name} = $group_name;
249							}
250							else {
251	91					203	$groups_to_gene_names{$group_name} = $consensus_gene_name;
252	91					284	$gene_name_freq{$consensus_gene_name}++;
253							}
254							}
255	28					1111	return \%groups_to_gene_names;
256							}
257
258							sub _build__groups_to_consensus_gene_names {
259	3			3		8	my ($self) = @_;
260	3					13	return $self->_generate_groups_to_consensus_gene_names;
261							}
262
263							sub _build__number_of_files {
264	0			0		0	my ($self) = @_;
265	0					0	return @{ $self->gff_files };
	0					0
266							}
267
268
269							sub _split_groups {
270	25			25		64	my ($self) = @_;
271
272	25					113	$self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names );
273	25					146	$self->_ids_to_groups( $self->_generate__ids_to_groups );
274							}
275
276							sub _remove_ids_from_group {
277	0			0		0	my ( $self, $ids_to_remove, $group ) = @_;
278
279	0					0	my @remaining_ids = array_minus( @{ $self->_groups_to_id_names->{$group} }, @{ $ids_to_remove } );
	0					0
	0					0
280	0					0	$self->_groups_to_id_names->{$group} = \@remaining_ids;
281	0	0				0	if ( @{ $self->_groups_to_id_names->{$group} } == 0 ) {
	0					0
282	0					0	delete( $self->_groups_to_id_names->{$group} );
283							}
284							}
285
286							sub reannotate {
287	25			25	0	91	my ($self) = @_;
288
289	25					136	$self->_split_groups;
290
291	25					68	my %groups_to_id_names = %{ $self->_groups_to_id_names };
	25					823
292	25					162	for
293	75					141	my $group_name ( sort { @{ $groups_to_id_names{$b} } <=> @{ $groups_to_id_names{$a} } } keys %groups_to_id_names )
	75					170
	75					181
294							{
295	70					2092	my $consensus_gene_name = $self->_groups_to_consensus_gene_names->{$group_name};
296	70					1822	print { $self->_output_fh } $consensus_gene_name . ": "
297	70					114	. join( "\t", @{ $self->_groups_to_id_names->{$group_name} } ) . "\n";
	70					1887
298							}
299	25					736	close( $self->_output_fh );
300	25					200	return $self;
301							}
302
303							sub full_annotation {
304	7			7	0	17	my ( $self, $group ) = @_;
305
306	7					10	my @id_names = @{ $self->_groups_to_id_names->{$group} };
	7					200
307
308	7					13	my %product_freq;
309	7					14	for my $id_name ( @id_names ) {
310	12	100				301	next unless ( defined( $self->_ids_to_verbose_stats->{$id_name}->{'product'} ) );
311	10					245	$product_freq{ $self->_ids_to_verbose_stats->{$id_name}->{'product'} }++;
312							}
313
314	7					25	my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq);
	0					0
315
316	7	100				16	if ( @sorted_product_keys > 0 ) {
317	6					39	return join('; ', @sorted_product_keys);
318							}
319							else {
320	1					4	return '';
321							}
322
323							}
324
325							sub inference {
326	7			7	0	15	my ( $self, $group ) = @_;
327
328	7					9	my @infs;
329	7					8	foreach my $g ( @{ $self->_groups_to_id_names->{$group} } ){
	7					176
330	12	100				280	next unless ( defined $self->_ids_to_verbose_stats->{$g}->{'inference'} );
331	10					244	push( @infs, $self->_ids_to_verbose_stats->{$g}->{'inference'} );
332							}
333
334							# maybe make a consensus in the future?
335
336	7					25	return $infs[0];
337							}
338
339	17			17		193	no Moose;
	17					45
	17					201
340							__PACKAGE__->meta->make_immutable;
341
342							1;
343
344							__END__
345
346							=pod
347
348							=encoding UTF-8
349
350							=head1 NAME
351
352							Bio::Roary::AnnotateGroups - Take in a group file and associated GFF files for the isolates and update the group name to the gene name
353
354							=head1 VERSION
355
356							version 3.10.1
357
358							=head1 SYNOPSIS
359
360							Take in a group file and associated GFF files for the isolates and update the group name to the gene name
361							use Bio::Roary::AnnotateGroups;
362
363							my $obj = Bio::Roary::AnnotateGroups->new(
364							gff_files => ['abc.gff','efg.gff'],
365							output_filename => 'example_output.fa',
366							groups_filename => 'groupsfile',
367							);
368							$obj->reannotate;
369
370							=head1 AUTHOR
371
372							Andrew J. Page <ap13@sanger.ac.uk>
373
374							=head1 COPYRIGHT AND LICENSE
375
376							This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
377
378							This is free software, licensed under:
379
380							The GNU General Public License, Version 3, June 2007
381
382							=cut