File Coverage

lib/Bio/Roary/AnnotateGroups.pm

Criterion	Covered	Total	%
statement	202	220	91.8
branch	31	44	70.4
condition	5	9	55.5
subroutine	27	30	90.0
pod	0	5	0.0
total	265	308	86.0

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Roary::AnnotateGroups;
2							$Bio::Roary::AnnotateGroups::VERSION = '3.11.0';
3							# ABSTRACT: Take in a group file and associated GFF files for the isolates and update the group name to the gene name
4
5
6	17			17		821402	use Moose;
	17					1207380
	17					117
7	17			17		115183	use Bio::Roary::Exceptions;
	17					37
	17					422
8	17			17		4514	use Bio::Roary::GeneNamesFromGFF;
	17					62
	17					956
9	17			17		7127	use Array::Utils qw(array_minus);
	17					6249
	17					1339
10	17			17		119	use List::Util qw(max min sum);
	17					35
	17					1158
11	17			17		4622	use File::Grep qw(fgrep);
	17					18957
	17					36436
12
13							has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
14							has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'reannotated_groups_file' );
15							has 'groups_filename' => ( is => 'ro', isa => 'Str', required => 1 );
16							has '_ids_to_gene_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__ids_to_gene_names' );
17							has '_ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
18							has '_ids_to_gene_size' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
19							has 'group_nucleotide_lengths' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_nucleotide_lengths');
20
21							has '_groups_to_id_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__groups_to_id_names' );
22							has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );
23							has '_groups_to_consensus_gene_names' =>
24							( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_consensus_gene_names' );
25							has '_filtered_gff_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__filtered_gff_files' );
26							has '_number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__number_of_files' );
27							has '_ids_to_groups' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_builder__ids_to_groups' );
28							has '_group_counter' => ( is => 'rw', isa => 'Int', lazy => 1, builder => '_builder__group_counter' );
29							has '_group_default_prefix' => ( is => 'rw', isa => 'Str', default => 'group_' );
30							has '_ids_to_verbose_stats' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 );
31
32							sub BUILD {
33	31			31	0	131	my ($self) = @_;
34	31					1124	$self->_ids_to_gene_names;
35							}
36
37							sub _builder__group_counter {
38	1			1		2	my ($self) = @_;
39	1					19	my $prefix = $self->_group_default_prefix;
40	1					2	my $highest_group = 0;
41	1					2	for my $group ( @{ $self->_groups } ) {
	1					3
42	6	50				39	if ( $group =~ /$prefix([\d]+)$/ ) {
43	6					12	my $group_id = $1;
44	6	100				10	if ( $group_id > $highest_group ) {
45	3					7	$highest_group = $group_id;
46							}
47							}
48							}
49	1					21	return $highest_group + 1;
50							}
51
52							sub _generate__ids_to_groups {
53	25			25		84	my ($self) = @_;
54	25					50	my %ids_to_groups;
55
56	25					41	for my $group ( keys %{ $self->_groups_to_id_names } ) {
	25					657
57	70					98	for my $id_name ( @{ $self->_groups_to_id_names->{$group} } ) {
	70					1660
58	122					387	$ids_to_groups{$id_name} = $group;
59							}
60							}
61	25					798	return \%ids_to_groups;
62							}
63
64							sub _builder__ids_to_groups {
65	0			0		0	my ($self) = @_;
66	0					0	return $self->_generate__ids_to_groups;
67							}
68
69							sub _build__output_fh {
70	25			25		67	my ($self) = @_;
71	25	50				842	open( my $fh, '>', $self->output_filename )
72							or Bio::Roary::Exceptions::CouldntWriteToFile->throw(
73							error => "Couldnt write output file:" . $self->output_filename );
74	25					747	return $fh;
75							}
76
77							sub _build__filtered_gff_files {
78	31			31		78	my ($self) = @_;
79	31					58	my @gff_files = grep( /\.gff$/, @{ $self->gff_files } );
	31					849
80	31					822	return \@gff_files;
81							}
82
83							sub _build__ids_to_gene_names {
84	31			31		102	my ($self) = @_;
85	31					139	my %ids_to_gene_names;
86							my %ids_to_product;
87	31					0	my %ids_to_gene_size;
88	31					62	for my $filename ( @{ $self->_filtered_gff_files } ) {
	31					1059
89	39					1058	my $gene_names_from_gff = Bio::Roary::GeneNamesFromGFF->new( gff_file => $filename );
90	39					64	my %id_to_gene_lookup = %{ $gene_names_from_gff->ids_to_gene_name };
	39					840
91	39					241	@ids_to_gene_names{ keys %id_to_gene_lookup } = values %id_to_gene_lookup;
92
93	39					129	my %id_to_product_lookup = %{ $gene_names_from_gff->ids_to_product };
	39					931
94	39					332	@ids_to_product{ keys %id_to_product_lookup } = values %id_to_product_lookup;
95
96	39					85	my %ids_to_gene_size_lookup = %{ $gene_names_from_gff->ids_to_gene_size };
	39					863
97	39					1246	@ids_to_gene_size{ keys %ids_to_gene_size_lookup } = values %ids_to_gene_size_lookup;
98							}
99	31					922	$self->_ids_to_product( \%ids_to_product );
100	31					907	$self->_ids_to_gene_size( \%ids_to_gene_size );
101
102	31					845	return \%ids_to_gene_names;
103							}
104
105							sub _build__ids_to_verbose_stats {
106	1			1		4	my $self = shift;
107
108	1			813		10	my @matches_hash = fgrep { /ID=/i } @{ $self->_filtered_gff_files };
	813					8283
	1					33
109	1					43	my @matches;
110	1					4	foreach my $m ( @matches_hash ){
111	3					4	push( @matches, values %{$m->{matches}} );
	3					36
112							}
113							# chomp @matches;
114
115	1					3	my %verbose;
116	1					3	foreach my $line ( @matches ){
117	48					52	my ( $id, $inf, $prod );
118	48	50				112	if( $line =~ m/ID=["']?([^;"']+)["']?;?/i ){
119	48					72	$id = $1;
120							}
121							else {
122	0					0	next;
123							}
124
125	48	50				166	$inf = $1 if ( $line =~ m/inference=([^;]+);/ );
126	48	50				163	$prod = $1 if ( $line =~ m/product=([^;]+)[;\n]/ );
127
128	48					111	my %info = ( 'inference' => $inf, 'product' => $prod );
129	48					112	$verbose{$id} = \%info;
130							}
131	1					66	return \%verbose;
132							}
133
134
135							sub consensus_product_for_id_names {
136	63			63	0	128	my ( $self, $id_names ) = @_;
137	63					88	my %product_freq;
138	63					118	for my $id_name ( @{$id_names} ) {
	63					168
139	109	100				2697	next unless ( defined( $self->_ids_to_product->{$id_name} ) );
140	31					652	$product_freq{ $self->_ids_to_product->{$id_name} }++;
141							}
142
143	63					190	my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq);
	0					0
144
145	63	100				201	if ( @sorted_product_keys > 0 ) {
146	18					59	return $sorted_product_keys[0];
147							}
148							else {
149	45					202	return '';
150							}
151							}
152
153							sub _builder__groups_to_id_names {
154	29			29		84	my ($self) = @_;
155	29					54	my %groups_to_id_names;
156
157	29	50				1153	open( my $fh, $self->groups_filename )
158							or Bio::Roary::Exceptions::FileNotFound->throw( error => "Group file not found:" . $self->groups_filename );
159	29					442	while (<$fh>) {
160	98					168	chomp;
161	98					172	my $line = $_;
162	98	50				534	if ( $line =~ /^(.+): (.+)$/ ) {
163	98					281	my $group_name = $1;
164	98					185	my $genes = $2;
165	98					423	my @elements = split( /[\s\t]+/, $genes );
166	98					568	$groups_to_id_names{$group_name} = \@elements;
167							}
168							}
169
170	29					1121	return \%groups_to_id_names;
171							}
172
173							sub _groups {
174	47			47		108	my ($self) = @_;
175	47					74	my @groups = keys %{ $self->_groups_to_id_names };
	47					1302
176	47					420	return \@groups;
177							}
178
179							sub _ids_grouped_by_gene_name_for_group {
180	142			142		248	my ( $self, $group_name ) = @_;
181	142					245	my %gene_name_freq;
182	142					200	for my $id_name ( @{ $self->_groups_to_id_names->{$group_name} } ) {
	142					3359
183	243	100	66			5670	if ( defined( $self->_ids_to_gene_names->{$id_name} ) && $self->_ids_to_gene_names->{$id_name} ne "" ) {
184	57					63	push( @{ $gene_name_freq{ $self->_ids_to_gene_names->{$id_name} } }, $id_name );
	57					1029
185							}
186							}
187	142					353	return \%gene_name_freq;
188							}
189
190							sub _consensus_gene_name_for_group {
191	142			142		279	my ( $self, $group_name ) = @_;
192	142					347	my $gene_name_freq = $self->_ids_grouped_by_gene_name_for_group($group_name);
193
194	142					232	my @sorted_gene_names = sort { @{ $gene_name_freq->{$b} } <=> @{ $gene_name_freq->{$a} } } keys %{$gene_name_freq};
	6					11
	6					11
	6					27
	142					420
195	142	100				364	if ( @sorted_gene_names > 0 ) {
196	28					85	return shift(@sorted_gene_names);
197							}
198							else {
199	114					464	return $group_name;
200							}
201							}
202
203							sub _build_group_nucleotide_lengths
204							{
205	21			21		50	my ($self) = @_;
206	21					45	my %group_nucleotide_lengths;
207	21					38	for my $group_name (keys %{ $self->_groups_to_id_names } )
	21					520
208							{
209	63					98	my @gene_lengths;
210	63					73	for my $gene_id (@{$self->_groups_to_id_names->{$group_name}})
	63					1471
211							{
212	109					2354	my $current_gene_size = $self->_ids_to_gene_size->{$gene_id};
213	109	100				257	next unless(defined($current_gene_size) );
214	31	50				44	next if($current_gene_size < 1);
215	31					46	push(@gene_lengths, $current_gene_size);
216							}
217
218	63	100				161	next if(@gene_lengths == 0);
219	18		50			75	my $average_gene_size = (int((sum @gene_lengths)/@gene_lengths)) \|\| 0;
220	18		50			48	my $min_gene_size = (min @gene_lengths) \|\| 0;
221	18		50			38	my $max_gene_size = (max @gene_lengths) \|\| 0;
222	18					65	$group_nucleotide_lengths{$group_name} = {'min' => $min_gene_size, 'max' =>$max_gene_size , 'average' => $average_gene_size};
223							}
224	21					499	return \%group_nucleotide_lengths;
225							}
226
227							sub _generate_groups_to_consensus_gene_names {
228	28			28		73	my ($self) = @_;
229	28					77	my %groups_to_gene_names;
230							my %gene_name_freq;
231	28					943	my $group_prefix = $self->_group_default_prefix;
232
233							#Â These are already annotated
234	28					55	for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } }
	118					177
	118					2548
	118					2398
235	28					821	keys %{ $self->_groups_to_id_names } )
236							{
237	91	50				464	next if ( $group_name =~ /$group_prefix/ );
238	0					0	$groups_to_gene_names{$group_name} = $group_name;
239							}
240
241	28					68	for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } }
	118					164
	118					2392
	118					2324
242	28					703	keys %{ $self->_groups_to_id_names } )
243							{
244	91	50				382	next unless ( $group_name =~ /$group_prefix/ );
245	91					228	my $consensus_gene_name = $self->_consensus_gene_name_for_group($group_name);
246
247	91	50				211	if ( defined( $gene_name_freq{$consensus_gene_name} ) ) {
248	0					0	$groups_to_gene_names{$group_name} = $group_name;
249							}
250							else {
251	91					196	$groups_to_gene_names{$group_name} = $consensus_gene_name;
252	91					221	$gene_name_freq{$consensus_gene_name}++;
253							}
254							}
255	28					902	return \%groups_to_gene_names;
256							}
257
258							sub _build__groups_to_consensus_gene_names {
259	3			3		7	my ($self) = @_;
260	3					14	return $self->_generate_groups_to_consensus_gene_names;
261							}
262
263							sub _build__number_of_files {
264	0			0		0	my ($self) = @_;
265	0					0	return @{ $self->gff_files };
	0					0
266							}
267
268
269							sub _split_groups {
270	25			25		61	my ($self) = @_;
271
272	25					141	$self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names );
273	25					139	$self->_ids_to_groups( $self->_generate__ids_to_groups );
274							}
275
276							sub _remove_ids_from_group {
277	0			0		0	my ( $self, $ids_to_remove, $group ) = @_;
278
279	0					0	my @remaining_ids = array_minus( @{ $self->_groups_to_id_names->{$group} }, @{ $ids_to_remove } );
	0					0
	0					0
280	0					0	$self->_groups_to_id_names->{$group} = \@remaining_ids;
281	0	0				0	if ( @{ $self->_groups_to_id_names->{$group} } == 0 ) {
	0					0
282	0					0	delete( $self->_groups_to_id_names->{$group} );
283							}
284							}
285
286							sub reannotate {
287	25			25	0	73	my ($self) = @_;
288
289	25					111	$self->_split_groups;
290
291	25					68	my %groups_to_id_names = %{ $self->_groups_to_id_names };
	25					623
292	25					163	for
293	80					116	my $group_name ( sort { @{ $groups_to_id_names{$b} } <=> @{ $groups_to_id_names{$a} } } keys %groups_to_id_names )
	80					147
	80					171
294							{
295	70					1871	my $consensus_gene_name = $self->_groups_to_consensus_gene_names->{$group_name};
296	70					1555	print { $self->_output_fh } $consensus_gene_name . ": "
297	70					118	. join( "\t", @{ $self->_groups_to_id_names->{$group_name} } ) . "\n";
	70					1733
298							}
299	25					585	close( $self->_output_fh );
300	25					190	return $self;
301							}
302
303							sub full_annotation {
304	7			7	0	18	my ( $self, $group ) = @_;
305
306	7					8	my @id_names = @{ $self->_groups_to_id_names->{$group} };
	7					150
307
308	7					11	my %product_freq;
309	7					10	for my $id_name ( @id_names ) {
310	12	100				228	next unless ( defined( $self->_ids_to_verbose_stats->{$id_name}->{'product'} ) );
311	10					180	$product_freq{ $self->_ids_to_verbose_stats->{$id_name}->{'product'} }++;
312							}
313
314	7					25	my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq);
	0					0
315
316	7	100				15	if ( @sorted_product_keys > 0 ) {
317	6					34	return join('; ', @sorted_product_keys);
318							}
319							else {
320	1					4	return '';
321							}
322
323							}
324
325							sub inference {
326	7			7	0	12	my ( $self, $group ) = @_;
327
328	7					9	my @infs;
329	7					8	foreach my $g ( @{ $self->_groups_to_id_names->{$group} } ){
	7					143
330	12	100				259	next unless ( defined $self->_ids_to_verbose_stats->{$g}->{'inference'} );
331	10					230	push( @infs, $self->_ids_to_verbose_stats->{$g}->{'inference'} );
332							}
333
334							# maybe make a consensus in the future?
335
336	7					29	return $infs[0];
337							}
338
339	17			17		157	no Moose;
	17					39
	17					150
340							__PACKAGE__->meta->make_immutable;
341
342							1;
343
344							__END__
345
346							=pod
347
348							=encoding UTF-8
349
350							=head1 NAME
351
352							Bio::Roary::AnnotateGroups - Take in a group file and associated GFF files for the isolates and update the group name to the gene name
353
354							=head1 VERSION
355
356							version 3.11.0
357
358							=head1 SYNOPSIS
359
360							Take in a group file and associated GFF files for the isolates and update the group name to the gene name
361							use Bio::Roary::AnnotateGroups;
362
363							my $obj = Bio::Roary::AnnotateGroups->new(
364							gff_files => ['abc.gff','efg.gff'],
365							output_filename => 'example_output.fa',
366							groups_filename => 'groupsfile',
367							);
368							$obj->reannotate;
369
370							=head1 AUTHOR
371
372							Andrew J. Page <ap13@sanger.ac.uk>
373
374							=head1 COPYRIGHT AND LICENSE
375
376							This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
377
378							This is free software, licensed under:
379
380							The GNU General Public License, Version 3, June 2007
381
382							=cut