File Coverage

lib/Bio/Roary/ReformatInputGFFs.pm
Criterion Covered Total %
statement 120 129 93.0
branch 24 38 63.1
condition 3 3 100.0
subroutine 16 16 100.0
pod 0 1 0.0
total 163 187 87.1


line stmt bran cond sub pod time code
1             package Bio::Roary::ReformatInputGFFs;
2             $Bio::Roary::ReformatInputGFFs::VERSION = '3.11.0';
3             # ABSTRACT: Take in gff files and add suffix where a gene id is seen twice
4              
5              
6 2     2   91874 use Moose;
  2         400851  
  2         13  
7 2     2   12161 use Bio::Roary::Exceptions;
  2         3  
  2         42  
8 2     2   10 use Cwd;
  2         3  
  2         398  
9 2     2   321 use File::Copy;
  2         1766  
  2         92  
10 2     2   517 use Log::Log4perl qw(:easy);
  2         31144  
  2         14  
11 2     2   1686 use Bio::Tools::GFF;
  2         78278  
  2         58  
12 2     2   25 use File::Path qw(make_path);
  2         3  
  2         90  
13 2     2   11 use File::Basename;
  2         3  
  2         99  
14 2     2   599 use Digest::MD5::File qw(file_md5_hex);
  2         48870  
  2         52  
15              
16             has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
17             has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );
18             has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => '(CDS|ncRNA|tRNA|tmRNA|rRNA)' );
19             has 'output_directory' => ( is => 'ro', isa => 'Str', default => 'fixed_input_files' );
20             has 'suffix_counter' => ( is => 'rw', isa => 'Int', default => 1 );
21              
22             has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
23              
24             sub _build_logger {
25 3     3   7 my ($self) = @_;
26 3         23 Log::Log4perl->easy_init( $ERROR );
27 3         6572 my $logger = get_logger();
28 3         253 return $logger;
29             }
30              
31             sub fix_duplicate_gene_ids {
32 4     4 0 594 my ($self) = @_;
33              
34 4         9 my %gene_ids_seen_before;
35            
36             my %file_md5s;
37            
38 4         8 for my $file ( @{ $self->gff_files } ) {
  4         137  
39 9         41 my $digest = file_md5_hex($file);
40            
41 9 100       1543 if(defined($file_md5s{$digest}))
42             {
43             $self->logger->warn(
44 1         26 "Input files have identical MD5 hashes, only using the first file: ".$file_md5s{$digest}." == ".$file
45             );
46 1         8 next;
47             }
48             else
49             {
50 8         28 $file_md5s{$digest} = $file;
51             }
52            
53 8         14 my $ids_seen = 0;
54 8         25 my $ids_from_file = $self->_get_ids_for_gff_file($file);
55              
56 8 50       635 if ( @{$ids_from_file} < 1 ) {
  8         28  
57 0         0 $self->logger->error(
58             "Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file"
59             );
60             }
61             else {
62 8         13 for my $gene_id ( @{$ids_from_file} ) {
  8         20  
63 33 100       58 if ( $gene_ids_seen_before{$gene_id} ) {
64 2         57 $self->logger->error(
65             "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: $file "
66             );
67 2         663 my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file, $digest);
68 2 50       7 push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) );
  2         57  
69 2         4 $ids_seen = 1;
70 2         4 last;
71             }
72 31         59 $gene_ids_seen_before{$gene_id}++;
73             }
74            
75             # We know its a valid GFF file since we could open it and extract IDs.
76             # We need to make sure the filenames end in .gff. If it contained duplicate IDs, then they are fixed so nothing to do, but
77             # if they didnt, then we have to double check and repair if necessary.
78 8 100       25 if ( $ids_seen == 0 ) {
79            
80            
81 6         9 push( @{ $self->fixed_gff_files }, $self->_fix_gff_file_extension($file) );
  6         211  
82             }
83             }
84             }
85 4         34 return 1;
86             }
87              
88             sub _fix_gff_file_extension
89             {
90 6     6   17 my ( $self, $input_file ) = @_;
91            
92 6         258 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
93 6 50       43 return $input_file if($suffix eq '.gff');
94            
95            
96 0 0       0 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
97 0         0 my $output_file = $self->output_directory . '/' . $filename . '.gff';
98 0 0       0 copy($input_file, $output_file) or $self->logger->error("Couldnt copy file with invalid gff extention: $input_file -> $output_file");
99 0         0 return $output_file;
100             }
101              
102              
103             sub _add_suffix_to_gene_ids_and_return_new_file {
104 3     3   10 my ( $self, $input_file, $digest ) = @_;
105 3         90 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
106 3 50       111 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
107 3         81 my $output_file = $self->output_directory . '/' . $filename . '.gff';
108              
109 3         82 open( my $input_gff_fh, $input_file );
110 3         117 open( my $out_gff_fh, '>', $output_file );
111            
112             # There is a chance that there can be a collision here, but its remote.
113 3         9 my $random_locus_tag = "".$digest;
114            
115 3         75 $self->logger->warn(
116             "Renamed GFF file from: $input_file -> $output_file" );
117 3         75 $self->logger->warn(
118             "Locus tag used is '$random_locus_tag' for file: $input_file" );
119              
120 3         18 my $found_fasta = 0;
121 3         5 my $gene_counter = 1;
122 3         57 while (<$input_gff_fh>) {
123 663         680 my $line = $_;
124              
125 663 100       834 if ( $line =~ /^\#\#FASTA/ ) {
126 3         4 $found_fasta = 1;
127             }
128              
129 663 100 100     1323 if ( $line =~ /\#/ || $found_fasta == 1 ) {
130 645         536 print {$out_gff_fh} $line;
  645         804  
131 645         968 next;
132             }
133              
134 18         53 my @cells = split( /\t/, $line );
135 18         46 my @tags = split( /;/, $cells[8] );
136 18         19 my $found_id = 0;
137 18         52 for ( my $i = 0 ; $i < @tags ; $i++ ) {
138 52 100       107 if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) {
139 10         17 my $current_id = $2;
140 10         208 $current_id .= '___' . $self->suffix_counter;
141 10         25 $tags[$i] = $1 .$random_locus_tag.'_'. $gene_counter . $3;
142 10         12 $gene_counter++;
143 10         14 $found_id++;
144 10         11 last;
145             }
146             }
147 18 100       27 if ( $found_id == 0 ) {
148 8         16 unshift( @tags, 'ID=' . $random_locus_tag.'_'. $gene_counter );
149 8         8 $gene_counter++;
150             }
151 18         43 $cells[8] = join( ';', @tags );
152 18         18 print {$out_gff_fh} join( "\t", @cells );
  18         86  
153             }
154              
155 3 50       9 if ( $found_fasta == 0 ) {
156 0         0 $self->logger->warn(
157             "Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" );
158 0         0 return undef;
159             }
160 3         55 close($out_gff_fh);
161 3         10 close($input_gff_fh);
162 3         19 return $output_file;
163             }
164              
165             sub _get_ids_for_gff_file {
166 10     10   555 my ( $self, $file ) = @_;
167 10         14 my @gene_ids;
168 10         281 my $tags_regex = $self->_tags_to_filter;
169 10         79 my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 );
170 10         6004 while ( my $feature = $gffio->next_feature() ) {
171 100 100       41140 next if !( $feature->primary_tag =~ /$tags_regex/ );
172 51         535 my $gene_id = $self->_get_feature_id($feature);
173 51 50       200 push( @gene_ids, $gene_id ) if ( defined($gene_id) );
174             }
175 10         48584 return \@gene_ids;
176             }
177              
178             sub _get_feature_id {
179 51     51   84 my ( $self, $feature ) = @_;
180 51         66 my ( $gene_id, @junk );
181 51 50       90 if ( $feature->has_tag('ID') ) {
    0          
182 51         281 ( $gene_id, @junk ) = $feature->get_tag_values('ID');
183             }
184             elsif ( $feature->has_tag('locus_tag') ) {
185 0         0 ( $gene_id, @junk ) = $feature->get_tag_values('locus_tag');
186             }
187             else {
188 0         0 return undef;
189             }
190 51         427 $gene_id =~ s!["']!!g;
191 51 50       96 return undef if ( $gene_id eq "" );
192 51         86 return $gene_id;
193             }
194              
195 2     2   2215 no Moose;
  2         6  
  2         14  
196             __PACKAGE__->meta->make_immutable;
197              
198             1;
199              
200             __END__
201              
202             =pod
203              
204             =encoding UTF-8
205              
206             =head1 NAME
207              
208             Bio::Roary::ReformatInputGFFs - Take in gff files and add suffix where a gene id is seen twice
209              
210             =head1 VERSION
211              
212             version 3.11.0
213              
214             =head1 SYNOPSIS
215              
216             Take in gff files and add suffix where a gene id is seen twice
217             use Bio::Roary::ReformatInputGFFs;
218              
219             my $obj = Bio::Roary::PrepareInputFiles->new(
220             gff_files => ['abc.gff','ddd.faa'],
221             );
222             $obj->fix_duplicate_gene_ids;
223             $obj->fixed_gff_files;
224              
225             =head1 AUTHOR
226              
227             Andrew J. Page <ap13@sanger.ac.uk>
228              
229             =head1 COPYRIGHT AND LICENSE
230              
231             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
232              
233             This is free software, licensed under:
234              
235             The GNU General Public License, Version 3, June 2007
236              
237             =cut