File Coverage

lib/Bio/Roary/ReformatInputGFFs.pm
Criterion Covered Total %
statement 120 129 93.0
branch 24 38 63.1
condition 3 3 100.0
subroutine 16 16 100.0
pod 0 1 0.0
total 163 187 87.1


line stmt bran cond sub pod time code
1             package Bio::Roary::ReformatInputGFFs;
2             $Bio::Roary::ReformatInputGFFs::VERSION = '3.10.1';
3             # ABSTRACT: Take in gff files and add suffix where a gene id is seen twice
4              
5              
6 2     2   85722 use Moose;
  2         375262  
  2         13  
7 2     2   12940 use Bio::Roary::Exceptions;
  2         3  
  2         49  
8 2     2   14 use Cwd;
  2         5  
  2         141  
9 2     2   326 use File::Copy;
  2         1796  
  2         116  
10 2     2   590 use Log::Log4perl qw(:easy);
  2         34338  
  2         14  
11 2     2   1751 use Bio::Tools::GFF;
  2         78549  
  2         56  
12 2     2   12 use File::Path qw(make_path);
  2         3  
  2         84  
13 2     2   10 use File::Basename;
  2         3  
  2         93  
14 2     2   626 use Digest::MD5::File qw(file_md5_hex);
  2         52321  
  2         13  
15              
16             has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
17             has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );
18             has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => '(CDS|ncRNA|tRNA|tmRNA|rRNA)' );
19             has 'output_directory' => ( is => 'ro', isa => 'Str', default => 'fixed_input_files' );
20             has 'suffix_counter' => ( is => 'rw', isa => 'Int', default => 1 );
21              
22             has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
23              
24             sub _build_logger {
25 3     3   4 my ($self) = @_;
26 3         26 Log::Log4perl->easy_init( $ERROR );
27 3         6815 my $logger = get_logger();
28 3         214 return $logger;
29             }
30              
31             sub fix_duplicate_gene_ids {
32 4     4 0 561 my ($self) = @_;
33              
34 4         8 my %gene_ids_seen_before;
35            
36             my %file_md5s;
37            
38 4         6 for my $file ( @{ $self->gff_files } ) {
  4         114  
39 9         45 my $digest = file_md5_hex($file);
40            
41 9 100       1575 if(defined($file_md5s{$digest}))
42             {
43             $self->logger->warn(
44 1         27 "Input files have identical MD5 hashes, only using the first file: ".$file_md5s{$digest}." == ".$file
45             );
46 1         7 next;
47             }
48             else
49             {
50 8         23 $file_md5s{$digest} = $file;
51             }
52            
53 8         13 my $ids_seen = 0;
54 8         26 my $ids_from_file = $self->_get_ids_for_gff_file($file);
55              
56 8 50       746 if ( @{$ids_from_file} < 1 ) {
  8         32  
57 0         0 $self->logger->error(
58             "Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file"
59             );
60             }
61             else {
62 8         15 for my $gene_id ( @{$ids_from_file} ) {
  8         22  
63 33 100       69 if ( $gene_ids_seen_before{$gene_id} ) {
64 2         56 $self->logger->error(
65             "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: $file "
66             );
67 2         742 my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file, $digest);
68 2 50       6 push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) );
  2         67  
69 2         3 $ids_seen = 1;
70 2         5 last;
71             }
72 31         73 $gene_ids_seen_before{$gene_id}++;
73             }
74            
75             # We know its a valid GFF file since we could open it and extract IDs.
76             # We need to make sure the filenames end in .gff. If it contained duplicate IDs, then they are fixed so nothing to do, but
77             # if they didnt, then we have to double check and repair if necessary.
78 8 100       29 if ( $ids_seen == 0 ) {
79            
80            
81 6         12 push( @{ $self->fixed_gff_files }, $self->_fix_gff_file_extension($file) );
  6         201  
82             }
83             }
84             }
85 4         33 return 1;
86             }
87              
88             sub _fix_gff_file_extension
89             {
90 6     6   16 my ( $self, $input_file ) = @_;
91            
92 6         270 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
93 6 50       45 return $input_file if($suffix eq '.gff');
94            
95            
96 0 0       0 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
97 0         0 my $output_file = $self->output_directory . '/' . $filename . '.gff';
98 0 0       0 copy($input_file, $output_file) or $self->logger->error("Couldnt copy file with invalid gff extention: $input_file -> $output_file");
99 0         0 return $output_file;
100             }
101              
102              
103             sub _add_suffix_to_gene_ids_and_return_new_file {
104 3     3   9 my ( $self, $input_file, $digest ) = @_;
105 3         92 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
106 3 50       78 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
107 3         79 my $output_file = $self->output_directory . '/' . $filename . '.gff';
108              
109 3         84 open( my $input_gff_fh, $input_file );
110 3         143 open( my $out_gff_fh, '>', $output_file );
111            
112             # There is a chance that there can be a collision here, but its remote.
113 3         13 my $random_locus_tag = "".$digest;
114            
115 3         72 $self->logger->warn(
116             "Renamed GFF file from: $input_file -> $output_file" );
117 3         72 $self->logger->warn(
118             "Locus tag used is '$random_locus_tag' for file: $input_file" );
119              
120 3         18 my $found_fasta = 0;
121 3         6 my $gene_counter = 1;
122 3         66 while (<$input_gff_fh>) {
123 663         670 my $line = $_;
124              
125 663 100       865 if ( $line =~ /^\#\#FASTA/ ) {
126 3         4 $found_fasta = 1;
127             }
128              
129 663 100 100     1311 if ( $line =~ /\#/ || $found_fasta == 1 ) {
130 645         554 print {$out_gff_fh} $line;
  645         849  
131 645         1063 next;
132             }
133              
134 18         61 my @cells = split( /\t/, $line );
135 18         44 my @tags = split( /;/, $cells[8] );
136 18         21 my $found_id = 0;
137 18         30 for ( my $i = 0 ; $i < @tags ; $i++ ) {
138 52 100       109 if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) {
139 10         16 my $current_id = $2;
140 10         207 $current_id .= '___' . $self->suffix_counter;
141 10         28 $tags[$i] = $1 .$random_locus_tag.'_'. $gene_counter . $3;
142 10         13 $gene_counter++;
143 10         8 $found_id++;
144 10         12 last;
145             }
146             }
147 18 100       26 if ( $found_id == 0 ) {
148 8         15 unshift( @tags, 'ID=' . $random_locus_tag.'_'. $gene_counter );
149 8         9 $gene_counter++;
150             }
151 18         41 $cells[8] = join( ';', @tags );
152 18         19 print {$out_gff_fh} join( "\t", @cells );
  18         101  
153             }
154              
155 3 50       7 if ( $found_fasta == 0 ) {
156 0         0 $self->logger->warn(
157             "Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" );
158 0         0 return undef;
159             }
160 3         49 close($out_gff_fh);
161 3         10 close($input_gff_fh);
162 3         20 return $output_file;
163             }
164              
165             sub _get_ids_for_gff_file {
166 10     10   590 my ( $self, $file ) = @_;
167 10         18 my @gene_ids;
168 10         269 my $tags_regex = $self->_tags_to_filter;
169 10         79 my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 );
170 10         6367 while ( my $feature = $gffio->next_feature() ) {
171 100 100       43328 next if !( $feature->primary_tag =~ /$tags_regex/ );
172 51         563 my $gene_id = $self->_get_feature_id($feature);
173 51 50       220 push( @gene_ids, $gene_id ) if ( defined($gene_id) );
174             }
175 10         52426 return \@gene_ids;
176             }
177              
178             sub _get_feature_id {
179 51     51   86 my ( $self, $feature ) = @_;
180 51         67 my ( $gene_id, @junk );
181 51 50       98 if ( $feature->has_tag('ID') ) {
    0          
182 51         324 ( $gene_id, @junk ) = $feature->get_tag_values('ID');
183             }
184             elsif ( $feature->has_tag('locus_tag') ) {
185 0         0 ( $gene_id, @junk ) = $feature->get_tag_values('locus_tag');
186             }
187             else {
188 0         0 return undef;
189             }
190 51         480 $gene_id =~ s!["']!!g;
191 51 50       98 return undef if ( $gene_id eq "" );
192 51         89 return $gene_id;
193             }
194              
195 2     2   2286 no Moose;
  2         4  
  2         16  
196             __PACKAGE__->meta->make_immutable;
197              
198             1;
199              
200             __END__
201              
202             =pod
203              
204             =encoding UTF-8
205              
206             =head1 NAME
207              
208             Bio::Roary::ReformatInputGFFs - Take in gff files and add suffix where a gene id is seen twice
209              
210             =head1 VERSION
211              
212             version 3.10.1
213              
214             =head1 SYNOPSIS
215              
216             Take in gff files and add suffix where a gene id is seen twice
217             use Bio::Roary::ReformatInputGFFs;
218              
219             my $obj = Bio::Roary::PrepareInputFiles->new(
220             gff_files => ['abc.gff','ddd.faa'],
221             );
222             $obj->fix_duplicate_gene_ids;
223             $obj->fixed_gff_files;
224              
225             =head1 AUTHOR
226              
227             Andrew J. Page <ap13@sanger.ac.uk>
228              
229             =head1 COPYRIGHT AND LICENSE
230              
231             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
232              
233             This is free software, licensed under:
234              
235             The GNU General Public License, Version 3, June 2007
236              
237             =cut