File Coverage

blib/lib/Bio/ViennaNGS/AnnoC.pm
Criterion Covered Total %
statement 21 163 12.8
branch 0 52 0.0
condition 0 20 0.0
subroutine 7 12 58.3
pod 3 3 100.0
total 31 250 12.4


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2015-02-06 16:27:40 mtw>
3              
4             package Bio::ViennaNGS::AnnoC;
5              
6 1     1   32698 use version; our $VERSION = qv('0.12_15');
  1         1337  
  1         4  
7 1     1   529 use Bio::ViennaNGS::Util qw(sortbed);
  1         2  
  1         77  
8 1     1   700 use Bio::Tools::GFF;
  1         71144  
  1         33  
9 1     1   8 use Path::Class;
  1         1  
  1         52  
10 1     1   4 use Carp;
  1         1  
  1         47  
11 1     1   5 use Moose;
  1         1  
  1         8  
12 1     1   5287 use namespace::autoclean;
  1         1123  
  1         4  
13              
14             has 'accession' => (
15             is => 'rw',
16             isa => 'Str',
17             predicate => 'has_accession',
18             );
19              
20             has 'features' => (
21             is => 'ro',
22             isa => 'HashRef',
23             predicate => 'has_features',
24             default => sub { {} },
25             );
26              
27             has 'nr_features' => (
28             is => 'ro',
29             isa => 'Int',
30             builder => '_get_nr_of_features',
31             lazy => 1,
32             );
33              
34             has 'featstat' => (
35             is => 'ro',
36             isa => 'HashRef',
37             builder => '_set_featstat',
38             predicate => 'has_featstat',
39             lazy => 1,
40             );
41              
42             before 'featstat' => sub {
43             my $self = shift;
44             $self->_get_nr_of_features();
45             };
46              
47             sub _set_featstat {
48 0     0     my $self = shift;
49 0           my $this_function = (caller(0))[3];
50 0           my %fs = ();
51 0 0         confess "ERROR [$this_function] \$self->features not available"
52             unless ($self->has_features);
53 0           $fs{total} = 0;
54 0           $fs{origin} = "$this_function ".$VERSION;
55 0           $fs{count} = $self->nr_features;
56 0           foreach my $uid ( keys %{$self->features} ){
  0            
57 0           my $gbkey = ${$self->features}{$uid}->{gbkey};
  0            
58 0           $fs{total} += 1;
59 0 0         unless (exists $fs{$gbkey}){
60 0           $fs{$gbkey} = 0;
61             }
62 0           $fs{$gbkey} += 1;
63             }
64 0           return \%fs;
65             }
66              
67             sub _get_nr_of_features {
68 0     0     my $self = shift;
69 0           my $this_function = (caller(0))[3];
70 0 0         confess "ERROR [$this_function] \$self->features not available"
71             unless ($self->has_features);
72 0           return (keys %{$self->features});
  0            
73             }
74              
75             sub parse_gff {
76 0     0 1   my ($self,$in_file) = @_;
77 0           my ($i,$gffio,$header,$f,$gbkey);
78 0           my $this_function = (caller(0))[3];
79              
80 0           $gffio = Bio::Tools::GFF->new(-file => $in_file,
81             -gff_version => 3,
82             );
83 0           $gffio->ignore_sequence(1);
84 0 0         if ($header = $gffio->next_segment() ){
85 0           $self->accession( $header->display_id() );
86             }
87 0           else{ carp "ERROR [$this_function] Cannot parse GFF header\n" }
88              
89 0           while($f = $gffio->next_feature()) {
90 0           my ($uid,$feat_name);
91 0           my @name = my @id = my @gbkeys = ();
92              
93 0 0         next if ($f->primary_tag() eq "exon");
94              
95             # 1) determine gbkey of the current feature
96 0           @gbkeys = $f->get_tag_values("gbkey");
97 0           $gbkey = $gbkeys[0];
98              
99             # 2) get a unique ID for each feature
100 0 0         if ($f->has_tag('ID')){
101 0           @id = $f->get_tag_values('ID');
102 0           $uid = $id[0]; # ID=id101
103             }
104             else {
105 0           croak "ERROR [$this_function] Feature '$gbkey' at pos.\
106             $f->start does not have \'ID\' attribute\n";
107             }
108              
109             # 3) assign parent's unique ID in case a parent record exists
110 0 0         if ($f->has_tag('Parent')){
111 0           @id = $f->get_tag_values('Parent');
112 0           $uid = $id[0]; # ID=id101
113             }
114              
115             # 4) find a name for the current feature, use 'Name' or 'ID' attribute
116 0 0         if ($f->has_tag('Name')){
    0          
117 0           @name = $f->get_tag_values('Name');
118 0           $feat_name = $name[0];
119             }
120             elsif ($f->has_tag('ID')){
121 0           @id = $f->get_tag_values('ID');
122 0           $feat_name = $id[0]; # ID=id101, use ID as feature name
123             }
124             else {
125 0           croak "ERROR [$this_function] Cannot set name for feature \
126             $f->gbkey at pos. $f->start\n";
127             }
128              
129 0 0         unless (exists ${$self->features}{$uid}) { # gene / ribosome_entry_site / etc.
  0            
130 0           ${$self->features}{$uid}->{start} = $f->start;
  0            
131 0           ${$self->features}{$uid}->{end} = $f->end;
  0            
132 0           ${$self->features}{$uid}->{strand} = $f->strand;
  0            
133 0           ${$self->features}{$uid}->{length} = $f->length;
  0            
134 0           ${$self->features}{$uid}->{seqid} = $f->seq_id;
  0            
135 0   0       ${$self->features}{$uid}->{score} = $f->score || 0;
  0            
136 0           ${$self->features}{$uid}->{gbkey} = $gbkey;
  0            
137 0           ${$self->features}{$uid}->{name} = $feat_name;
  0            
138 0           ${$self->features}{$uid}->{uid} = $uid;
  0            
139             }
140             else { # CDS / tRNA / rRNA / etc.
141 0           ${$self->features}{$uid}->{gbkey} = $gbkey; # gbkey for tRNA/ rRNA/ CDS etc
  0            
142             }
143             }
144 0           $gffio->close();
145             }
146              
147             sub features2bed {
148 0     0 1   my ($self,$gbkey,$dest,$bn,$log) = @_;
149 0           my ($chrom,$chrom_start,$chrom_end,$name,$score,$strand,$thick_start);
150 0           my ($thick_end,$reserved,$block_count,$block_sizes,$block_starts);
151 0           my @ft = ();
152 0           my $this_function = (caller(0))[3];
153              
154 0 0         croak "ERROR [$this_function] $self->features not available"
155             unless ($self->has_features);
156 0 0         croak "ERROT [$this_function] $self->featstat not available"
157             unless ($self->has_featstat);
158 0 0         croak "ERROR [$this_function] $dest does not exist"
159             unless (-d $dest);
160 0 0         if (defined $log){open(LOG, ">>", $log) or croak $!;}
  0 0          
161              
162 0 0         if (defined $gbkey){ # dump features of just one genbank key
163 0           confess "ERROR [$this_function] genbank key \'$gbkey\' N/A in hash "
164 0 0         unless (exists ${$self->featstat}{$gbkey});
165 0           $ft[0] = $gbkey;
166             }
167             else{ # dump features for all genbank keys
168 0           foreach my $gbk (keys %{$self->featstat}) {
  0            
169 0 0 0       next if ($gbk eq 'total' || $gbk eq 'Src' || $gbk eq 'accession' ||
      0        
      0        
      0        
170             $gbk eq 'origin' || $gbk eq 'count');
171 0           push @ft,$gbk;
172             }
173             }
174              
175 0           foreach my $f (@ft){
176 0           my $bedname = file($dest,"$bn.$f.bed");
177 0           my $bedname_u = file($dest,"$bn.$f.u.bed");
178 0 0         open (BEDOUT, "> $bedname_u") or croak $!;
179              
180             # dump unsorted gene annotation from DS to BED12
181 0           foreach my $uid (keys %{$self->features}){
  0            
182 0 0         next unless (${$self->features}{$uid}->{gbkey} eq $f);
  0            
183 0           my @bedline = ();
184 0           $chrom = ${$self->features}{$uid}->{seqid};
  0            
185 0           $chrom_start = ${$self->features}{$uid}->{start};
  0            
186 0           $chrom_start--; # BED is 0-based
187 0           $chrom_end = ${$self->features}{$uid}->{end};
  0            
188 0           $name = ${$self->features}{$uid}->{name};
  0            
189 0           $score = ${$self->features}{$uid}->{score};
  0            
190 0 0         $strand = ${$self->features}{$uid}->{strand} == -1 ? '-' : '+'; #default to +
  0            
191 0           $thick_start = $chrom_start;
192 0           $thick_end = $chrom_end;
193 0           $reserved = 0;
194 0           $block_count = 1;
195 0           $block_sizes = ${$self->features}{$uid}->{length}.",";
  0            
196 0           $block_starts = "0,";
197 0           @bedline = join ("\t", ($chrom,$chrom_start,$chrom_end,
198             $name,$score,$strand,$thick_start,
199             $thick_end,$reserved,$block_count,
200             $block_sizes, $block_starts));
201 0           print BEDOUT "@bedline\n";
202             }
203 0           close (BEDOUT);
204              
205 0           sortbed($bedname_u,".",$bedname,1,undef); # sort bed file
206              
207             } # end foreach
208 0 0         if (defined $log){close(LOG)};
  0            
209             }
210              
211             sub feature_summary {
212 0     0 1   my ($self, $dest) = @_;
213 0           my ($fn,$fh);
214 0           my $this_function = (caller(0))[3];
215              
216 0 0         croak "ERROR [$this_function] $dest does not exist\n"
217             unless (-d $dest);
218 0 0         croak "ERROR [$this_function] $self->accession not available\n"
219             unless ($self->has_accession);
220              
221 0           $fn = dir($dest,$self->accession.".summary.txt");
222 0 0         open $fh, ">", $fn or croak $!;
223              
224 0           print $fh "Accession\t ".$self->accession."\n";
225 0           print $fh "Origin \t ${$self->featstat}{origin}\n";
  0            
226 0           foreach my $ft (sort keys %{$self->featstat}){
  0            
227 0 0 0       next if ($ft =~ /total/ || $ft =~ /accession/ || $ft =~ /origin/);
      0        
228 0           print $fh "$ft\t${$self->featstat}{$ft}\n";
  0            
229             }
230 0           print $fh "Total\t${$self->featstat}{total}\n";
  0            
231 0           close $fh;
232             }
233              
234             __PACKAGE__->meta->make_immutable;
235              
236             1;
237              
238             __END__
239              
240             =head1 NAME
241              
242             Bio::ViennaNGS::AnnoC - Object-oriented interface for storing and
243             converting biological sequence annotation formats
244              
245             =head1 SYNOPSIS
246              
247             use Bio::ViennaNGS::AnnoC;
248              
249             my $obj = Bio::ViennaNGS::AnnoC->new();
250              
251             # parse GFF3 file to internal data straucture
252             $obj->parse_gff($gff3_file);
253              
254             # compute summary of parsed annotation
255             $obj->featstat;
256              
257             # dump feature summary to file
258             $obj->feature_summary($dest);
259              
260             # dump all tRNAs contained in data structure as BED12
261             $obj->features2bed("tRNA",$dest,$bn,$log)
262              
263             =head1 DESCRIPTION
264              
265             This module provides an object-oriented interface for storing and
266             converting biological sequence annotation data. Based on the C<Moose>
267             object system, it maintains a central data structure which is curently
268             designed to represent simple, non-spliced (ie single-exon) annotation
269             data. Future versions of the module will account for more generic
270             scenarios, including spliced isoforms.
271              
272             =head1 METHODS
273              
274             =over
275              
276             =item parse_gff
277              
278             Title : parse_gff
279             Usage : C<<$obj->parse_gff($gff3_file);>>
280             Function: Parses GFF3 annotation files of non-spliced genomes into
281             C<<$self->features>>
282             Args : The full path to a GFF3 file
283             Returns :
284             Notes : The GFF3 specification is available at
285             L<http://www.sequenceontology.org/resources/gff3.html>. This
286             routine has been tested with NCBI bacteria GFF3 annotation.
287              
288             =item feature_summary
289              
290             Title : feature_summary
291             Usage : C<<$obj->feature_summary($dest);>>
292             Function : Generate a summary file for all features present in
293             C<<$self->features>>
294             Args : Full output path for summary.txt file
295             Returns :
296              
297             =item features2bed
298              
299             Title : features2bed
300             Usage : C<<$obj->features2bed($feature,$workdir,$bn,$log);>>
301             Function : Dumps genomic features from C<$self->features> hash to a
302             BED12 file.
303             Args : C<<$gbkey>> can be either a string corresponding to a genbank
304             key in C<<$self->featstat>> or C<undef>. If defined, only
305             features of the speficied key will be dumped to a single BED12
306             file. If C<$gbkey> is C<undef>, BED12 files will be generated
307             for each type present in C<<$self->featstat>>. C<$dest> is the
308             output directory and C<$bn> the basename for all output
309             files. C<$log> is either be the full path to a logfile or
310             C<undef>.
311             Returns :
312              
313             =back
314              
315             =head1 DEPENDENCIES
316              
317             =over
318              
319             =item L<Bio::Tools::GFF>
320              
321             =item L<IPC::Cmd>
322              
323             =item L<Path::Class>
324              
325             =item L<Carp>
326              
327             =back
328              
329             =head1 AUTHORS
330              
331             Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
332              
333             =head1 COPYRIGHT AND LICENSE
334              
335             Copyright (C) 2014-2015 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
336              
337             This library is free software; you can redistribute it and/or modify
338             it under the same terms as Perl itself, either Perl version 5.10.0 or,
339             at your option, any later version of Perl 5 you may have available.
340              
341             This program is distributed in the hope that it will be useful, but
342             WITHOUT ANY WARRANTY; without even the implied warranty of
343             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
344             General Public License for more details.
345              
346              
347             =cut