File Coverage

blib/lib/Bio/ViennaNGS/AnnoC.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             # -*-CPerl-*-
2             # Last changed Time-stamp: <2014-12-20 00:30:51 mtw>
3              
4             package Bio::ViennaNGS::AnnoC;
5              
6 1     1   32897 use 5.12.0;
  1         5  
  1         60  
7 1     1   599 use version; our $VERSION = qv('0.12_07');
  1         1618  
  1         6  
8 1     1   597 use Bio::ViennaNGS qw(sortbed);
  1         2  
  1         22  
9 1     1   225 use Bio::Tools::GFF;
  0            
  0            
10             use Path::Class;
11             use Carp;
12             use Moose;
13             use namespace::autoclean;
14              
15             has 'accession' => (
16             is => 'rw',
17             isa => 'Str',
18             predicate => 'has_accession',
19             );
20              
21             has 'features' => (
22             is => 'ro',
23             isa => 'HashRef',
24             predicate => 'has_features',
25             default => sub { {} },
26             );
27              
28             has 'nr_features' => (
29             is => 'ro',
30             isa => 'Int',
31             builder => '_get_nr_of_features',
32             lazy => 1,
33             );
34              
35             has 'featstat' => (
36             is => 'ro',
37             isa => 'HashRef',
38             builder => '_set_featstat',
39             predicate => 'has_featstat',
40             lazy => 1,
41             );
42              
43             before 'featstat' => sub {
44             my $self = shift;
45             $self->_get_nr_of_features();
46             };
47              
48             sub _set_featstat {
49             my $self = shift;
50             my $this_function = (caller(0))[3];
51             my %fs = ();
52             confess "ERROR [$this_function] \$self->features not available"
53             unless ($self->has_features);
54             $fs{total} = 0;
55             $fs{origin} = "$this_function ".$VERSION;
56             $fs{count} = $self->nr_features;
57             foreach my $uid ( keys %{$self->features} ){
58             my $gbkey = ${$self->features}{$uid}->{gbkey};
59             $fs{total} += 1;
60             unless (exists $fs{$gbkey}){
61             $fs{$gbkey} = 0;
62             }
63             $fs{$gbkey} += 1;
64             }
65             return \%fs;
66             }
67              
68             sub _get_nr_of_features {
69             my $self = shift;
70             my $this_function = (caller(0))[3];
71             confess "ERROR [$this_function] \$self->features not available"
72             unless ($self->has_features);
73             return (keys %{$self->features});
74             }
75              
76             sub parse_gff {
77             my ($self,$in_file) = @_;
78             my ($i,$gffio,$header,$f,$gbkey);
79             my $this_function = (caller(0))[3];
80              
81             $gffio = Bio::Tools::GFF->new(-file => $in_file,
82             -gff_version => 3,
83             );
84             $gffio->ignore_sequence(1);
85             if ($header = $gffio->next_segment() ){
86             $self->accession( $header->display_id() );
87             }
88             else{ carp "ERROR [$this_function] Cannot parse GFF header\n" }
89              
90             while($f = $gffio->next_feature()) {
91             my ($uid,$feat_name);
92             my @name = my @id = my @gbkeys = ();
93              
94             next if ($f->primary_tag() eq "exon");
95              
96             # 1) determine gbkey of the current feature
97             @gbkeys = $f->get_tag_values("gbkey");
98             $gbkey = $gbkeys[0];
99              
100             # 2) get a unique ID for each feature
101             if ($f->has_tag('ID')){
102             @id = $f->get_tag_values('ID');
103             $uid = $id[0]; # ID=id101
104             }
105             else {
106             croak "ERROR [$this_function] Feature '$gbkey' at pos.\
107             $f->start does not have \'ID\' attribute\n";
108             }
109              
110             # 3) assign parent's unique ID in case a parent record exists
111             if ($f->has_tag('Parent')){
112             @id = $f->get_tag_values('Parent');
113             $uid = $id[0]; # ID=id101
114             }
115              
116             # 4) find a name for the current feature, use 'Name' or 'ID' attribute
117             if ($f->has_tag('Name')){
118             @name = $f->get_tag_values('Name');
119             $feat_name = $name[0];
120             }
121             elsif ($f->has_tag('ID')){
122             @id = $f->get_tag_values('ID');
123             $feat_name = $id[0]; # ID=id101, use ID as feature name
124             }
125             else {
126             croak "ERROR [$this_function] Cannot set name for feature \
127             $f->gbkey at pos. $f->start\n";
128             }
129              
130             unless (exists ${$self->features}{$uid}) { # gene / ribosome_entry_site / etc.
131             ${$self->features}{$uid}->{start} = $f->start;
132             ${$self->features}{$uid}->{end} = $f->end;
133             ${$self->features}{$uid}->{strand} = $f->strand;
134             ${$self->features}{$uid}->{length} = $f->length;
135             ${$self->features}{$uid}->{seqid} = $f->seq_id;
136             ${$self->features}{$uid}->{score} = $f->score || 0;
137             ${$self->features}{$uid}->{gbkey} = $gbkey;
138             ${$self->features}{$uid}->{name} = $feat_name;
139             ${$self->features}{$uid}->{uid} = $uid;
140             }
141             else { # CDS / tRNA / rRNA / etc.
142             ${$self->features}{$uid}->{gbkey} = $gbkey; # gbkey for tRNA/ rRNA/ CDS etc
143             }
144             }
145             $gffio->close();
146             }
147              
148             sub features2bed {
149             my ($self,$gbkey,$dest,$bn,$log) = @_;
150             my ($chrom,$chrom_start,$chrom_end,$name,$score,$strand,$thick_start);
151             my ($thick_end,$reserved,$block_count,$block_sizes,$block_starts);
152             my @ft = ();
153             my $this_function = (caller(0))[3];
154              
155             croak "ERROR [$this_function] $self->features not available"
156             unless ($self->has_features);
157             croak "ERROT [$this_function] $self->featstat not available"
158             unless ($self->has_featstat);
159             croak "ERROR [$this_function] $dest does not exist"
160             unless (-d $dest);
161             if (defined $log){open(LOG, ">>", $log) or croak $!;}
162              
163             if (defined $gbkey){ # dump features of just one genbank key
164             confess "ERROR [$this_function] genbank key \'$gbkey\' N/A in hash "
165             unless (exists ${$self->featstat}{$gbkey});
166             $ft[0] = $gbkey;
167             }
168             else{ # dump features for all genbank keys
169             foreach my $gbk (keys %{$self->featstat}) {
170             next if ($gbk eq 'total' || $gbk eq 'Src' || $gbk eq 'accession' ||
171             $gbk eq 'origin' || $gbk eq 'count');
172             push @ft,$gbk;
173             }
174             }
175              
176             foreach my $f (@ft){
177             my $bedname = file($dest,"$bn.$f.bed");
178             my $bedname_u = file($dest,"$bn.$f.u.bed");
179             open (BEDOUT, "> $bedname_u") or croak $!;
180              
181             # dump unsorted gene annotation from DS to BED12
182             foreach my $uid (keys %{$self->features}){
183             next unless (${$self->features}{$uid}->{gbkey} eq $f);
184             my @bedline = ();
185             $chrom = ${$self->features}{$uid}->{seqid};
186             $chrom_start = ${$self->features}{$uid}->{start};
187             $chrom_start--; # BED is 0-based
188             $chrom_end = ${$self->features}{$uid}->{end};
189             $name = ${$self->features}{$uid}->{name};
190             $score = ${$self->features}{$uid}->{score};
191             $strand = ${$self->features}{$uid}->{strand} == -1 ? '-' : '+'; #default to +
192             $thick_start = $chrom_start;
193             $thick_end = $chrom_end;
194             $reserved = 0;
195             $block_count = 1;
196             $block_sizes = ${$self->features}{$uid}->{length}.",";
197             $block_starts = "0,";
198             @bedline = join ("\t", ($chrom,$chrom_start,$chrom_end,
199             $name,$score,$strand,$thick_start,
200             $thick_end,$reserved,$block_count,
201             $block_sizes, $block_starts));
202             print BEDOUT "@bedline\n";
203             }
204             close (BEDOUT);
205              
206             sortbed($bedname_u,".",$bedname,1,undef); # sort bed file
207              
208             } # end foreach
209             if (defined $log){close(LOG)};
210             }
211              
212             sub feature_summary {
213             my ($self, $dest) = @_;
214             my ($fn,$fh);
215             my $this_function = (caller(0))[3];
216              
217             croak "ERROR [$this_function] $dest does not exist\n"
218             unless (-d $dest);
219             croak "ERROR [$this_function] $self->accession not available\n"
220             unless ($self->has_accession);
221              
222             $fn = dir($dest,$self->accession.".summary.txt");
223             open $fh, ">", $fn or croak $!;
224              
225             print $fh "Accession\t ".$self->accession."\n";
226             print $fh "Origin \t ${$self->featstat}{origin}\n";
227             foreach my $ft (sort keys %{$self->featstat}){
228             next if ($ft =~ /total/ || $ft =~ /accession/ || $ft =~ /origin/);
229             print $fh "$ft\t${$self->featstat}{$ft}\n";
230             }
231             print $fh "Total\t${$self->featstat}{total}\n";
232             close $fh;
233             }
234              
235             __PACKAGE__->meta->make_immutable;
236              
237             1;
238              
239             __END__
240              
241             =head1 NAME
242              
243             Bio::ViennaNGS::AnnoC - Object-oriented interface for storing and
244             converting biological sequence annotation formats
245              
246             =head1 SYNOPSIS
247              
248             use Bio::ViennaNGS::AnnoC;
249              
250             my $obj = Bio::ViennaNGS::AnnoC->new();
251              
252             # parse GFF3 file to internal data straucture
253             $obj->parse_gff($gff3_file);
254              
255             # compute summary of parsed annotation
256             $obj->featstat;
257              
258             # dump feature summary to file
259             $obj->feature_summary($dest);
260              
261             # dump all tRNAs contained in data structure as BED12
262             $obj->features2bed("tRNA",$dest,$bn,$log)
263              
264             =head1 DESCRIPTION
265              
266             This module provides an object-oriented interface for storing and
267             converting biological sequence annotation data. Based on the C<Moose>
268             object system, it maintains a central data structure which is curently
269             designed to represent simple, non-spliced (ie single-exon) annotation
270             data. Future versions of the module will account for more generic
271             scenarios, including spliced isoforms.
272              
273             =head1 METHODS
274              
275             =over
276              
277             =item parse_gff
278              
279             Title : parse_gff
280             Usage : $obj->parse_gff($gff3_file);
281             Function: Parses GFF3 annotation files of non-spliced genomes into
282             C<$self->features>
283             Args : The full path to a GFF3 file
284             Returns :
285             Notes : The GFF3 specification is available at
286             L<http://www.sequenceontology.org/resources/gff3.html>.
287             This routine has been tested with NCBI bacteria GFF3
288             annotation.
289              
290             =item feature_summary
291              
292             Title : feature_summary
293             Usage : $obj->feature_summary($dest);
294             Function: Generate a summary file for all features present in
295             C<$self->features>
296             Args : Full output path for summary.txt file
297             Returns :
298              
299              
300             =item features2bed
301              
302             Title : features2bed
303             Usage : $obj->features2bed($feature,$workdir,$bn,$log);
304             Function: Dumps genomic features from C<$self->features> hash to a
305             BED12 file.
306             Args : C<$gbkey> can be either a string corresponding to a
307             genbank key in C<$self->featstat> or C<undef>. If defined,
308             only features of the speficied key will be dumped to a single
309             BED12 file. If C<$gbkey> is C<undef>, BED12 files will be
310             generated for each type present in C<$self->featstat>.
311             C<$dest> is the output directory and C<$bn> the basename for
312             all output files. C<$log> is either be the full path to a
313             logfile or C<undef>.
314             Returns :
315              
316             =back
317              
318             =head1 DEPENDENCIES
319              
320             =over 4
321              
322             =item L<Bio::Tools::GFF>
323              
324             =item L<IPC::Cmd>
325              
326             =item L<Path::Class>
327              
328             =item L<Carp>
329              
330             =back
331              
332             =head1 AUTHORS
333              
334             Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
335              
336             =head1 COPYRIGHT AND LICENSE
337              
338             Copyright (C) 2014 Michael T. Wolfinger E<lt>michael@wolfinger.euE<gt>
339              
340             This library is free software; you can redistribute it and/or modify
341             it under the same terms as Perl itself, either Perl version 5.12.4 or,
342             at your option, any later version of Perl 5 you may have available.
343              
344             This program is distributed in the hope that it will be useful, but
345             WITHOUT ANY WARRANTY; without even the implied warranty of
346             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
347             General Public License for more details.
348              
349              
350             =cut