File Coverage

Bio/LiveSeq/IO/BioPerl.pm
Criterion Covered Total %
statement 121 156 77.5
branch 23 44 52.2
condition 6 12 50.0
subroutine 7 8 87.5
pod 3 3 100.0
total 160 223 71.7


line stmt bran cond sub pod time code
1             #
2             # bioperl module for Bio::LiveSeq::IO::BioPerl
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Joseph Insana
7             #
8             # Copyright Joseph Insana
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::LiveSeq::IO::BioPerl - Loader for LiveSeq from EMBL entries with BioPerl
17              
18             =head1 SYNOPSIS
19              
20             my $db="EMBL";
21             my $file="../data/M20132";
22             my $id="HSANDREC";
23              
24             my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -file=>"$file");
25             # or
26             my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -id=>"$id");
27              
28             my @translationobjects=$loader->entry2liveseq();
29              
30             my $genename="AR";
31             my $gene=$loader->gene2liveseq(-gene_name => "$genename",
32             -getswissprotinfo => 0);
33              
34             #NOTE1: The only -db now supported is EMBL. Hence it defaults to EMBL.
35             #NOTE2: -file requires a filename (and path if necessary) containing an
36             # EMBL entry
37             # -id will use Bio::DB::EMBL.pm to fetch the sequence from the web,
38             # (bioperl wraparound to [w]getz from SRS)
39             #NOTE3: To retrieve the swissprot (if possible) attached to the embl entry
40             # (to get protein domains at dna level), only Bio::DB::EMBL.pm
41             # is supported under BioPerl. Refer to Bio::LiveSeq::IO::SRS
42             # otherwise.
43             #NOTE4: NOTE3 is not implemented yet for bioperl, working on it
44              
45              
46             =head1 DESCRIPTION
47              
48             This package uses BioPerl (SeqIO) to fetch a sequence database entry,
49             analyse it and create LiveSeq objects out of it.
50              
51             A filename (or an ID that will fetch entry through the web) has to be passed
52             to this package which will return references to all translation objects
53             created from the EMBL entry. References to Transcription, DNA and Exon
54             objects can all be retrieved departing from these.
55              
56             Alternatively, a specific "gene" name can be specified, together with
57             the embl-acc ID. This will create a LiveSeq::Gene object with all
58             relevant gene features attached/created.
59              
60             ATTENTION: if web fetching is requested, the package HTTP::Request needs
61             to be installed.
62              
63              
64             =head1 AUTHOR - Joseph A.L. Insana
65              
66             Email: Insana@ebi.ac.uk, jinsana@gmx.net
67              
68             =head1 APPENDIX
69              
70             The rest of the documentation details each of the object
71             methods. Internal methods are usually preceded with a _
72              
73             =cut
74              
75             # Let the code begin...
76              
77             package Bio::LiveSeq::IO::BioPerl;
78              
79             # TODO->TOCHECK
80             # each_secondary_access not working
81             # why array from each_tag_value($qual) ? When will there be more than one
82             # element in such array?
83             # what is the annotation object? ($seqobj->annotation)
84             # unsatisfied by both BioPerl binomial and SRS "org" to retrieve Organism info
85              
86 2     2   1156 use strict;
  2         2  
  2         52  
87 2     2   6 use Carp qw(cluck croak carp);
  2         2  
  2         94  
88 2     2   6 use vars qw($DBEMBLLOADED);
  2         9  
  2         58  
89 2     2   619 use Bio::SeqIO; # for -file entry loading
  2         4  
  2         87  
90              
91             # Note, the following requires HTTP::Request. If the modules are not installed
92             # uncomment the following and use only -filename and don't request swissprotinfo
93             eval {
94             require Bio::DB::EMBL; # for -id entry loading
95             $DBEMBLLOADED = 1;
96             };
97              
98              
99 2     2   9 use base qw(Bio::LiveSeq::IO::Loader);
  2         3  
  2         739  
100              
101             # This package can in the future host other databases loading subroutines.
102             # e.g. ensembl2hash
103              
104             =head2 load
105              
106             Title : load
107             Usage : my $filename="../data/M20132";
108             $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -file=>"$filename");
109             or
110             $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -id=>"HSANDREC");
111              
112             Function: loads an entry with BioPerl from a database into a hash
113             Returns : reference to a new object of class IO::BioPerl holding an entry
114             Errorcode 0
115             Args : an filename containing an EMBL entry OR an ID or ACCESSION code
116              
117             =cut
118              
119             sub load {
120 3     3 1 19 my ($thing, %args) = @_;
121 3   33     19 my $class = ref($thing) || $thing;
122 3         4 my ($obj,%loader);
123              
124 3         12 my ($db,$filename,$id)=($args{-db},$args{-file},$args{-id});
125              
126 3 100       10 if (defined($db)) {
127 2 50       8 unless ($db eq "EMBL") {
128 0         0 carp "Note: only EMBL now supported!";
129 0         0 return(0);
130             }
131             } else {
132 1         2 $db="EMBL";
133             }
134              
135 3 50 33     14 if (defined($id) && defined($filename)) {
136 0         0 carp "You can either specify a -id or a -filename!";
137 0         0 return(0);
138             }
139              
140 3 50 33     16 unless (defined($id) || defined($filename)) {
141 0         0 carp "You must specify either a -id or a -filename!";
142 0         0 return(0);
143             }
144              
145 3         5 my $hashref;
146 3 50       10 if ($db eq "EMBL") {
147 3         3 my $test_transl=0; # change to 0 to avoid comparison of translation
148              
149             # these can be changed for future needs
150 3         13 my @embl_valid_feature_names=qw(CDS CDS_span exon prim_transcript intron repeat_unit repeat_region mRNA);
151 3         9 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
152              
153             # dunno yet how to implement test_transl again....
154             # probably on a one-on-one basis with each translation?
155 3 50       6 if ($test_transl) {
156 0         0 push (@embl_valid_qual_names,"translation"); # needed for test_transl
157             }
158              
159 3         5 my $seqobj; # bioperl sequence object, to be passed to embl2hash
160              
161 3 50       8 if (defined($filename)) {
162 3         19 my $stream = Bio::SeqIO->new('-file' => $filename, '-format' => 'EMBL');
163 3         10 $seqobj = $stream->next_seq();
164             } else { # i.e. if -id
165            
166 0 0       0 if( $DBEMBLLOADED ) {
167 0         0 my $embl = Bio::DB::EMBL->new();
168 0         0 $seqobj = $embl->get_Seq_by_id($id); # EMBL ID or ACC
169             } else {
170 0         0 my $root = Bio::Root::Root->new();
171 0         0 $root->warn("Must have HTTP::Request::Common installed, cannot run load without the -filename option specified, see docs for Bio::LiveSeq::IO::BioPerl");
172 0         0 return;
173             }
174             }
175              
176 3         13 $hashref=&embl2hash($seqobj,\@embl_valid_feature_names,\@embl_valid_qual_names);
177             }
178 3 50       13 unless ($hashref) { return (0); }
  0         0  
179              
180 3         37 %loader = (db => $db, filename => $filename, id => $id, hash => $hashref);
181 3         5 $obj = \%loader;
182 3         15 $obj = bless $obj, $class;
183 3         15 return $obj;
184             }
185              
186             =head2 embl2hash
187              
188             Title : embl2hash
189             Function: retrieves with BioPerl an EMBL entry, parses it and creates
190             a hash that contains all the information.
191             Returns : a reference to a hash
192             Errorcode: 0
193             Args : a BioPerl Sequence Object (from file or web fetching)
194             two array references to skip features and qualifiers (for
195             performance)
196             Example: @valid_features=qw(CDS exon prim_transcript mRNA);
197             @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
198             $hashref=&embl2hash($seqobj,\@valid_features,\@valid_qualifiers);
199              
200             =cut
201              
202             # arguments: Bioperl $seqobj
203             # to skip features and qualifiers (for performance), two array
204             # references must be passed (this can change into string arguments to
205             # be passed....)
206             # returns: a reference to a hash containing the important features requested
207             sub embl2hash {
208 3     3 1 5 my $seqobj=$_[0];
209 3         5 my %valid_features; my %valid_names;
210 3 50       7 if ($_[1]) {
211 3         3 %valid_features = map {$_, 1} @{$_[1]}; # to skip features
  24         41  
  3         9  
212             }
213 3 50       20 if ($_[2]) {
214 3         4 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
  24         35  
  3         10  
215             }
216              
217 3         11 my $annobj = $seqobj->annotation(); # what's this?
218              
219 3         13 my $entry_Sequence = lc($seqobj->seq()); # SRS returns lowercase
220              
221 3         13 my $entry_ID = $seqobj->display_id;
222 3         10 my $entry_AccNumber = $seqobj->accession; # or maybe accession_number ?
223 3         8 my $secondary_acc; # to fetch the other acc numbers
224 3         9 foreach $secondary_acc ($seqobj->get_secondary_accessions) { # not working!
225 1         3 $entry_AccNumber .= " $secondary_acc";
226             }
227 3         7 my $entry_Molecule = $seqobj->molecule; # this alone returns molec+division
228 3         6 my $entry_Division = $seqobj->division;
229             # fixed: now Molecule works in BioPerl, no need for next lines
230             #my @Molecule=split(" ",$entry_Molecule);
231             #my $entry_Division = pop(@Molecule); # only division
232             #$entry_Molecule = join(" ",@Molecule); # only molecule
233 3         16 my $entry_Description = $seqobj->desc;
234              
235 3         10 my $speciesobj = $seqobj->species;
236 3         11 my $entry_Organism = $speciesobj->binomial;
237              
238 3         13 my $entry_SeqLength = $seqobj->length;
239            
240             # put into the hash
241 3         4 my %entryhash;
242 3         8 $entryhash{ID}=$entry_ID;
243 3         6 $entryhash{AccNumber}=$entry_AccNumber;
244 3         4 $entryhash{Molecule}=$entry_Molecule;
245 3         6 $entryhash{Division}=$entry_Division;
246 3         4 $entryhash{Description}=$entry_Description;
247 3         8 $entryhash{Organism}=$entry_Organism;
248 3         4 $entryhash{Sequence}=$entry_Sequence;
249 3         8 $entryhash{SeqLength}=$entry_SeqLength;
250              
251 3         14 my @topfeatures=$seqobj->top_SeqFeatures();
252             # create features array
253 3         4 my $featuresnumber= scalar(@topfeatures);
254 3         5 $entryhash{FeaturesNumber}=$featuresnumber;
255 3         4 my $feature_name;
256 0         0 my @feature_qual_names; my @feature_qual_value;
257 0         0 my ($feature_qual_name,$feature_qual_number);
258 0         0 my @features;
259              
260 0         0 my ($feat,$qual,$subfeat);
261 0         0 my @subfeat;
262 3         4 my $i=0;
263 3         4 foreach $feat (@topfeatures) {
264 34         24 my %feature;
265 34         46 $feature_name = $feat->primary_tag;
266 34 100       55 unless ($valid_features{$feature_name}) {
267             #print "skipping $feature_name\n";
268 10         12 next;
269             }
270             # works ok with 0.6.2
271             # if ($feature_name eq "CDS_span") { # case of CDS with various exons 0.6.2
272             # $feature_name="CDS"; # 0.6.2
273 24         29 my $featlocation=$feat->location; # 0.7
274 24 100 100     64 if (($feature_name eq "CDS")&&($featlocation->isa('Bio::Location::SplitLocationI'))) { # case of CDS with various exons BioPerl 0.7
275             # @subfeat=$feat->sub_SeqFeature; # 0.6.2
276 2         5 @subfeat=$featlocation->sub_Location(); # 0.7
277 2         4 my @transcript;
278 2         4 foreach $subfeat (@subfeat) {
279 15         10 my @range;
280 15 50       21 if ($subfeat->strand == -1) {
281 0         0 @range=($subfeat->end,$subfeat->start,$subfeat->strand);
282             } else {
283 15         19 @range=($subfeat->start,$subfeat->end,$subfeat->strand);
284             }
285 15         21 push (@transcript,\@range);
286             }
287 2         8 $feature{range}=\@transcript;
288             } else {
289 22         14 my @range;
290 22 50       30 ($feat->strand == -1) ? (@range = ($feat->end, $feat->start, $feat->strand) ) :
291             (@range = ( $feat->start,$feat->end,$feat->strand) );
292             # works ok with 0.6.2
293 22 100       38 if ($feature_name eq "CDS") { # case of single exon CDS (CDS name but not split location)
294 1         2 my @transcript=(\@range);
295 1         2 $feature{range}=\@transcript;
296             } else { # all other range features
297 21         35 $feature{range}=\@range;
298             }
299             }
300 24         23 $feature{location}="deprecated";
301            
302 24         17 $feature{position}=$i;
303 24         23 $feature{name}=$feature_name;
304            
305 24         35 @feature_qual_names= $feat->all_tags();
306 24         24 $feature_qual_number= scalar(@feature_qual_names);
307            
308 24         20 $feature{qual_number}=$feature_qual_number;
309            
310 24         20 my %feature_qualifiers;
311 24         24 for $qual (@feature_qual_names) {
312 41         31 $feature_qual_name=$qual;
313 41 100       54 unless ($valid_names{$feature_qual_name}) {
314 6         8 next;
315             }
316 35         44 @feature_qual_value=$feat->each_tag_value($qual);
317             #print "$qual => @feature_qual_value \n";
318 35         48 $feature_qualifiers{$feature_qual_name}=$feature_qual_value[0]; # ?
319             # maybe the whole array should be entered, not just the 1st element?
320             # what could be the other elements? TOCHECK!
321             }
322 24         22 $feature{qualifiers}=\%feature_qualifiers;
323 24         29 push (@features,\%feature); # array of features
324 24         28 $i++;
325             }
326 3         5 $entryhash{Features}=\@features; # put this also into the hash
327            
328 3         5 my @cds; # array just of CDSs
329 3         7 for $i (0..$#features) {
330 24 100       35 if ($features[$i]->{'name'} eq "CDS") {
331 3         5 push(@cds,$features[$i]);
332             }
333             }
334 3         9 $entryhash{CDS}=\@cds; # put this also into the hash
335 3         28 return (\%entryhash);
336             }
337              
338             =head2 novelaasequence2gene
339              
340             Title : novelaasequence2gene
341             Usage : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
342             : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
343             -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141");
344             : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
345             -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141",
346             -translation_table => "2",
347             -gene_name => "tyr-kinase");
348              
349             Function: creates LiveSeq objects from a novel amino acid sequence,
350             using codon usage information (loaded from a file) to choose
351             codons according to relative frequencies.
352             If a codon_usage information is not specified,
353             the default is to use Homo sapiens data (taxonomy ID 9606).
354             If a translation_table ID is not specified, it will default to 1
355             (standard code).
356             Returns : reference to a Gene object containing references to LiveSeq objects
357             Errorcode 0
358             Args : string containing an amino acid sequence
359             string (optional) with codon usage data (64 integer numbers)
360             string (optional) specifying a gene_name
361             integer (optional) specifying a translation_table ID
362              
363             =cut
364              
365             sub novelaasequence2gene {
366 0     0 1   my ($self, %args) = @_;
367 0           my ($gene_name,$cusg_data,$aasequence,$ttabid)=($args{-gene_name},$args{-cusg_data},$args{-aasequence},$args{-translation_table});
368              
369 0           my @species_codon_usage;
370 0 0         unless ($aasequence) {
371 0           carp "aasequence not given";
372 0           return (0);
373             }
374 0 0         unless ($gene_name) {
375 0           $gene_name="Novel Unknown";
376             }
377 0 0         unless ($ttabid) {
378 0           $ttabid=1;
379             }
380 0 0         unless ($cusg_data) {
381 0           @species_codon_usage=
382             qw(68664 118404 126679 51100 125600 123646 75667 210903 435317
383             139009 79303 135218 128429 192616 49456 161556 211962 131222
384             162837 213626 69346 140780 182506 219428 76684 189374 173010
385             310626 82647 202329 180955 250410 180001 118798 76398 160764
386             317359 119013 262630 359627 218376 186915 130857 377006 162826
387             113684 317703 441298 287040 245435 174805 133427 134523 108740
388             225633 185619 78463 240138 174021 244236 142435 8187 5913
389             14381); # updated 21Jul2000
390             } else {
391 0           @species_codon_usage=split(/ /,$cusg_data);
392             }
393            
394 0           my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
395 0           return ($gene);
396             }
397              
398             1;