File Coverage

Bio/LiveSeq/IO/BioPerl.pm
Criterion Covered Total %
statement 126 156 80.7
branch 23 44 52.2
condition 6 12 50.0
subroutine 7 8 87.5
pod 3 3 100.0
total 165 223 73.9


line stmt bran cond sub pod time code
1             #
2             # bioperl module for Bio::LiveSeq::IO::BioPerl
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Joseph Insana
7             #
8             # Copyright Joseph Insana
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::LiveSeq::IO::BioPerl - Loader for LiveSeq from EMBL entries with BioPerl
17              
18             =head1 SYNOPSIS
19              
20             my $db="EMBL";
21             my $file="../data/M20132";
22             my $id="HSANDREC";
23              
24             my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -file=>"$file");
25             # or
26             my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -id=>"$id");
27              
28             my @translationobjects=$loader->entry2liveseq();
29              
30             my $genename="AR";
31             my $gene=$loader->gene2liveseq(-gene_name => "$genename",
32             -getswissprotinfo => 0);
33              
34             #NOTE1: The only -db now supported is EMBL. Hence it defaults to EMBL.
35             #NOTE2: -file requires a filename (and path if necessary) containing an
36             # EMBL entry
37             # -id will use Bio::DB::EMBL.pm to fetch the sequence from the web,
38             # (bioperl wraparound to [w]getz from SRS)
39             #NOTE3: To retrieve the swissprot (if possible) attached to the embl entry
40             # (to get protein domains at dna level), only Bio::DB::EMBL.pm
41             # is supported under BioPerl. Refer to Bio::LiveSeq::IO::SRS
42             # otherwise.
43             #NOTE4: NOTE3 is not implemented yet for bioperl, working on it
44              
45              
46             =head1 DESCRIPTION
47              
48             This package uses BioPerl (SeqIO) to fetch a sequence database entry,
49             analyse it and create LiveSeq objects out of it.
50              
51             A filename (or an ID that will fetch entry through the web) has to be passed
52             to this package which will return references to all translation objects
53             created from the EMBL entry. References to Transcription, DNA and Exon
54             objects can all be retrieved departing from these.
55              
56             Alternatively, a specific "gene" name can be specified, together with
57             the embl-acc ID. This will create a LiveSeq::Gene object with all
58             relevant gene features attached/created.
59              
60             ATTENTION: if web fetching is requested, the package HTTP::Request needs
61             to be installed.
62              
63              
64             =head1 AUTHOR - Joseph A.L. Insana
65              
66             Email: Insana@ebi.ac.uk, jinsana@gmx.net
67              
68             =head1 APPENDIX
69              
70             The rest of the documentation details each of the object
71             methods. Internal methods are usually preceded with a _
72              
73             =cut
74              
75             # Let the code begin...
76              
77             package Bio::LiveSeq::IO::BioPerl;
78              
79             # TODO->TOCHECK
80             # each_secondary_access not working
81             # why array from each_tag_value($qual) ? When will there be more than one
82             # element in such array?
83             # what is the annotation object? ($seqobj->annotation)
84             # unsatisfied by both BioPerl binomial and SRS "org" to retrieve Organism info
85              
86 2     2   1429 use strict;
  2         4  
  2         55  
87 2     2   8 use Carp qw(cluck croak carp);
  2         3  
  2         102  
88 2     2   10 use vars qw($DBEMBLLOADED);
  2         13  
  2         89  
89 2     2   637 use Bio::SeqIO; # for -file entry loading
  2         4  
  2         91  
90              
91             # Note, the following requires HTTP::Request. If the modules are not installed
92             # uncomment the following and use only -filename and don't request swissprotinfo
93             eval {
94             require Bio::DB::EMBL; # for -id entry loading
95             $DBEMBLLOADED = 1;
96             };
97              
98              
99 2     2   12 use base qw(Bio::LiveSeq::IO::Loader);
  2         3  
  2         635  
100              
101             # This package can in the future host other databases loading subroutines.
102             # e.g. ensembl2hash
103              
104             =head2 load
105              
106             Title : load
107             Usage : my $filename="../data/M20132";
108             $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -file=>"$filename");
109             or
110             $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -id=>"HSANDREC");
111              
112             Function: loads an entry with BioPerl from a database into a hash
113             Returns : reference to a new object of class IO::BioPerl holding an entry
114             Errorcode 0
115             Args : an filename containing an EMBL entry OR an ID or ACCESSION code
116              
117             =cut
118              
119             sub load {
120 3     3 1 24 my ($thing, %args) = @_;
121 3   33     20 my $class = ref($thing) || $thing;
122 3         6 my ($obj,%loader);
123              
124 3         11 my ($db,$filename,$id)=($args{-db},$args{-file},$args{-id});
125              
126 3 100       9 if (defined($db)) {
127 2 50       7 unless ($db eq "EMBL") {
128 0         0 carp "Note: only EMBL now supported!";
129 0         0 return(0);
130             }
131             } else {
132 1         1 $db="EMBL";
133             }
134              
135 3 50 33     13 if (defined($id) && defined($filename)) {
136 0         0 carp "You can either specify a -id or a -filename!";
137 0         0 return(0);
138             }
139              
140 3 50 33     17 unless (defined($id) || defined($filename)) {
141 0         0 carp "You must specify either a -id or a -filename!";
142 0         0 return(0);
143             }
144              
145 3         6 my $hashref;
146 3 50       9 if ($db eq "EMBL") {
147 3         6 my $test_transl=0; # change to 0 to avoid comparison of translation
148              
149             # these can be changed for future needs
150 3         9 my @embl_valid_feature_names=qw(CDS CDS_span exon prim_transcript intron repeat_unit repeat_region mRNA);
151 3         12 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
152              
153             # dunno yet how to implement test_transl again....
154             # probably on a one-on-one basis with each translation?
155 3 50       8 if ($test_transl) {
156 0         0 push (@embl_valid_qual_names,"translation"); # needed for test_transl
157             }
158              
159 3         4 my $seqobj; # bioperl sequence object, to be passed to embl2hash
160              
161 3 50       9 if (defined($filename)) {
162 3         22 my $stream = Bio::SeqIO->new('-file' => $filename, '-format' => 'EMBL');
163 3         11 $seqobj = $stream->next_seq();
164             } else { # i.e. if -id
165            
166 0 0       0 if( $DBEMBLLOADED ) {
167 0         0 my $embl = Bio::DB::EMBL->new();
168 0         0 $seqobj = $embl->get_Seq_by_id($id); # EMBL ID or ACC
169             } else {
170 0         0 my $root = Bio::Root::Root->new();
171 0         0 $root->warn("Must have HTTP::Request::Common installed, cannot run load without the -filename option specified, see docs for Bio::LiveSeq::IO::BioPerl");
172 0         0 return;
173             }
174             }
175              
176 3         15 $hashref=&embl2hash($seqobj,\@embl_valid_feature_names,\@embl_valid_qual_names);
177             }
178 3 50       14 unless ($hashref) { return (0); }
  0         0  
179              
180 3         18 %loader = (db => $db, filename => $filename, id => $id, hash => $hashref);
181 3         6 $obj = \%loader;
182 3         17 $obj = bless $obj, $class;
183 3         33 return $obj;
184             }
185              
186             =head2 embl2hash
187              
188             Title : embl2hash
189             Function: retrieves with BioPerl an EMBL entry, parses it and creates
190             a hash that contains all the information.
191             Returns : a reference to a hash
192             Errorcode: 0
193             Args : a BioPerl Sequence Object (from file or web fetching)
194             two array references to skip features and qualifiers (for
195             performance)
196             Example: @valid_features=qw(CDS exon prim_transcript mRNA);
197             @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
198             $hashref=&embl2hash($seqobj,\@valid_features,\@valid_qualifiers);
199              
200             =cut
201              
202             # arguments: Bioperl $seqobj
203             # to skip features and qualifiers (for performance), two array
204             # references must be passed (this can change into string arguments to
205             # be passed....)
206             # returns: a reference to a hash containing the important features requested
207             sub embl2hash {
208 3     3 1 5 my $seqobj=$_[0];
209 3         6 my %valid_features; my %valid_names;
210 3 50       9 if ($_[1]) {
211 3         5 %valid_features = map {$_, 1} @{$_[1]}; # to skip features
  24         46  
  3         7  
212             }
213 3 50       9 if ($_[2]) {
214 3         4 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
  24         38  
  3         6  
215             }
216              
217 3         26 my $annobj = $seqobj->annotation(); # what's this?
218              
219 3         17 my $entry_Sequence = lc($seqobj->seq()); # SRS returns lowercase
220              
221 3         13 my $entry_ID = $seqobj->display_id;
222 3         10 my $entry_AccNumber = $seqobj->accession; # or maybe accession_number ?
223 3         5 my $secondary_acc; # to fetch the other acc numbers
224 3         7 foreach $secondary_acc ($seqobj->get_secondary_accessions) { # not working!
225 1         3 $entry_AccNumber .= " $secondary_acc";
226             }
227 3         7 my $entry_Molecule = $seqobj->molecule; # this alone returns molec+division
228 3         9 my $entry_Division = $seqobj->division;
229             # fixed: now Molecule works in BioPerl, no need for next lines
230             #my @Molecule=split(" ",$entry_Molecule);
231             #my $entry_Division = pop(@Molecule); # only division
232             #$entry_Molecule = join(" ",@Molecule); # only molecule
233 3         13 my $entry_Description = $seqobj->desc;
234              
235 3         7 my $speciesobj = $seqobj->species;
236 3         12 my $entry_Organism = $speciesobj->binomial;
237              
238 3         24 my $entry_SeqLength = $seqobj->length;
239            
240             # put into the hash
241 3         6 my %entryhash;
242 3         7 $entryhash{ID}=$entry_ID;
243 3         6 $entryhash{AccNumber}=$entry_AccNumber;
244 3         5 $entryhash{Molecule}=$entry_Molecule;
245 3         12 $entryhash{Division}=$entry_Division;
246 3         7 $entryhash{Description}=$entry_Description;
247 3         6 $entryhash{Organism}=$entry_Organism;
248 3         4 $entryhash{Sequence}=$entry_Sequence;
249 3         10 $entryhash{SeqLength}=$entry_SeqLength;
250              
251 3         17 my @topfeatures=$seqobj->top_SeqFeatures();
252             # create features array
253 3         4 my $featuresnumber= scalar(@topfeatures);
254 3         9 $entryhash{FeaturesNumber}=$featuresnumber;
255 3         16 my $feature_name;
256 3         0 my @feature_qual_names; my @feature_qual_value;
257 3         0 my ($feature_qual_name,$feature_qual_number);
258 3         0 my @features;
259              
260 3         0 my ($feat,$qual,$subfeat);
261 3         0 my @subfeat;
262 3         5 my $i=0;
263 3         6 foreach $feat (@topfeatures) {
264 34         32 my %feature;
265 34         44 $feature_name = $feat->primary_tag;
266 34 100       57 unless ($valid_features{$feature_name}) {
267             #print "skipping $feature_name\n";
268 10         13 next;
269             }
270             # works ok with 0.6.2
271             # if ($feature_name eq "CDS_span") { # case of CDS with various exons 0.6.2
272             # $feature_name="CDS"; # 0.6.2
273 24         36 my $featlocation=$feat->location; # 0.7
274 24 100 100     63 if (($feature_name eq "CDS")&&($featlocation->isa('Bio::Location::SplitLocationI'))) { # case of CDS with various exons BioPerl 0.7
275             # @subfeat=$feat->sub_SeqFeature; # 0.6.2
276 2         7 @subfeat=$featlocation->sub_Location(); # 0.7
277 2         4 my @transcript;
278 2         4 foreach $subfeat (@subfeat) {
279 15         16 my @range;
280 15 50       21 if ($subfeat->strand == -1) {
281 0         0 @range=($subfeat->end,$subfeat->start,$subfeat->strand);
282             } else {
283 15         22 @range=($subfeat->start,$subfeat->end,$subfeat->strand);
284             }
285 15         29 push (@transcript,\@range);
286             }
287 2         6 $feature{range}=\@transcript;
288             } else {
289 22         20 my @range;
290 22 50       38 ($feat->strand == -1) ? (@range = ($feat->end, $feat->start, $feat->strand) ) :
291             (@range = ( $feat->start,$feat->end,$feat->strand) );
292             # works ok with 0.6.2
293 22 100       33 if ($feature_name eq "CDS") { # case of single exon CDS (CDS name but not split location)
294 1         2 my @transcript=(\@range);
295 1         3 $feature{range}=\@transcript;
296             } else { # all other range features
297 21         39 $feature{range}=\@range;
298             }
299             }
300 24         32 $feature{location}="deprecated";
301            
302 24         28 $feature{position}=$i;
303 24         29 $feature{name}=$feature_name;
304            
305 24         41 @feature_qual_names= $feat->all_tags();
306 24         32 $feature_qual_number= scalar(@feature_qual_names);
307            
308 24         26 $feature{qual_number}=$feature_qual_number;
309            
310 24         26 my %feature_qualifiers;
311 24         26 for $qual (@feature_qual_names) {
312 41         41 $feature_qual_name=$qual;
313 41 100       60 unless ($valid_names{$feature_qual_name}) {
314 6         11 next;
315             }
316 35         55 @feature_qual_value=$feat->each_tag_value($qual);
317             #print "$qual => @feature_qual_value \n";
318 35         57 $feature_qualifiers{$feature_qual_name}=$feature_qual_value[0]; # ?
319             # maybe the whole array should be entered, not just the 1st element?
320             # what could be the other elements? TOCHECK!
321             }
322 24         36 $feature{qualifiers}=\%feature_qualifiers;
323 24         40 push (@features,\%feature); # array of features
324 24         32 $i++;
325             }
326 3         9 $entryhash{Features}=\@features; # put this also into the hash
327            
328 3         6 my @cds; # array just of CDSs
329 3         11 for $i (0..$#features) {
330 24 100       38 if ($features[$i]->{'name'} eq "CDS") {
331 3         5 push(@cds,$features[$i]);
332             }
333             }
334 3         7 $entryhash{CDS}=\@cds; # put this also into the hash
335 3         30 return (\%entryhash);
336             }
337              
338             =head2 novelaasequence2gene
339              
340             Title : novelaasequence2gene
341             Usage : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
342             : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
343             -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141");
344             : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
345             -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141",
346             -translation_table => "2",
347             -gene_name => "tyr-kinase");
348              
349             Function: creates LiveSeq objects from a novel amino acid sequence,
350             using codon usage information (loaded from a file) to choose
351             codons according to relative frequencies.
352             If a codon_usage information is not specified,
353             the default is to use Homo sapiens data (taxonomy ID 9606).
354             If a translation_table ID is not specified, it will default to 1
355             (standard code).
356             Returns : reference to a Gene object containing references to LiveSeq objects
357             Errorcode 0
358             Args : string containing an amino acid sequence
359             string (optional) with codon usage data (64 integer numbers)
360             string (optional) specifying a gene_name
361             integer (optional) specifying a translation_table ID
362              
363             =cut
364              
365             sub novelaasequence2gene {
366 0     0 1   my ($self, %args) = @_;
367 0           my ($gene_name,$cusg_data,$aasequence,$ttabid)=($args{-gene_name},$args{-cusg_data},$args{-aasequence},$args{-translation_table});
368              
369 0           my @species_codon_usage;
370 0 0         unless ($aasequence) {
371 0           carp "aasequence not given";
372 0           return (0);
373             }
374 0 0         unless ($gene_name) {
375 0           $gene_name="Novel Unknown";
376             }
377 0 0         unless ($ttabid) {
378 0           $ttabid=1;
379             }
380 0 0         unless ($cusg_data) {
381 0           @species_codon_usage=
382             qw(68664 118404 126679 51100 125600 123646 75667 210903 435317
383             139009 79303 135218 128429 192616 49456 161556 211962 131222
384             162837 213626 69346 140780 182506 219428 76684 189374 173010
385             310626 82647 202329 180955 250410 180001 118798 76398 160764
386             317359 119013 262630 359627 218376 186915 130857 377006 162826
387             113684 317703 441298 287040 245435 174805 133427 134523 108740
388             225633 185619 78463 240138 174021 244236 142435 8187 5913
389             14381); # updated 21Jul2000
390             } else {
391 0           @species_codon_usage=split(/ /,$cusg_data);
392             }
393            
394 0           my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
395 0           return ($gene);
396             }
397              
398             1;