File Coverage

Bio/LiveSeq/IO/BioPerl.pm

Criterion	Covered	Total	%
statement	126	156	80.7
branch	23	44	52.2
condition	6	12	50.0
subroutine	7	8	87.5
pod	3	3	100.0
total	165	223	73.9

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# bioperl module for Bio::LiveSeq::IO::BioPerl
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Joseph Insana
7							#
8							# Copyright Joseph Insana
9							#
10							# You may distribute this module under the same terms as perl itself
11							#
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::LiveSeq::IO::BioPerl - Loader for LiveSeq from EMBL entries with BioPerl
17
18							=head1 SYNOPSIS
19
20							my $db="EMBL";
21							my $file="../data/M20132";
22							my $id="HSANDREC";
23
24							my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -file=>"$file");
25							# or
26							my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -id=>"$id");
27
28							my @translationobjects=$loader->entry2liveseq();
29
30							my $genename="AR";
31							my $gene=$loader->gene2liveseq(-gene_name => "$genename",
32							-getswissprotinfo => 0);
33
34							#NOTE1: The only -db now supported is EMBL. Hence it defaults to EMBL.
35							#NOTE2: -file requires a filename (and path if necessary) containing an
36							# EMBL entry
37							# -id will use Bio::DB::EMBL.pm to fetch the sequence from the web,
38							# (bioperl wraparound to [w]getz from SRS)
39							#NOTE3: To retrieve the swissprot (if possible) attached to the embl entry
40							# (to get protein domains at dna level), only Bio::DB::EMBL.pm
41							# is supported under BioPerl. Refer to Bio::LiveSeq::IO::SRS
42							# otherwise.
43							#NOTE4: NOTE3 is not implemented yet for bioperl, working on it
44
45
46							=head1 DESCRIPTION
47
48							This package uses BioPerl (SeqIO) to fetch a sequence database entry,
49							analyse it and create LiveSeq objects out of it.
50
51							A filename (or an ID that will fetch entry through the web) has to be passed
52							to this package which will return references to all translation objects
53							created from the EMBL entry. References to Transcription, DNA and Exon
54							objects can all be retrieved departing from these.
55
56							Alternatively, a specific "gene" name can be specified, together with
57							the embl-acc ID. This will create a LiveSeq::Gene object with all
58							relevant gene features attached/created.
59
60							ATTENTION: if web fetching is requested, the package HTTP::Request needs
61							to be installed.
62
63
64							=head1 AUTHOR - Joseph A.L. Insana
65
66							Email: Insana@ebi.ac.uk, jinsana@gmx.net
67
68							=head1 APPENDIX
69
70							The rest of the documentation details each of the object
71							methods. Internal methods are usually preceded with a _
72
73							=cut
74
75							# Let the code begin...
76
77							package Bio::LiveSeq::IO::BioPerl;
78
79							# TODO->TOCHECK
80							# each_secondary_access not working
81							# why array from each_tag_value($qual) ? When will there be more than one
82							# element in such array?
83							# what is the annotation object? ($seqobj->annotation)
84							# unsatisfied by both BioPerl binomial and SRS "org" to retrieve Organism info
85
86	2			2		1429	use strict;
	2					4
	2					55
87	2			2		8	use Carp qw(cluck croak carp);
	2					3
	2					102
88	2			2		10	use vars qw($DBEMBLLOADED);
	2					13
	2					89
89	2			2		637	use Bio::SeqIO; # for -file entry loading
	2					4
	2					91
90
91							# Note, the following requires HTTP::Request. If the modules are not installed
92							# uncomment the following and use only -filename and don't request swissprotinfo
93							eval {
94							require Bio::DB::EMBL; # for -id entry loading
95							$DBEMBLLOADED = 1;
96							};
97
98
99	2			2		12	use base qw(Bio::LiveSeq::IO::Loader);
	2					3
	2					635
100
101							# This package can in the future host other databases loading subroutines.
102							# e.g. ensembl2hash
103
104							=head2 load
105
106							Title : load
107							Usage : my $filename="../data/M20132";
108							$loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -file=>"$filename");
109							or
110							$loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -id=>"HSANDREC");
111
112							Function: loads an entry with BioPerl from a database into a hash
113							Returns : reference to a new object of class IO::BioPerl holding an entry
114							Errorcode 0
115							Args : an filename containing an EMBL entry OR an ID or ACCESSION code
116
117							=cut
118
119							sub load {
120	3			3	1	24	my ($thing, %args) = @_;
121	3		33			20	my $class = ref($thing) \|\| $thing;
122	3					6	my ($obj,%loader);
123
124	3					11	my ($db,$filename,$id)=($args{-db},$args{-file},$args{-id});
125
126	3	100				9	if (defined($db)) {
127	2	50				7	unless ($db eq "EMBL") {
128	0					0	carp "Note: only EMBL now supported!";
129	0					0	return(0);
130							}
131							} else {
132	1					1	$db="EMBL";
133							}
134
135	3	50	33			13	if (defined($id) && defined($filename)) {
136	0					0	carp "You can either specify a -id or a -filename!";
137	0					0	return(0);
138							}
139
140	3	50	33			17	unless (defined($id) \|\| defined($filename)) {
141	0					0	carp "You must specify either a -id or a -filename!";
142	0					0	return(0);
143							}
144
145	3					6	my $hashref;
146	3	50				9	if ($db eq "EMBL") {
147	3					6	my $test_transl=0; # change to 0 to avoid comparison of translation
148
149							# these can be changed for future needs
150	3					9	my @embl_valid_feature_names=qw(CDS CDS_span exon prim_transcript intron repeat_unit repeat_region mRNA);
151	3					12	my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
152
153							# dunno yet how to implement test_transl again....
154							# probably on a one-on-one basis with each translation?
155	3	50				8	if ($test_transl) {
156	0					0	push (@embl_valid_qual_names,"translation"); # needed for test_transl
157							}
158
159	3					4	my $seqobj; # bioperl sequence object, to be passed to embl2hash
160
161	3	50				9	if (defined($filename)) {
162	3					22	my $stream = Bio::SeqIO->new('-file' => $filename, '-format' => 'EMBL');
163	3					11	$seqobj = $stream->next_seq();
164							} else { # i.e. if -id
165
166	0	0				0	if( $DBEMBLLOADED ) {
167	0					0	my $embl = Bio::DB::EMBL->new();
168	0					0	$seqobj = $embl->get_Seq_by_id($id); # EMBL ID or ACC
169							} else {
170	0					0	my $root = Bio::Root::Root->new();
171	0					0	$root->warn("Must have HTTP::Request::Common installed, cannot run load without the -filename option specified, see docs for Bio::LiveSeq::IO::BioPerl");
172	0					0	return;
173							}
174							}
175
176	3					15	$hashref=&embl2hash($seqobj,\@embl_valid_feature_names,\@embl_valid_qual_names);
177							}
178	3	50				14	unless ($hashref) { return (0); }
	0					0
179
180	3					18	%loader = (db => $db, filename => $filename, id => $id, hash => $hashref);
181	3					6	$obj = \%loader;
182	3					17	$obj = bless $obj, $class;
183	3					33	return $obj;
184							}
185
186							=head2 embl2hash
187
188							Title : embl2hash
189							Function: retrieves with BioPerl an EMBL entry, parses it and creates
190							a hash that contains all the information.
191							Returns : a reference to a hash
192							Errorcode: 0
193							Args : a BioPerl Sequence Object (from file or web fetching)
194							two array references to skip features and qualifiers (for
195							performance)
196							Example: @valid_features=qw(CDS exon prim_transcript mRNA);
197							@valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
198							$hashref=&embl2hash($seqobj,\@valid_features,\@valid_qualifiers);
199
200							=cut
201
202							# arguments: Bioperl $seqobj
203							# to skip features and qualifiers (for performance), two array
204							# references must be passed (this can change into string arguments to
205							# be passed....)
206							# returns: a reference to a hash containing the important features requested
207							sub embl2hash {
208	3			3	1	5	my $seqobj=$_[0];
209	3					6	my %valid_features; my %valid_names;
210	3	50				9	if ($_[1]) {
211	3					5	%valid_features = map {$_, 1} @{$_[1]}; # to skip features
	24					46
	3					7
212							}
213	3	50				9	if ($_[2]) {
214	3					4	%valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
	24					38
	3					6
215							}
216
217	3					26	my $annobj = $seqobj->annotation(); # what's this?
218
219	3					17	my $entry_Sequence = lc($seqobj->seq()); # SRS returns lowercase
220
221	3					13	my $entry_ID = $seqobj->display_id;
222	3					10	my $entry_AccNumber = $seqobj->accession; # or maybe accession_number ?
223	3					5	my $secondary_acc; # to fetch the other acc numbers
224	3					7	foreach $secondary_acc ($seqobj->get_secondary_accessions) { # not working!
225	1					3	$entry_AccNumber .= " $secondary_acc";
226							}
227	3					7	my $entry_Molecule = $seqobj->molecule; # this alone returns molec+division
228	3					9	my $entry_Division = $seqobj->division;
229							# fixed: now Molecule works in BioPerl, no need for next lines
230							#my @Molecule=split(" ",$entry_Molecule);
231							#my $entry_Division = pop(@Molecule); # only division
232							#$entry_Molecule = join(" ",@Molecule); # only molecule
233	3					13	my $entry_Description = $seqobj->desc;
234
235	3					7	my $speciesobj = $seqobj->species;
236	3					12	my $entry_Organism = $speciesobj->binomial;
237
238	3					24	my $entry_SeqLength = $seqobj->length;
239
240							# put into the hash
241	3					6	my %entryhash;
242	3					7	$entryhash{ID}=$entry_ID;
243	3					6	$entryhash{AccNumber}=$entry_AccNumber;
244	3					5	$entryhash{Molecule}=$entry_Molecule;
245	3					12	$entryhash{Division}=$entry_Division;
246	3					7	$entryhash{Description}=$entry_Description;
247	3					6	$entryhash{Organism}=$entry_Organism;
248	3					4	$entryhash{Sequence}=$entry_Sequence;
249	3					10	$entryhash{SeqLength}=$entry_SeqLength;
250
251	3					17	my @topfeatures=$seqobj->top_SeqFeatures();
252							# create features array
253	3					4	my $featuresnumber= scalar(@topfeatures);
254	3					9	$entryhash{FeaturesNumber}=$featuresnumber;
255	3					16	my $feature_name;
256	3					0	my @feature_qual_names; my @feature_qual_value;
257	3					0	my ($feature_qual_name,$feature_qual_number);
258	3					0	my @features;
259
260	3					0	my ($feat,$qual,$subfeat);
261	3					0	my @subfeat;
262	3					5	my $i=0;
263	3					6	foreach $feat (@topfeatures) {
264	34					32	my %feature;
265	34					44	$feature_name = $feat->primary_tag;
266	34	100				57	unless ($valid_features{$feature_name}) {
267							#print "skipping $feature_name\n";
268	10					13	next;
269							}
270							# works ok with 0.6.2
271							# if ($feature_name eq "CDS_span") { # case of CDS with various exons 0.6.2
272							# $feature_name="CDS"; # 0.6.2
273	24					36	my $featlocation=$feat->location; # 0.7
274	24	100	100			63	if (($feature_name eq "CDS")&&($featlocation->isa('Bio::Location::SplitLocationI'))) { # case of CDS with various exons BioPerl 0.7
275							# @subfeat=$feat->sub_SeqFeature; # 0.6.2
276	2					7	@subfeat=$featlocation->sub_Location(); # 0.7
277	2					4	my @transcript;
278	2					4	foreach $subfeat (@subfeat) {
279	15					16	my @range;
280	15	50				21	if ($subfeat->strand == -1) {
281	0					0	@range=($subfeat->end,$subfeat->start,$subfeat->strand);
282							} else {
283	15					22	@range=($subfeat->start,$subfeat->end,$subfeat->strand);
284							}
285	15					29	push (@transcript,\@range);
286							}
287	2					6	$feature{range}=\@transcript;
288							} else {
289	22					20	my @range;
290	22	50				38	($feat->strand == -1) ? (@range = ($feat->end, $feat->start, $feat->strand) ) :
291							(@range = ( $feat->start,$feat->end,$feat->strand) );
292							# works ok with 0.6.2
293	22	100				33	if ($feature_name eq "CDS") { # case of single exon CDS (CDS name but not split location)
294	1					2	my @transcript=(\@range);
295	1					3	$feature{range}=\@transcript;
296							} else { # all other range features
297	21					39	$feature{range}=\@range;
298							}
299							}
300	24					32	$feature{location}="deprecated";
301
302	24					28	$feature{position}=$i;
303	24					29	$feature{name}=$feature_name;
304
305	24					41	@feature_qual_names= $feat->all_tags();
306	24					32	$feature_qual_number= scalar(@feature_qual_names);
307
308	24					26	$feature{qual_number}=$feature_qual_number;
309
310	24					26	my %feature_qualifiers;
311	24					26	for $qual (@feature_qual_names) {
312	41					41	$feature_qual_name=$qual;
313	41	100				60	unless ($valid_names{$feature_qual_name}) {
314	6					11	next;
315							}
316	35					55	@feature_qual_value=$feat->each_tag_value($qual);
317							#print "$qual => @feature_qual_value \n";
318	35					57	$feature_qualifiers{$feature_qual_name}=$feature_qual_value[0]; # ?
319							# maybe the whole array should be entered, not just the 1st element?
320							# what could be the other elements? TOCHECK!
321							}
322	24					36	$feature{qualifiers}=\%feature_qualifiers;
323	24					40	push (@features,\%feature); # array of features
324	24					32	$i++;
325							}
326	3					9	$entryhash{Features}=\@features; # put this also into the hash
327
328	3					6	my @cds; # array just of CDSs
329	3					11	for $i (0..$#features) {
330	24	100				38	if ($features[$i]->{'name'} eq "CDS") {
331	3					5	push(@cds,$features[$i]);
332							}
333							}
334	3					7	$entryhash{CDS}=\@cds; # put this also into the hash
335	3					30	return (\%entryhash);
336							}
337
338							=head2 novelaasequence2gene
339
340							Title : novelaasequence2gene
341							Usage : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
342							: $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
343							-cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141");
344							: $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
345							-cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141",
346							-translation_table => "2",
347							-gene_name => "tyr-kinase");
348
349							Function: creates LiveSeq objects from a novel amino acid sequence,
350							using codon usage information (loaded from a file) to choose
351							codons according to relative frequencies.
352							If a codon_usage information is not specified,
353							the default is to use Homo sapiens data (taxonomy ID 9606).
354							If a translation_table ID is not specified, it will default to 1
355							(standard code).
356							Returns : reference to a Gene object containing references to LiveSeq objects
357							Errorcode 0
358							Args : string containing an amino acid sequence
359							string (optional) with codon usage data (64 integer numbers)
360							string (optional) specifying a gene_name
361							integer (optional) specifying a translation_table ID
362
363							=cut
364
365							sub novelaasequence2gene {
366	0			0	1		my ($self, %args) = @_;
367	0						my ($gene_name,$cusg_data,$aasequence,$ttabid)=($args{-gene_name},$args{-cusg_data},$args{-aasequence},$args{-translation_table});
368
369	0						my @species_codon_usage;
370	0	0					unless ($aasequence) {
371	0						carp "aasequence not given";
372	0						return (0);
373							}
374	0	0					unless ($gene_name) {
375	0						$gene_name="Novel Unknown";
376							}
377	0	0					unless ($ttabid) {
378	0						$ttabid=1;
379							}
380	0	0					unless ($cusg_data) {
381	0						@species_codon_usage=
382							qw(68664 118404 126679 51100 125600 123646 75667 210903 435317
383							139009 79303 135218 128429 192616 49456 161556 211962 131222
384							162837 213626 69346 140780 182506 219428 76684 189374 173010
385							310626 82647 202329 180955 250410 180001 118798 76398 160764
386							317359 119013 262630 359627 218376 186915 130857 377006 162826
387							113684 317703 441298 287040 245435 174805 133427 134523 108740
388							225633 185619 78463 240138 174021 244236 142435 8187 5913
389							14381); # updated 21Jul2000
390							} else {
391	0						@species_codon_usage=split(/ /,$cusg_data);
392							}
393
394	0						my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
395	0						return ($gene);
396							}
397
398							1;