File Coverage

Bio/Perl.pm
Criterion Covered Total %
statement 77 179 43.0
branch 15 82 18.2
condition 2 14 14.2
subroutine 14 21 66.6
pod 13 13 100.0
total 121 309 39.1


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Perl
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             #
8             # Copyright Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Perl - Functional access to BioPerl for people who don't know objects
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Perl;
21              
22             # will guess file format from extension
23             $seq_object = read_sequence($filename);
24              
25             # forces genbank format
26             $seq_object = read_sequence($filename,'genbank');
27              
28             # reads an array of sequences
29             @seq_object_array = read_all_sequences($filename,'fasta');
30              
31             # sequences are Bio::Seq objects, so the following methods work
32             # for more info see Bio::Seq, or do 'perldoc Bio/Seq.pm'
33              
34             print "Sequence name is ",$seq_object->display_id,"\n";
35             print "Sequence acc is ",$seq_object->accession_number,"\n";
36             print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
37              
38             # get the whole sequence as a single string
39              
40             $sequence_as_a_string = $seq_object->seq();
41              
42             # writing sequences
43              
44             write_sequence(">$filename",'genbank',$seq_object);
45              
46             write_sequence(">$filename",'genbank',@seq_object_array);
47              
48             # making a new sequence from just a string
49              
50             $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA",
51             "myname","AL12232");
52              
53             # getting a sequence from a database (assumes internet connection)
54              
55             $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
56              
57             $seq_object = get_sequence('embl',"AI129902");
58              
59             $seq_object = get_sequence('genbank',"AI129902");
60              
61             # BLAST a sequence (assummes an internet connection)
62              
63             $blast_report = blast_sequence($seq_object);
64              
65             write_blast(">blast.out",$blast_report);
66              
67              
68             =head1 DESCRIPTION
69              
70             Easy first time access to BioPerl via functions.
71              
72             =head1 FEEDBACK
73              
74             =head2 Mailing Lists
75              
76             User feedback is an integral part of the evolution of this and other
77             Bioperl modules. Send your comments and suggestions preferably to one
78             of the Bioperl mailing lists. Your participation is much appreciated.
79              
80             bioperl-l@bioperl.org
81              
82             =head2 Support
83              
84             Please direct usage questions or support issues to the mailing list:
85              
86             I
87              
88             rather than to the module maintainer directly. Many experienced and
89             reponsive experts will be able look at the problem and quickly
90             address it. Please include a thorough description of the problem
91             with code and data examples if at all possible.
92              
93             =head2 Reporting Bugs
94              
95             Report bugs to the Bioperl bug tracking system to help us keep track
96             the bugs and their resolution. Bug reports can be submitted via the web:
97              
98             https://github.com/bioperl/bioperl-live/issues
99              
100             =head1 AUTHOR - Ewan Birney
101              
102             Email birney@ebi.ac.uk
103              
104             =head1 APPENDIX
105              
106             The rest of the documentation details each of the object methods.
107             Internal methods are usually preceded with a _
108              
109             =cut
110              
111             #'
112             # Let the code begin...
113              
114              
115             package Bio::Perl;
116 1     1   487 use vars qw(@EXPORT @EXPORT_OK $DBOKAY);
  1         1  
  1         48  
117 1     1   3 use strict;
  1         1  
  1         15  
118 1     1   3 use Carp;
  1         10  
  1         52  
119              
120 1     1   317 use Bio::SeqIO;
  1         3  
  1         30  
121 1     1   465 use Bio::Seq;
  1         2  
  1         29  
122 1     1   5 use Bio::Root::Version '$VERSION';
  1         1  
  1         9  
123             BEGIN {
124 1     1   67 eval {
125 1         288 require Bio::DB::EMBL;
126 1         325 require Bio::DB::GenBank;
127 1         300 require Bio::DB::SwissProt;
128 1         6 require Bio::DB::RefSeq;
129 1         268 require Bio::DB::GenPept;
130             };
131 1 50       4 if( $@ ) {
132 0         0 $DBOKAY = 0;
133             } else {
134 1         19 $DBOKAY = 1;
135             }
136             }
137              
138 1     1   4 use base qw(Exporter);
  1         1  
  1         1379  
139              
140             @EXPORT = qw(read_sequence read_all_sequences write_sequence
141             new_sequence get_sequence translate translate_as_string
142             reverse_complement revcom revcom_as_string
143             reverse_complement_as_string blast_sequence write_blast);
144              
145             @EXPORT_OK = @EXPORT;
146              
147              
148             =head2 read_sequence
149              
150             Title : read_sequence
151             Usage : $seq = read_sequence('sequences.fa')
152             $seq = read_sequence($filename,'genbank');
153              
154             # pipes are fine
155             $seq = read_sequence("my_fetching_program $id |",'fasta');
156              
157             Function: Reads the top sequence from the file. If no format is given, it will
158             try to guess the format from the filename. If a format is given, it
159             forces that format. The filename can be any valid perl open() string
160             - in particular, you can put in pipes
161              
162             Returns : A Bio::Seq object. A quick synopsis:
163             $seq_object->display_id - name of the sequence
164             $seq_object->seq - sequence as a string
165              
166             Args : Two strings, first the filename - any Perl open() string is ok
167             Second string is the format, which is optional
168              
169             For more information on Seq objects see L.
170              
171             =cut
172              
173             sub read_sequence{
174 2     2 1 10 my ($filename,$format) = @_;
175              
176 2 50       6 if( !defined $filename ) {
177 0         0 confess "read_sequence($filename) - usage incorrect";
178             }
179              
180 2         2 my $seqio;
181              
182 2 100       5 if( defined $format ) {
183 1         5 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
184             } else {
185 1         9 $seqio = Bio::SeqIO->new( '-file' => $filename);
186             }
187              
188 2         7 my $seq = $seqio->next_seq();
189              
190 2         18 return $seq;
191             }
192              
193              
194             =head2 read_all_sequences
195              
196             Title : read_all_sequences
197             Usage : @seq_object_array = read_all_sequences($filename);
198             @seq_object_array = read_all_sequences($filename,'genbank');
199              
200             Function: Just as the function above, but reads all the sequences in the
201             file and loads them into an array.
202              
203             For very large files, you will run out of memory. When this
204             happens, you've got to use the SeqIO system directly (this is
205             not so hard! Don't worry about it!).
206              
207             Returns : array of Bio::Seq objects
208              
209             Args : two strings, first the filename (any open() string is ok)
210             second the format (which is optional)
211              
212             See L and L for more information
213              
214             =cut
215              
216             sub read_all_sequences{
217 1     1 1 6 my ($filename,$format) = @_;
218              
219 1 50       4 if( !defined $filename ) {
220 0         0 confess "read_all_sequences($filename) - usage incorrect";
221             }
222              
223 1         1 my $seqio;
224              
225 1 50       3 if( defined $format ) {
226 1         6 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
227             } else {
228 0         0 $seqio = Bio::SeqIO->new( '-file' => $filename);
229             }
230              
231 1         3 my @seq_array;
232              
233 1         4 while( my $seq = $seqio->next_seq() ) {
234 2         8 push(@seq_array,$seq);
235             }
236              
237 1         5 return @seq_array;
238             }
239              
240              
241             =head2 write_sequence
242              
243             Title : write_sequence
244             Usage : write_sequence(">new_file.gb",'genbank',$seq)
245             write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects)
246              
247             Function: writes sequences in the specified format
248              
249             Returns : true
250              
251             Args : filename as a string, must provide an open() output file
252             format as a string
253             one or more sequence objects
254              
255              
256             =cut
257              
258             sub write_sequence{
259 1     1 1 11 my ($filename,$format,@sequence_objects) = @_;
260              
261 1 50       5 if( scalar(@sequence_objects) == 0 ) {
262 0         0 confess("write_sequence(filename,format,sequence_object)");
263             }
264              
265 1         1 my $error = 0;
266 1         2 my $seqname = "sequence1";
267              
268             # catch users who haven't passed us a filename we can open
269 1 0 33     5 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
270 0         0 $filename = ">".$filename;
271             }
272              
273 1         7 my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format);
274              
275 1         4 foreach my $seq ( @sequence_objects ) {
276 1         1 my $seq_obj;
277              
278 1 50       6 if( !ref $seq ) {
279 0 0       0 if( length $seq > 50 ) {
280             # odds are this is a sequence as a string, and someone has not figured out
281             # how to make objects. Warn him/her and then make a sequence object
282             # from this
283 0 0       0 if( $error == 0 ) {
284 0         0 carp("WARNING: You have put in a long string into write_sequence.\n".
285             "I suspect this means that this is the actual sequence\n".
286             "In the future try the\n".
287             " new_sequence method of this module to make a new sequence object.\n".
288             "Doing this for you here\n");
289 0         0 $error = 1;
290             }
291              
292 0         0 $seq_obj = new_sequence($seq,$seqname);
293 0         0 $seqname++;
294             } else {
295 0         0 confess("You have a non object [$seq] passed to write_sequence. It maybe that you".
296             "want to use new_sequence to make this string into a sequence object?");
297             }
298             } else {
299 1 50       4 if( !$seq->isa("Bio::SeqI") ) {
300 0         0 confess("object [$seq] is not a Bio::Seq object; can't write it out");
301             }
302 1         2 $seq_obj = $seq;
303             }
304              
305             # finally... we get to write out the sequence!
306 1         5 $seqio->write_seq($seq_obj);
307             }
308 1         6 1;
309             }
310              
311             =head2 new_sequence
312              
313             Title : new_sequence
314             Usage : $seq_obj = new_sequence("GATTACA", "kino-enzyme");
315              
316             Function: Construct a sequency object from sequence string
317             Returns : A Bio::Seq object
318              
319             Args : sequence string
320             name string (optional, default "no-name-for-sequence")
321             accession - accession number (optional, no default)
322              
323             =cut
324              
325             sub new_sequence{
326 1     1 1 2 my ($seq,$name,$accession) = @_;
327              
328 1 50       4 if( !defined $seq ) {
329 0         0 confess("new_sequence(sequence_as_string) usage");
330             }
331              
332 1   50     2 $name ||= "no-name-for-sequence";
333              
334 1         9 my $seq_object = Bio::Seq->new( -seq => $seq, -id => $name);
335              
336 1 50       7 $accession && $seq_object->accession_number($accession);
337              
338 1         5 return $seq_object;
339             }
340              
341             =head2 blast_sequence
342              
343             Title : blast_sequence
344             Usage : $blast_result = blast_sequence($seq)
345             $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE');
346              
347             Function: If the computer has Internet accessibility, blasts
348             the sequence using the NCBI BLAST server against nrdb.
349              
350             It chooses the flavour of BLAST on the basis of the sequence.
351              
352             This function uses Bio::Tools::Run::RemoteBlast, which itself
353             use Bio::SearchIO - as soon as you want to know more, check out
354             these modules
355             Returns : Bio::Search::Result::GenericResult.pm
356              
357             Args : Either a string of protein letters or nucleotides, or a
358             Bio::Seq object
359              
360             =cut
361              
362             sub blast_sequence {
363 0     0 1 0 my ($seq,$verbose) = @_;
364              
365 0 0       0 if( !defined $verbose ) {
366 0         0 $verbose = 1;
367             }
368              
369 0 0       0 if( !ref $seq ) {
    0          
370 0         0 $seq = Bio::Seq->new( -seq => $seq, -id => 'blast-sequence-temp-id');
371             } elsif ( !$seq->isa('Bio::PrimarySeqI') ) {
372 0         0 croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted");
373             }
374              
375 0         0 require Bio::Tools::Run::RemoteBlast;
376              
377 0 0       0 my $prog = ( $seq->alphabet eq 'protein' ) ? 'blastp' : 'blastn';
378 0         0 my $e_val= '1e-10';
379              
380 0         0 my @params = ( '-prog' => $prog,
381             '-expect' => $e_val,
382             '-readmethod' => 'SearchIO' );
383              
384 0         0 my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
385              
386 0         0 my $r = $factory->submit_blast($seq);
387 0 0       0 if( $verbose ) {
388 0         0 print STDERR "Submitted Blast for [".$seq->id."] ";
389             }
390 0         0 sleep 5;
391              
392 0         0 my $result;
393              
394             LOOP :
395 0         0 while( my @rids = $factory->each_rid) {
396 0         0 foreach my $rid ( @rids ) {
397 0         0 my $rc = $factory->retrieve_blast($rid);
398 0 0       0 if( !ref($rc) ) {
399 0 0       0 if( $rc < 0 ) {
400 0         0 $factory->remove_rid($rid);
401             }
402 0 0       0 if( $verbose ) {
403 0         0 print STDERR ".";
404             }
405 0         0 sleep 10;
406             } else {
407 0         0 $result = $rc->next_result();
408 0         0 $factory->remove_rid($rid);
409 0         0 last LOOP;
410             }
411             }
412             }
413              
414 0 0       0 if( $verbose ) {
415 0         0 print STDERR "\n";
416             }
417 0         0 return $result;
418             }
419              
420             =head2 write_blast
421              
422             Title : write_blast
423             Usage : write_blast($filename,$blast_report);
424              
425             Function: Writes a BLAST result object (or more formally
426             a SearchIO result object) out to a filename
427             in BLAST-like format
428              
429             Returns : none
430              
431             Args : filename as a string
432             Bio::SearchIO::Results object
433              
434             =cut
435              
436             sub write_blast {
437 0     0 1 0 my ($filename,$blast) = @_;
438              
439 0 0 0     0 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
440 0         0 $filename = ">".$filename;
441             }
442              
443 0         0 my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename);
444              
445 0         0 $output->write_result($blast);
446              
447             }
448              
449             =head2 get_sequence
450              
451             Title : get_sequence
452             Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN");
453              
454             Function: If the computer has Internet access this method gets
455             the sequence from Internet accessible databases. Currently
456             this supports Swissprot ('swiss'), EMBL ('embl'), GenBank
457             ('genbank'), GenPept ('genpept'), and RefSeq ('refseq').
458              
459             Swissprot and EMBL are more robust than GenBank fetching.
460              
461             If the user is trying to retrieve a RefSeq entry from
462             GenBank/EMBL, the query is silently redirected.
463              
464             Returns : A Bio::Seq object
465              
466             Args : database type - one of swiss, embl, genbank, genpept, or
467             refseq
468              
469             =cut
470              
471             my $genbank_db = undef;
472             my $genpept_db = undef;
473             my $embl_db = undef;
474             my $swiss_db = undef;
475             my $refseq_db = undef;
476              
477             sub get_sequence{
478 0     0 1 0 my ($db_type,$identifier) = @_;
479 0 0       0 if( ! $DBOKAY ) {
480 0         0 confess ("Your system does not have one of LWP, HTTP::Request::Common, IO::String\n".
481             "installed so the DB retrieval method is not available.\n".
482             "Full error message is:\n $!\n");
483 0         0 return;
484             }
485 0         0 $db_type = lc($db_type);
486              
487 0         0 my $db;
488              
489 0 0       0 if( $db_type =~ /genbank/ ) {
490 0 0       0 if( !defined $genbank_db ) {
491 0         0 $genbank_db = Bio::DB::GenBank->new();
492             }
493 0         0 $db = $genbank_db;
494             }
495 0 0       0 if( $db_type =~ /genpept/ ) {
496 0 0       0 if( !defined $genpept_db ) {
497 0         0 $genpept_db = Bio::DB::GenPept->new();
498             }
499 0         0 $db = $genpept_db;
500             }
501              
502 0 0       0 if( $db_type =~ /swiss/ ) {
503 0 0       0 if( !defined $swiss_db ) {
504 0         0 $swiss_db = Bio::DB::SwissProt->new();
505             }
506 0         0 $db = $swiss_db;
507             }
508              
509 0 0       0 if( $db_type =~ /embl/ ) {
510 0 0       0 if( !defined $embl_db ) {
511 0         0 $embl_db = Bio::DB::EMBL->new();
512             }
513 0         0 $db = $embl_db;
514             }
515              
516 0 0 0     0 if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and
      0        
517             $identifier =~ /^\s*N\S+_/)) {
518 0 0       0 if( !defined $refseq_db ) {
519 0         0 $refseq_db = Bio::DB::RefSeq->new();
520             }
521 0         0 $db = $refseq_db;
522             }
523              
524 0         0 my $seq;
525              
526 0 0       0 if( $identifier =~ /^\w+\d+$/ ) {
527 0         0 $seq = $db->get_Seq_by_acc($identifier);
528             } else {
529 0         0 $seq = $db->get_Seq_by_id($identifier);
530             }
531              
532 0         0 return $seq;
533             }
534              
535              
536             =head2 translate
537              
538             Title : translate
539             Usage : $seqobj = translate($seq_or_string_scalar)
540              
541             Function: translates a DNA sequence object OR just a plain
542             string of DNA to amino acids
543             Returns : A Bio::Seq object
544              
545             Args : Either a sequence object or a string of
546             just DNA sequence characters
547              
548             =cut
549              
550             sub translate {
551 4     4 1 6 my ($scalar) = shift;
552              
553 4         4 my $obj;
554 4 100       8 if( ref $scalar ) {
555 2 50       10 if( !$scalar->isa("Bio::PrimarySeqI") ) {
556 0         0 confess("Expecting a sequence object not a $scalar");
557             } else {
558 2         3 $obj= $scalar;
559             }
560             } else {
561             # check this looks vaguely like DNA
562 2         3 my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
563 2 50       8 if( $n < length($scalar) * 0.85 ) {
564 0         0 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
565             }
566 2         10 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
567             }
568 4         27 return $obj->translate();
569             }
570              
571              
572             =head2 translate_as_string
573              
574             Title : translate_as_string
575             Usage : $seqstring = translate_as_string($seq_or_string_scalar)
576              
577             Function: translates a DNA sequence object OR just a plain
578             string of DNA to amino acids
579             Returns : A string of just amino acids
580              
581             Args : Either a sequence object or a string of
582             just DNA sequence characters
583              
584             =cut
585              
586             sub translate_as_string {
587 2     2 1 4 my ($scalar) = shift;
588 2         4 my $obj = Bio::Perl::translate($scalar);
589 2         6 return $obj->seq;
590             }
591              
592              
593             =head2 reverse_complement
594              
595             Title : reverse_complement
596             Usage : $seqobj = reverse_complement($seq_or_string_scalar)
597              
598             Function: reverse complements a string or sequence argument
599             producing a Bio::Seq - if you want a string, you
600             can use reverse_complement_as_string
601             Returns : A Bio::Seq object
602              
603             Args : Either a sequence object or a string of
604             just DNA sequence characters
605              
606             =cut
607              
608             sub reverse_complement {
609 0     0 1   my ($scalar) = shift;
610              
611 0           my $obj;
612              
613 0 0         if( ref $scalar ) {
614 0 0         if( !$scalar->isa("Bio::PrimarySeqI") ) {
615 0           confess("Expecting a sequence object not a $scalar");
616             } else {
617 0           $obj= $scalar;
618             }
619              
620             } else {
621              
622             # check this looks vaguely like DNA
623 0           my $n = ( $scalar =~ tr/ATGCNatgcn/ATGCNatgcn/ );
624              
625 0 0         if( $n < length($scalar) * 0.85 ) {
626 0           confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
627             }
628              
629 0           $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
630             }
631              
632 0           return $obj->revcom();
633             }
634              
635             =head2 revcom
636              
637             Title : revcom
638             Usage : $seqobj = revcom($seq_or_string_scalar)
639              
640             Function: reverse complements a string or sequence argument
641             producing a Bio::Seq - if you want a string, you
642             can use reverse_complement_as_string
643              
644             This is an alias for reverse_complement
645             Returns : A Bio::Seq object
646              
647             Args : Either a sequence object or a string of
648             just DNA sequence characters
649              
650             =cut
651              
652             sub revcom {
653 0     0 1   return &Bio::Perl::reverse_complement(@_);
654             }
655              
656              
657             =head2 reverse_complement_as_string
658              
659             Title : reverse_complement_as_string
660             Usage : $string = reverse_complement_as_string($seq_or_string_scalar)
661              
662             Function: reverse complements a string or sequence argument
663             producing a string
664             Returns : A string of DNA letters
665              
666             Args : Either a sequence object or a string of
667             just DNA sequence characters
668              
669             =cut
670              
671             sub reverse_complement_as_string {
672 0     0 1   my ($scalar) = shift;
673 0           my $obj = &Bio::Perl::reverse_complement($scalar);
674 0           return $obj->seq;
675             }
676              
677              
678             =head2 revcom_as_string
679              
680             Title : revcom_as_string
681             Usage : $string = revcom_as_string($seq_or_string_scalar)
682              
683             Function: reverse complements a string or sequence argument
684             producing a string
685             Returns : A string of DNA letters
686              
687             Args : Either a sequence object or a string of
688             just DNA sequence characters
689              
690             =cut
691              
692             sub revcom_as_string {
693 0     0 1   my ($scalar) = shift;
694 0           my $obj = &Bio::Perl::reverse_complement($scalar);
695 0           return $obj->seq;
696             }
697              
698              
699             1;