File Coverage

Bio/PrimarySeqI.pm
Criterion Covered Total %
statement 146 202 72.2
branch 70 102 68.6
condition 48 65 73.8
subroutine 16 28 57.1
pod 17 17 100.0
total 297 414 71.7


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::PrimarySeqI
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             #
8             # Copyright Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14              
15             =head1 NAME
16              
17             Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
18              
19             =head1 SYNOPSIS
20              
21             # Bio::PrimarySeqI is the interface class for sequences.
22             # If you are a newcomer to bioperl, you might want to start with
23             # Bio::Seq documentation.
24              
25             # Test if this is a seq object
26             $obj->isa("Bio::PrimarySeqI") ||
27             $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
28              
29             # Accessors
30             $string = $obj->seq();
31             $substring = $obj->subseq(12,50);
32             $display = $obj->display_id(); # for human display
33             $id = $obj->primary_id(); # unique id for this object,
34             # implementation defined
35             $unique_key= $obj->accession_number(); # unique biological id
36              
37              
38             # Object manipulation
39             eval {
40             $rev = $obj->revcom();
41             };
42             if( $@ ) {
43             $obj->throw( "Could not reverse complement. ".
44             "Probably not DNA. Actual exception\n$@\n" );
45             }
46              
47             $trunc = $obj->trunc(12,50);
48             # $rev and $trunc are Bio::PrimarySeqI compliant objects
49              
50              
51             =head1 DESCRIPTION
52              
53             This object defines an abstract interface to basic sequence
54             information - for most users of the package the documentation (and
55             methods) in this class are not useful - this is a developers-only
56             class which defines what methods have to be implmented by other Perl
57             objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
58             Bio::Seq" or "man Bio::Seq" for more information on the main class for
59             sequences.
60              
61             PrimarySeq is an object just for the sequence and its name(s), nothing
62             more. Seq is the larger object complete with features. There is a pure
63             perl implementation of this in L. If you just want to
64             use L objects, then please read that module first. This
65             module defines the interface, and is of more interest to people who
66             want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
67             they "are" bioperl sequence objects, even though it is not using Perl
68             to store the sequence etc.
69              
70             This interface defines what bioperl considers necessary to "be" a
71             sequence, without providing an implementation of this, an
72             implementation is provided in L. If you want to provide
73             a Bio::PrimarySeq-compliant object which in fact wraps another
74             object/database/out-of-perl experience, then this is the correct thing
75             to wrap, generally by providing a wrapper class which would inherit
76             from your object and this Bio::PrimarySeqI interface. The wrapper class
77             then would have methods lists in the "Implementation Specific
78             Functions" which would provide these methods for your object.
79              
80             =head1 FEEDBACK
81              
82             =head2 Mailing Lists
83              
84             User feedback is an integral part of the evolution of this and other
85             Bioperl modules. Send your comments and suggestions preferably to one
86             of the Bioperl mailing lists. Your participation is much appreciated.
87              
88             bioperl-l@bioperl.org - General discussion
89             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
90              
91             =head2 Support
92              
93             Please direct usage questions or support issues to the mailing list:
94              
95             I
96              
97             rather than to the module maintainer directly. Many experienced and
98             reponsive experts will be able look at the problem and quickly
99             address it. Please include a thorough description of the problem
100             with code and data examples if at all possible.
101              
102             =head2 Reporting Bugs
103              
104             Report bugs to the Bioperl bug tracking system to help us keep track
105             the bugs and their resolution. Bug reports can be submitted via the
106             web:
107              
108             https://github.com/bioperl/bioperl-live/issues
109              
110             =head1 AUTHOR - Ewan Birney
111              
112             Email birney@ebi.ac.uk
113              
114             =head1 APPENDIX
115              
116             The rest of the documentation details each of the object
117             methods. Internal methods are usually preceded with a _
118              
119             =cut
120              
121              
122             package Bio::PrimarySeqI;
123 203     203   963 use strict;
  203         289  
  203         4800  
124 203     203   69448 use Bio::Tools::CodonTable;
  203         365  
  203         4923  
125              
126 203     203   976 use base qw(Bio::Root::RootI);
  203         231  
  203         382117  
127              
128              
129             =head1 Implementation-specific Functions
130              
131             These functions are the ones that a specific implementation must
132             define.
133              
134             =head2 seq
135              
136             Title : seq
137             Usage : $string = $obj->seq()
138             Function: Returns the sequence as a string of letters. The
139             case of the letters is left up to the implementer.
140             Suggested cases are upper case for proteins and lower case for
141             DNA sequence (IUPAC standard), but implementations are suggested to
142             keep an open mind about case (some users... want mixed case!)
143             Returns : A scalar
144             Status : Virtual
145              
146             =cut
147              
148             sub seq {
149 0     0 1 0 my ($self) = @_;
150 0         0 $self->throw_not_implemented();
151             }
152              
153              
154             =head2 subseq
155              
156             Title : subseq
157             Usage : $substring = $obj->subseq(10,40);
158             Function: Returns the subseq from start to end, where the first base
159             is 1 and the number is inclusive, i.e. 1-2 are the first two
160             bases of the sequence.
161              
162             Start cannot be larger than end but can be equal.
163              
164             Returns : A string
165             Args :
166             Status : Virtual
167              
168             =cut
169              
170             sub subseq {
171 0     0 1 0 my ($self) = @_;
172 0         0 $self->throw_not_implemented();
173             }
174              
175              
176             =head2 display_id
177              
178             Title : display_id
179             Usage : $id_string = $obj->display_id();
180             Function: Returns the display id, also known as the common name of the Sequence
181             object.
182              
183             The semantics of this is that it is the most likely string
184             to be used as an identifier of the sequence, and likely to
185             have "human" readability. The id is equivalent to the ID
186             field of the GenBank/EMBL databanks and the id field of the
187             Swissprot/sptrembl database. In fasta format, the >(\S+) is
188             presumed to be the id, though some people overload the id
189             to embed other information. Bioperl does not use any
190             embedded information in the ID field, and people are
191             encouraged to use other mechanisms (accession field for
192             example, or extending the sequence object) to solve this.
193              
194             Notice that $seq->id() maps to this function, mainly for
195             legacy/convenience reasons.
196             Returns : A string
197             Args : None
198             Status : Virtual
199              
200             =cut
201              
202             sub display_id {
203 0     0 1 0 my ($self) = @_;
204 0         0 $self->throw_not_implemented();
205             }
206              
207              
208             =head2 accession_number
209              
210             Title : accession_number
211             Usage : $unique_biological_key = $obj->accession_number;
212             Function: Returns the unique biological id for a sequence, commonly
213             called the accession_number. For sequences from established
214             databases, the implementors should try to use the correct
215             accession number. Notice that primary_id() provides the
216             unique id for the implemetation, allowing multiple objects
217             to have the same accession number in a particular implementation.
218              
219             For sequences with no accession number, this method should return
220             "unknown".
221             Returns : A string
222             Args : None
223             Status : Virtual
224              
225             =cut
226              
227             sub accession_number {
228 0     0 1 0 my ($self,@args) = @_;
229 0         0 $self->throw_not_implemented();
230             }
231              
232              
233             =head2 primary_id
234              
235             Title : primary_id
236             Usage : $unique_implementation_key = $obj->primary_id;
237             Function: Returns the unique id for this object in this
238             implementation. This allows implementations to manage their
239             own object ids in a way the implementaiton can control
240             clients can expect one id to map to one object.
241              
242             For sequences with no accession number, this method should
243             return a stringified memory location.
244              
245             Returns : A string
246             Args : None
247             Status : Virtual
248              
249             =cut
250              
251             sub primary_id {
252 0     0 1 0 my ($self,@args) = @_;
253 0         0 $self->throw_not_implemented();
254             }
255              
256              
257             =head2 can_call_new
258              
259             Title : can_call_new
260             Usage : if( $obj->can_call_new ) {
261             $newobj = $obj->new( %param );
262             }
263             Function: Can_call_new returns 1 or 0 depending
264             on whether an implementation allows new
265             constructor to be called. If a new constructor
266             is allowed, then it should take the followed hashed
267             constructor list.
268              
269             $myobject->new( -seq => $sequence_as_string,
270             -display_id => $id
271             -accession_number => $accession
272             -alphabet => 'dna',
273             );
274             Returns : 1 or 0
275             Args :
276              
277              
278             =cut
279              
280             sub can_call_new {
281 0     0 1 0 my ($self,@args) = @_;
282             # we default to 0 here
283 0         0 return 0;
284             }
285              
286              
287             =head2 alphabet
288              
289             Title : alphabet
290             Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
291             Function: Returns the type of sequence being one of
292             'dna', 'rna' or 'protein'. This is case sensitive.
293              
294             This is not called "type" because this would cause
295             upgrade problems from the 0.5 and earlier Seq objects.
296              
297             Returns : A string either 'dna','rna','protein'. NB - the object must
298             make a call of the alphabet, if there is no alphabet specified it
299             has to guess.
300             Args : None
301             Status : Virtual
302              
303             =cut
304              
305             sub alphabet {
306 0     0 1 0 my ( $self ) = @_;
307 0         0 $self->throw_not_implemented();
308             }
309              
310              
311             =head2 moltype
312              
313             Title : moltype
314             Usage : Deprecated. Use alphabet() instead.
315              
316             =cut
317              
318             sub moltype {
319 0     0 1 0 my ($self,@args) = @_;
320 0         0 $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
321 0         0 return $self->alphabet(@args);
322             }
323              
324              
325             =head1 Implementation-optional Functions
326              
327             The following functions rely on the above functions. An
328             implementing class does not need to provide these functions, as they
329             will be provided by this class, but is free to override these
330             functions.
331              
332             The revcom(), trunc(), and translate() methods create new sequence
333             objects. They will call new() on the class of the sequence object
334             instance passed as argument, unless can_call_new() returns FALSE. In
335             the latter case a Bio::PrimarySeq object will be created. Implementors
336             which really want to control how objects are created (eg, for object
337             persistence over a database, or objects in a CORBA framework), they
338             are encouraged to override these methods
339              
340             =head2 revcom
341              
342             Title : revcom
343             Usage : $rev = $seq->revcom()
344             Function: Produces a new Bio::PrimarySeqI implementing object which
345             is the reversed complement of the sequence. For protein
346             sequences this throws an exception of "Sequence is a
347             protein. Cannot revcom".
348              
349             The id is the same id as the original sequence, and the
350             accession number is also indentical. If someone wants to
351             track that this sequence has be reversed, it needs to
352             define its own extensions.
353              
354             To do an inplace edit of an object you can go:
355              
356             $seq = $seq->revcom();
357              
358             This of course, causes Perl to handle the garbage
359             collection of the old object, but it is roughly speaking as
360             efficient as an inplace edit.
361              
362             Returns : A new (fresh) Bio::PrimarySeqI object
363             Args : None
364              
365              
366             =cut
367              
368             sub revcom {
369 11271     11271 1 9271 my ($self) = @_;
370              
371             # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
372             # or 'Bio::Seq::LargeSeq', if not take advantage of
373             # Bio::Root::clone to get an object copy
374 11271         8035 my $out;
375 11271 50 33     82274 if ( $self->isa('Bio::Seq::LargePrimarySeq')
376             or $self->isa('Bio::Seq::LargeSeq')
377             ) {
378 0         0 my ($seqclass, $opts) = $self->_setup_class;
379 0         0 $out = $seqclass->new(
380             -seq => $self->_revcom_from_string($self->seq, $self->alphabet),
381             -is_circular => $self->is_circular,
382             -display_id => $self->display_id,
383             -accession_number => $self->accession_number,
384             -alphabet => $self->alphabet,
385             -desc => $self->desc,
386             -verbose => $self->verbose,
387             %$opts,
388             );
389             } else {
390 11271         21396 $out = $self->clone;
391 11271         18501 $out->seq( $out->_revcom_from_string($out->seq, $out->alphabet) );
392             }
393 11271         22718 return $out;
394             }
395              
396              
397             sub _revcom_from_string {
398 11337     11337   12025 my ($self, $string, $alphabet) = @_;
399              
400             # Check that reverse-complementing makes sense
401 11337 50       16156 if( $alphabet eq 'protein' ) {
402 0         0 $self->throw("Sequence is a protein. Cannot revcom.");
403             }
404 11337 50 66     19250 if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
405 0         0 my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
406             "but unsure if this is right.";
407 0 0       0 if( $self->can('warn') ) {
408 0         0 $self->warn($msg);
409             } else {
410 0         0 warn("[$self] $msg");
411             }
412             }
413              
414             # If sequence is RNA, map to DNA (then map back later)
415 11337 100       14300 if( $alphabet eq 'rna' ) {
416 1         3 $string =~ tr/uU/tT/;
417             }
418              
419             # Reverse-complement now
420 11337         13053 $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
421 11337         11413 $string = CORE::reverse $string;
422              
423             # Map back RNA to DNA
424 11337 100       13465 if( $alphabet eq 'rna' ) {
425 1         1 $string =~ tr/tT/uU/;
426             }
427              
428 11337         18972 return $string;
429             }
430              
431              
432             =head2 trunc
433              
434             Title : trunc
435             Usage : $subseq = $myseq->trunc(10,100);
436             Function: Provides a truncation of a sequence.
437             Returns : A fresh Bio::PrimarySeqI implementing object.
438             Args : Two integers denoting first and last base of the sub-sequence.
439              
440              
441             =cut
442              
443             sub trunc {
444 320     320 1 357 my ($self,$start,$end) = @_;
445              
446 320         249 my $str;
447 320 100 66     1631 if( defined $start && ref($start) &&
    50 66        
    50          
448             $start->isa('Bio::LocationI') ) {
449 2         4 $str = $self->subseq($start); # start is a location actually
450             } elsif( !$end ) {
451 0         0 $self->throw("trunc start,end -- there was no end for $start");
452             } elsif( $end < $start ) {
453 0         0 my $msg = "start [$start] is greater than end [$end]. \n".
454             "If you want to truncated and reverse complement, \n".
455             "you must call trunc followed by revcom. Sorry.";
456 0         0 $self->throw($msg);
457             } else {
458 318         580 $str = $self->subseq($start,$end);
459             }
460              
461             # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
462             # or 'Bio::Seq::LargeSeq', if not take advantage of
463             # Bio::Root::clone to get an object copy
464 320         271 my $out;
465 320 100 66     3447 if ( $self->isa('Bio::Seq::LargePrimarySeq')
      100        
466             or $self->isa('Bio::Seq::LargeSeq')
467             or $self->isa('Bio::Seq::RichSeq')
468             ) {
469 8         25 my ($seqclass, $opts) = $self->_setup_class;
470 8         29 $out = $seqclass->new(
471             -seq => $str,
472             -is_circular => $self->is_circular,
473             -display_id => $self->display_id,
474             -accession_number => $self->accession_number,
475             -alphabet => $self->alphabet,
476             -desc => $self->desc,
477             -verbose => $self->verbose,
478             %$opts,
479             );
480             } else {
481 312         702 $out = $self->clone;
482 312         606 $out->seq($str);
483             }
484 320         623 return $out;
485             }
486              
487              
488             =head2 translate
489              
490             Title : translate
491             Usage : $protein_seq_obj = $dna_seq_obj->translate
492              
493             Or if you expect a complete coding sequence (CDS) translation,
494             with initiator at the beginning and terminator at the end:
495              
496             $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
497              
498             Or if you want translate() to find the first initiation
499             codon and return the corresponding protein:
500              
501             $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
502              
503             Function: Provides the translation of the DNA sequence using full
504             IUPAC ambiguities in DNA/RNA and amino acid codes.
505              
506             The complete CDS translation is identical to EMBL/TREMBL
507             database translation. Note that the trailing terminator
508             character is removed before returning the translated protein
509             object.
510              
511             Note: if you set $dna_seq_obj->verbose(1) you will get a
512             warning if the first codon is not a valid initiator.
513              
514             Returns : A Bio::PrimarySeqI implementing object
515             Args : -terminator
516             character for terminator, default '*'
517             -unknown
518             character for unknown, default 'X'
519             -frame
520             positive integer frame shift (in bases), default 0
521             -codontable_id
522             integer codon table id, default 1
523             -complete
524             boolean, if true, complete CDS is expected. default false
525             -complete_codons
526             boolean, if true, codons which are incomplete are translated if a
527             suitable amino acid is found. For instance, if the incomplete
528             codon is 'GG', the completed codon is 'GGN', which is glycine
529             (G). Defaults to 'false'; setting '-complete' also makes this
530             true.
531             -throw
532             boolean, throw exception if ORF not complete, default false
533             -orf
534             if 'longest', find longest ORF. other true value, find
535             first ORF. default 0
536             -codontable
537             optional L object to use for
538             translation
539             -start
540             optional three-character string to force as initiation
541             codon (e.g. 'atg'). If unset, start codons are
542             determined by the CodonTable. Case insensitive.
543             -offset
544             optional positive integer offset for fuzzy locations.
545             if set, must be either 1, 2, or 3
546              
547             =head3 Notes
548              
549             The -start argument only applies when -orf is set to 1. By default all
550             initiation codons found in the given codon table are used but when
551             "start" is set to some codon this codon will be used exclusively as
552             the initiation codon. Note that the default codon table (NCBI
553             "Standard") has 3 initiation codons!
554              
555             By default translate() translates termination codons to the some
556             character (default is *), both internal and trailing codons. Setting
557             "-complete" to 1 tells translate() to remove the trailing character.
558              
559             -offset is used for seqfeatures which contain the the \codon_start tag
560             and can be set to 1, 2, or 3. This is the offset by which the
561             sequence translation starts relative to the first base of the feature
562              
563             For details on codon tables used by translate() see L.
564              
565             Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
566             element in an array:
567              
568             1: character for terminator (optional), defaults to '*'.
569             2: character for unknown amino acid (optional), defaults to 'X'.
570             3: frame (optional), valid values are 0, 1, 2, defaults to 0.
571             4: codon table id (optional), defaults to 1.
572             5: complete coding sequence expected, defaults to 0 (false).
573             6: boolean, throw exception if not complete coding sequence
574             (true), defaults to warning (false)
575             7: codontable, a custom Bio::Tools::CodonTable object (optional).
576              
577             =cut
578              
579             sub translate {
580 287     287 1 1213 my ($self,@args) = @_;
581 287         265 my ($terminator, $unknown, $frame, $codonTableId, $complete,
582             $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
583              
584             ## new API with named parameters, post 1.5.1
585 287 100 100     758 if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
586 24         88 ($terminator, $unknown, $frame, $codonTableId, $complete,
587             $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
588             $self->_rearrange([qw(TERMINATOR
589             UNKNOWN
590             FRAME
591             CODONTABLE_ID
592             COMPLETE
593             COMPLETE_CODONS
594             THROW
595             CODONTABLE
596             ORF
597             START
598             OFFSET)], @args);
599             ## old API, 1.5.1 and preceding versions
600             } else {
601 263         359 ($terminator, $unknown, $frame, $codonTableId,
602             $complete, $throw, $codonTable, $offset) = @args;
603             }
604            
605             ## Initialize termination codon, unknown codon, codon table id, frame
606 287 100 66     679 $terminator = '*' unless (defined($terminator) and $terminator ne '');
607 287 100 66     563 $unknown = "X" unless (defined($unknown) and $unknown ne '');
608 287 100 66     556 $frame = 0 unless (defined($frame) and $frame ne '');
609 287 100 66     592 $codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne '');
610 287   100     1197 $complete_codons ||= $complete || 0;
      100        
611            
612             ## Get a CodonTable, error if custom CodonTable is invalid
613 287 100       373 if ($codonTable) {
614 1 50       4 $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
615             unless $codonTable->isa('Bio::Tools::CodonTable');
616             } else {
617            
618             # shouldn't this be cached? Seems wasteful to have a new instance
619             # every time...
620 286         1092 $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
621             }
622              
623             ## Error if alphabet is "protein"
624 287 50       626 $self->throw("Can't translate an amino acid sequence.") if
625             ($self->alphabet =~ /protein/i);
626              
627             ## Error if -start parameter isn't a valid codon
628 287 100       448 if ($start_codon) {
629 1 50       3 $self->throw("Invalid start codon: $start_codon.") if
630             ( $start_codon !~ /^[A-Z]{3}$/i );
631             }
632              
633 287         241 my $seq;
634 287 50       413 if ($offset) {
635 0 0       0 $self->throw("Offset must be 1, 2, or 3.") if
636             ( $offset !~ /^[123]$/ );
637 0         0 my ($start, $end) = ($offset, $self->length);
638 0         0 ($seq) = $self->subseq($start, $end);
639             } else {
640 287         505 ($seq) = $self->seq();
641             }
642              
643             ## ignore frame if an ORF is supposed to be found
644 287 100       459 if ( $orf ) {
645 12 100       36 my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
646 11         24 $seq = $self->_orf_sequence( $seq, $orf_region );
647             } else {
648             ## use frame, error if frame is not 0, 1 or 2
649 275 50 100     651 $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
      66        
650             unless ($frame == 0 or $frame == 1 or $frame == 2);
651 275         583 $seq = substr($seq,$frame);
652             }
653              
654             ## Translate it
655 286         683 my $output = $codonTable->translate($seq, $complete_codons);
656             # Use user-input terminator/unknown
657 286         1038 $output =~ s/\*/$terminator/g;
658 286         406 $output =~ s/X/$unknown/g;
659              
660             ## Only if we are expecting to translate a complete coding region
661 286 100       443 if ($complete) {
662 9         19 my $id = $self->display_id;
663             # remove the terminator character
664 9 50       16 if( substr($output,-1,1) eq $terminator ) {
665 9         11 chop $output;
666             } else {
667 0 0       0 $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
668 0         0 $self->warn("Seq [$id]: Not using a valid terminator codon!");
669             }
670             # test if there are terminator characters inside the protein sequence!
671 9 100       48 if ($output =~ /\Q$terminator\E/) {
672 2   100     8 $id ||= '';
673 2 50       14 $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
674 0         0 $self->warn("Seq [$id]: Terminator codon inside CDS!");
675             }
676             # if the initiator codon is not ATG, the amino acid needs to be changed to M
677 7 100       15 if ( substr($output,0,1) ne 'M' ) {
678 3 50       10 if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
    0          
679 3         4 $output = 'M'. substr($output,1);
680             } elsif ($throw) {
681 0         0 $self->throw("Seq [$id]: Not using a valid initiator codon!");
682             } else {
683 0         0 $self->warn("Seq [$id]: Not using a valid initiator codon!");
684             }
685             }
686             }
687              
688             # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
689             # or 'Bio::Seq::LargeSeq', if not take advantage of
690             # Bio::Root::clone to get an object copy
691 284         244 my $out;
692 284 100 100     3023 if ( $self->isa('Bio::Seq::LargePrimarySeq')
693             or $self->isa('Bio::Seq::LargeSeq')
694             ) {
695 3         11 my ($seqclass, $opts) = $self->_setup_class;
696 3         12 $out = $seqclass->new(
697             -seq => $output,
698             -is_circular => $self->is_circular,
699             -display_id => $self->display_id,
700             -accession_number => $self->accession_number,
701             -alphabet => 'protein',
702             -desc => $self->desc,
703             -verbose => $self->verbose,
704             %$opts,
705             );
706             } else {
707 281         698 $out = $self->clone;
708 281         706 $out->seq($output);
709 281         519 $out->alphabet('protein');
710             }
711 284         1014 return $out;
712             }
713              
714              
715             =head2 transcribe()
716              
717             Title : transcribe
718             Usage : $xseq = $seq->transcribe
719             Function: Convert base T to base U
720             Returns : PrimarySeqI object of alphabet 'rna' or
721             undef if $seq->alphabet ne 'dna'
722             Args :
723              
724             =cut
725              
726             sub transcribe {
727 2     2 1 3 my $self = shift;
728 2 100       4 return unless $self->alphabet eq 'dna';
729 1         2 my $s = $self->seq;
730 1         2 $s =~ tr/tT/uU/;
731 1   50     1 my $desc = $self->desc || '';
732              
733             # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
734             # or 'Bio::Seq::LargeSeq', if not take advantage of
735             # Bio::Root::clone to get an object copy
736 1         2 my $out;
737 1 50 33     10 if ( $self->isa('Bio::Seq::LargePrimarySeq')
738             or $self->isa('Bio::Seq::LargeSeq')
739             ) {
740 0         0 my ($seqclass, $opts) = $self->_setup_class;
741 0         0 $out = $seqclass->new(
742             -seq => $s,
743             -is_circular => $self->is_circular,
744             -display_id => $self->display_id,
745             -accession_number => $self->accession_number,
746             -alphabet => 'rna',
747             -desc => "${desc}[TRANSCRIBED]",
748             -verbose => $self->verbose,
749             %$opts,
750             );
751             } else {
752 1         3 $out = $self->clone;
753 1         3 $out->seq($s);
754 1         3 $out->alphabet('rna');
755 1         3 $out->desc($desc . "[TRANSCRIBED]");
756             }
757 1         4 return $out;
758             }
759              
760              
761             =head2 rev_transcribe()
762              
763             Title : rev_transcribe
764             Usage : $rtseq = $seq->rev_transcribe
765             Function: Convert base U to base T
766             Returns : PrimarySeqI object of alphabet 'dna' or
767             undef if $seq->alphabet ne 'rna'
768             Args :
769              
770             =cut
771              
772             sub rev_transcribe {
773 1     1 1 2 my $self = shift;
774 1 50       3 return unless $self->alphabet eq 'rna';
775 1         2 my $s = $self->seq;
776 1         2 $s =~ tr/uU/tT/;
777 1   50     3 my $desc = $self->desc || '';
778              
779             # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
780             # or 'Bio::Seq::LargeSeq', if not take advantage of
781             # Bio::Root::clone to get an object copy
782 1         1 my $out;
783 1 50 33     10 if ( $self->isa('Bio::Seq::LargePrimarySeq')
784             or $self->isa('Bio::Seq::LargeSeq')
785             ) {
786 0         0 my ($seqclass, $opts) = $self->_setup_class;
787 0         0 $out = $seqclass->new(
788             -seq => $s,
789             -is_circular => $self->is_circular,
790             -display_id => $self->display_id,
791             -accession_number => $self->accession_number,
792             -alphabet => 'dna',
793             -desc => $self->desc . "[REVERSE TRANSCRIBED]",
794             -verbose => $self->verbose,
795             %$opts,
796             );
797             } else {
798 1         4 $out = $self->clone;
799 1         2 $out->seq($s);
800 1         2 $out->alphabet('dna');
801 1         4 $out->desc($desc . "[REVERSE TRANSCRIBED]");
802             }
803 1         3 return $out;
804             }
805              
806              
807             =head2 id
808              
809             Title : id
810             Usage : $id = $seq->id()
811             Function: ID of the sequence. This should normally be (and actually is in
812             the implementation provided here) just a synonym for display_id().
813             Returns : A string.
814             Args :
815              
816             =cut
817              
818             sub id {
819 1     1 1 1 my ($self)= @_;
820 1         3 return $self->display_id();
821             }
822              
823              
824             =head2 length
825              
826             Title : length
827             Usage : $len = $seq->length()
828             Function:
829             Returns : Integer representing the length of the sequence.
830             Args :
831              
832             =cut
833              
834             sub length {
835 0     0 1 0 my ($self)= @_;
836 0         0 $self->throw_not_implemented();
837             }
838              
839              
840             =head2 desc
841              
842             Title : desc
843             Usage : $seq->desc($newval);
844             $description = $seq->desc();
845             Function: Get/set description text for a seq object
846             Returns : Value of desc
847             Args : newvalue (optional)
848              
849             =cut
850              
851             sub desc {
852 0     0 1 0 shift->throw_not_implemented();
853             }
854              
855              
856             =head2 is_circular
857              
858             Title : is_circular
859             Usage : if( $obj->is_circular) { # Do something }
860             Function: Returns true if the molecule is circular
861             Returns : Boolean value
862             Args : none
863              
864             =cut
865              
866             sub is_circular {
867 0     0 1 0 shift->throw_not_implemented;
868             }
869              
870              
871             =head1 Private functions
872              
873             These are some private functions for the PrimarySeqI interface. You do not
874             need to implement these functions
875              
876             =head2 _find_orfs_nucleotide
877              
878             Title : _find_orfs_nucleotide
879             Usage :
880             Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
881             The ORF is not required to have a termination codon.
882             Example :
883             Returns : a list of string coordinates of ORF locations (0-based half-open),
884             sorted descending by length (so that the longest is first)
885             as: [ start, end, frame, length ], [ start, end, frame, length ], ...
886             Args : Nucleotide sequence,
887             CodonTable object,
888             (optional) alternative initiation codon (e.g. 'ATA'),
889             (optional) boolean that, if true, stops after finding the
890             first available ORF
891              
892             =cut
893              
894             sub _find_orfs_nucleotide {
895 14     14   19 my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
896 14         18 $sequence = uc $sequence;
897 14 100       19 $start_codon = uc $start_codon if $start_codon;
898              
899             my $is_start = $start_codon
900 12     12   27 ? sub { shift eq $start_codon }
901 14 100   697   42 : sub { $codon_table->is_start_codon( shift ) };
  697         795  
902              
903             # stores the begin index of the currently-running ORF in each
904             # reading frame
905 14         22 my @current_orf_start = (-1,-1,-1);
906              
907             #< stores coordinates of longest observed orf (so far) in each
908             # reading frame
909 14         8 my @orfs;
910              
911             # go through each base of the sequence, and each reading frame for each base
912 14         9 my $seqlen = CORE::length $sequence;
913 14         11 my @start_frame_order;
914 14         28 for( my $j = 0; $j <= $seqlen-3; $j++ ) {
915 1239         928 my $frame = $j % 3;
916              
917 1239         973 my $this_codon = substr( $sequence, $j, 3 );
918              
919             # if in an orf and this is either a stop codon or the last in-frame codon in the string
920 1239 100       1396 if ( $current_orf_start[$frame] >= 0 ) {
    100          
921 530 100 100     600 if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
922             # record ORF start, end (half-open), length, and frame
923 44         63 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
924 44         47 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
925              
926 44 100 100     95 if ($first_only && $frame == $start_frame_order[0]) {
927 10 100       20 $self->warn( "Translating partial ORF "
928             .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
929             .' from end of nucleotide sequence'
930             ) if $is_last_codon_in_frame;
931 9         33 return \@this_orf;
932             }
933 34         30 push @orfs, \@this_orf;
934 34         73 $current_orf_start[$frame] = -1;
935             }
936             }
937             # if this is a start codon
938             elsif ( $is_start->($this_codon) ) {
939 44         29 $current_orf_start[$frame] = $j;
940 44         80 push @start_frame_order, $frame;
941             }
942             }
943              
944 4         20 return sort { $b->[2] <=> $a->[2] } @orfs;
  88         73  
945             }
946              
947              
948             sub _truncate_seq {
949 4     4   5 my ($self, $seq) = @_;
950 4 50       16 return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
951             }
952              
953              
954             sub _orf_sequence {
955 15     15   15 my ($self, $seq, $orf ) = @_;
956 15 50       22 return '' unless $orf;
957 15         39 return substr( $seq, $orf->[0], $orf->[2] )
958             }
959              
960              
961             =head2 _attempt_to_load_Seq
962              
963             Title : _attempt_to_load_Seq
964             Usage :
965             Function:
966             Example :
967             Returns :
968             Args :
969              
970             =cut
971              
972             sub _attempt_to_load_Seq {
973 0     0   0 my ($self) = @_;
974              
975 0 0       0 if( $main::{'Bio::PrimarySeq'} ) {
976 0         0 return 1;
977             } else {
978 0         0 eval {
979 0         0 require Bio::PrimarySeq;
980             };
981 0 0       0 if( $@ ) {
982 0         0 my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
983             "This indicates that you are using Bio::PrimarySeqI ".
984             "without Bio::PrimarySeq loaded or without providing a ".
985             "complete implementation.\nThe most likely problem is that there ".
986             "has been a misconfiguration of the bioperl environment\n".
987             "Actual exception:\n\n";
988 0         0 $self->throw("$text$@\n");
989 0         0 return 0;
990             }
991 0         0 return 1;
992             }
993             }
994              
995              
996             sub _setup_class {
997             # Return name of class and setup some default parameters
998 11     11   11 my ($self) = @_;
999 11         11 my $seqclass;
1000 11 50       28 if ($self->can_call_new()) {
1001 11         17 $seqclass = ref($self);
1002             } else {
1003 0         0 $seqclass = 'Bio::PrimarySeq';
1004 0         0 $self->_attempt_to_load_Seq();
1005             }
1006 11         7 my %opts;
1007 11 50       21 if ($seqclass eq 'Bio::PrimarySeq') {
1008             # Since sequence is in a Seq object, it has already been validated.
1009             # We do not need to validate its trunc(), revcom(), etc
1010 0         0 $opts{ -direct } = 1;
1011             }
1012 11         19 return $seqclass, \%opts;
1013             }
1014              
1015              
1016             1;