File Coverage

Bio/PrimarySeqI.pm

Criterion	Covered	Total	%
statement	146	202	72.2
branch	70	102	68.6
condition	48	65	73.8
subroutine	16	28	57.1
pod	17	17	100.0
total	297	414	71.7

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::PrimarySeqI
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Ewan Birney
7							#
8							# Copyright Ewan Birney
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14
15							=head1 NAME
16
17							Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
18
19							=head1 SYNOPSIS
20
21							# Bio::PrimarySeqI is the interface class for sequences.
22							# If you are a newcomer to bioperl, you might want to start with
23							# Bio::Seq documentation.
24
25							# Test if this is a seq object
26							$obj->isa("Bio::PrimarySeqI") \|\|
27							$obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
28
29							# Accessors
30							$string = $obj->seq();
31							$substring = $obj->subseq(12,50);
32							$display = $obj->display_id(); # for human display
33							$id = $obj->primary_id(); # unique id for this object,
34							# implementation defined
35							$unique_key= $obj->accession_number(); # unique biological id
36
37
38							# Object manipulation
39							eval {
40							$rev = $obj->revcom();
41							};
42							if( $@ ) {
43							$obj->throw( "Could not reverse complement. ".
44							"Probably not DNA. Actual exception\n$@\n" );
45							}
46
47							$trunc = $obj->trunc(12,50);
48							# $rev and $trunc are Bio::PrimarySeqI compliant objects
49
50
51							=head1 DESCRIPTION
52
53							This object defines an abstract interface to basic sequence
54							information - for most users of the package the documentation (and
55							methods) in this class are not useful - this is a developers-only
56							class which defines what methods have to be implmented by other Perl
57							objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
58							Bio::Seq" or "man Bio::Seq" for more information on the main class for
59							sequences.
60
61							PrimarySeq is an object just for the sequence and its name(s), nothing
62							more. Seq is the larger object complete with features. There is a pure
63							perl implementation of this in L. If you just want to
64							use L objects, then please read that module first. This
65							module defines the interface, and is of more interest to people who
66							want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
67							they "are" bioperl sequence objects, even though it is not using Perl
68							to store the sequence etc.
69
70							This interface defines what bioperl considers necessary to "be" a
71							sequence, without providing an implementation of this, an
72							implementation is provided in L. If you want to provide
73							a Bio::PrimarySeq-compliant object which in fact wraps another
74							object/database/out-of-perl experience, then this is the correct thing
75							to wrap, generally by providing a wrapper class which would inherit
76							from your object and this Bio::PrimarySeqI interface. The wrapper class
77							then would have methods lists in the "Implementation Specific
78							Functions" which would provide these methods for your object.
79
80							=head1 FEEDBACK
81
82							=head2 Mailing Lists
83
84							User feedback is an integral part of the evolution of this and other
85							Bioperl modules. Send your comments and suggestions preferably to one
86							of the Bioperl mailing lists. Your participation is much appreciated.
87
88							bioperl-l@bioperl.org - General discussion
89							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
90
91							=head2 Support
92
93							Please direct usage questions or support issues to the mailing list:
94
95							I
96
97							rather than to the module maintainer directly. Many experienced and
98							reponsive experts will be able look at the problem and quickly
99							address it. Please include a thorough description of the problem
100							with code and data examples if at all possible.
101
102							=head2 Reporting Bugs
103
104							Report bugs to the Bioperl bug tracking system to help us keep track
105							the bugs and their resolution. Bug reports can be submitted via the
106							web:
107
108							https://github.com/bioperl/bioperl-live/issues
109
110							=head1 AUTHOR - Ewan Birney
111
112							Email birney@ebi.ac.uk
113
114							=head1 APPENDIX
115
116							The rest of the documentation details each of the object
117							methods. Internal methods are usually preceded with a _
118
119							=cut
120
121
122							package Bio::PrimarySeqI;
123	203			203		963	use strict;
	203					289
	203					4800
124	203			203		69448	use Bio::Tools::CodonTable;
	203					365
	203					4923
125
126	203			203		976	use base qw(Bio::Root::RootI);
	203					231
	203					382117
127
128
129							=head1 Implementation-specific Functions
130
131							These functions are the ones that a specific implementation must
132							define.
133
134							=head2 seq
135
136							Title : seq
137							Usage : $string = $obj->seq()
138							Function: Returns the sequence as a string of letters. The
139							case of the letters is left up to the implementer.
140							Suggested cases are upper case for proteins and lower case for
141							DNA sequence (IUPAC standard), but implementations are suggested to
142							keep an open mind about case (some users... want mixed case!)
143							Returns : A scalar
144							Status : Virtual
145
146							=cut
147
148							sub seq {
149	0			0	1	0	my ($self) = @_;
150	0					0	$self->throw_not_implemented();
151							}
152
153
154							=head2 subseq
155
156							Title : subseq
157							Usage : $substring = $obj->subseq(10,40);
158							Function: Returns the subseq from start to end, where the first base
159							is 1 and the number is inclusive, i.e. 1-2 are the first two
160							bases of the sequence.
161
162							Start cannot be larger than end but can be equal.
163
164							Returns : A string
165							Args :
166							Status : Virtual
167
168							=cut
169
170							sub subseq {
171	0			0	1	0	my ($self) = @_;
172	0					0	$self->throw_not_implemented();
173							}
174
175
176							=head2 display_id
177
178							Title : display_id
179							Usage : $id_string = $obj->display_id();
180							Function: Returns the display id, also known as the common name of the Sequence
181							object.
182
183							The semantics of this is that it is the most likely string
184							to be used as an identifier of the sequence, and likely to
185							have "human" readability. The id is equivalent to the ID
186							field of the GenBank/EMBL databanks and the id field of the
187							Swissprot/sptrembl database. In fasta format, the >(\S+) is
188							presumed to be the id, though some people overload the id
189							to embed other information. Bioperl does not use any
190							embedded information in the ID field, and people are
191							encouraged to use other mechanisms (accession field for
192							example, or extending the sequence object) to solve this.
193
194							Notice that $seq->id() maps to this function, mainly for
195							legacy/convenience reasons.
196							Returns : A string
197							Args : None
198							Status : Virtual
199
200							=cut
201
202							sub display_id {
203	0			0	1	0	my ($self) = @_;
204	0					0	$self->throw_not_implemented();
205							}
206
207
208							=head2 accession_number
209
210							Title : accession_number
211							Usage : $unique_biological_key = $obj->accession_number;
212							Function: Returns the unique biological id for a sequence, commonly
213							called the accession_number. For sequences from established
214							databases, the implementors should try to use the correct
215							accession number. Notice that primary_id() provides the
216							unique id for the implemetation, allowing multiple objects
217							to have the same accession number in a particular implementation.
218
219							For sequences with no accession number, this method should return
220							"unknown".
221							Returns : A string
222							Args : None
223							Status : Virtual
224
225							=cut
226
227							sub accession_number {
228	0			0	1	0	my ($self,@args) = @_;
229	0					0	$self->throw_not_implemented();
230							}
231
232
233							=head2 primary_id
234
235							Title : primary_id
236							Usage : $unique_implementation_key = $obj->primary_id;
237							Function: Returns the unique id for this object in this
238							implementation. This allows implementations to manage their
239							own object ids in a way the implementaiton can control
240							clients can expect one id to map to one object.
241
242							For sequences with no accession number, this method should
243							return a stringified memory location.
244
245							Returns : A string
246							Args : None
247							Status : Virtual
248
249							=cut
250
251							sub primary_id {
252	0			0	1	0	my ($self,@args) = @_;
253	0					0	$self->throw_not_implemented();
254							}
255
256
257							=head2 can_call_new
258
259							Title : can_call_new
260							Usage : if( $obj->can_call_new ) {
261							$newobj = $obj->new( %param );
262							}
263							Function: Can_call_new returns 1 or 0 depending
264							on whether an implementation allows new
265							constructor to be called. If a new constructor
266							is allowed, then it should take the followed hashed
267							constructor list.
268
269							$myobject->new( -seq => $sequence_as_string,
270							-display_id => $id
271							-accession_number => $accession
272							-alphabet => 'dna',
273							);
274							Returns : 1 or 0
275							Args :
276
277
278							=cut
279
280							sub can_call_new {
281	0			0	1	0	my ($self,@args) = @_;
282							# we default to 0 here
283	0					0	return 0;
284							}
285
286
287							=head2 alphabet
288
289							Title : alphabet
290							Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
291							Function: Returns the type of sequence being one of
292							'dna', 'rna' or 'protein'. This is case sensitive.
293
294							This is not called "type" because this would cause
295							upgrade problems from the 0.5 and earlier Seq objects.
296
297							Returns : A string either 'dna','rna','protein'. NB - the object must
298							make a call of the alphabet, if there is no alphabet specified it
299							has to guess.
300							Args : None
301							Status : Virtual
302
303							=cut
304
305							sub alphabet {
306	0			0	1	0	my ( $self ) = @_;
307	0					0	$self->throw_not_implemented();
308							}
309
310
311							=head2 moltype
312
313							Title : moltype
314							Usage : Deprecated. Use alphabet() instead.
315
316							=cut
317
318							sub moltype {
319	0			0	1	0	my ($self,@args) = @_;
320	0					0	$self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
321	0					0	return $self->alphabet(@args);
322							}
323
324
325							=head1 Implementation-optional Functions
326
327							The following functions rely on the above functions. An
328							implementing class does not need to provide these functions, as they
329							will be provided by this class, but is free to override these
330							functions.
331
332							The revcom(), trunc(), and translate() methods create new sequence
333							objects. They will call new() on the class of the sequence object
334							instance passed as argument, unless can_call_new() returns FALSE. In
335							the latter case a Bio::PrimarySeq object will be created. Implementors
336							which really want to control how objects are created (eg, for object
337							persistence over a database, or objects in a CORBA framework), they
338							are encouraged to override these methods
339
340							=head2 revcom
341
342							Title : revcom
343							Usage : $rev = $seq->revcom()
344							Function: Produces a new Bio::PrimarySeqI implementing object which
345							is the reversed complement of the sequence. For protein
346							sequences this throws an exception of "Sequence is a
347							protein. Cannot revcom".
348
349							The id is the same id as the original sequence, and the
350							accession number is also indentical. If someone wants to
351							track that this sequence has be reversed, it needs to
352							define its own extensions.
353
354							To do an inplace edit of an object you can go:
355
356							$seq = $seq->revcom();
357
358							This of course, causes Perl to handle the garbage
359							collection of the old object, but it is roughly speaking as
360							efficient as an inplace edit.
361
362							Returns : A new (fresh) Bio::PrimarySeqI object
363							Args : None
364
365
366							=cut
367
368							sub revcom {
369	11271			11271	1	9271	my ($self) = @_;
370
371							# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
372							# or 'Bio::Seq::LargeSeq', if not take advantage of
373							# Bio::Root::clone to get an object copy
374	11271					8035	my $out;
375	11271	50	33			82274	if ( $self->isa('Bio::Seq::LargePrimarySeq')
376							or $self->isa('Bio::Seq::LargeSeq')
377							) {
378	0					0	my ($seqclass, $opts) = $self->_setup_class;
379	0					0	$out = $seqclass->new(
380							-seq => $self->_revcom_from_string($self->seq, $self->alphabet),
381							-is_circular => $self->is_circular,
382							-display_id => $self->display_id,
383							-accession_number => $self->accession_number,
384							-alphabet => $self->alphabet,
385							-desc => $self->desc,
386							-verbose => $self->verbose,
387							%$opts,
388							);
389							} else {
390	11271					21396	$out = $self->clone;
391	11271					18501	$out->seq( $out->_revcom_from_string($out->seq, $out->alphabet) );
392							}
393	11271					22718	return $out;
394							}
395
396
397							sub _revcom_from_string {
398	11337			11337		12025	my ($self, $string, $alphabet) = @_;
399
400							# Check that reverse-complementing makes sense
401	11337	50				16156	if( $alphabet eq 'protein' ) {
402	0					0	$self->throw("Sequence is a protein. Cannot revcom.");
403							}
404	11337	50	66			19250	if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
405	0					0	my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
406							"but unsure if this is right.";
407	0	0				0	if( $self->can('warn') ) {
408	0					0	$self->warn($msg);
409							} else {
410	0					0	warn("[$self] $msg");
411							}
412							}
413
414							# If sequence is RNA, map to DNA (then map back later)
415	11337	100				14300	if( $alphabet eq 'rna' ) {
416	1					3	$string =~ tr/uU/tT/;
417							}
418
419							# Reverse-complement now
420	11337					13053	$string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
421	11337					11413	$string = CORE::reverse $string;
422
423							# Map back RNA to DNA
424	11337	100				13465	if( $alphabet eq 'rna' ) {
425	1					1	$string =~ tr/tT/uU/;
426							}
427
428	11337					18972	return $string;
429							}
430
431
432							=head2 trunc
433
434							Title : trunc
435							Usage : $subseq = $myseq->trunc(10,100);
436							Function: Provides a truncation of a sequence.
437							Returns : A fresh Bio::PrimarySeqI implementing object.
438							Args : Two integers denoting first and last base of the sub-sequence.
439
440
441							=cut
442
443							sub trunc {
444	320			320	1	357	my ($self,$start,$end) = @_;
445
446	320					249	my $str;
447	320	100	66			1631	if( defined $start && ref($start) &&
		50	66
		50
448							$start->isa('Bio::LocationI') ) {
449	2					4	$str = $self->subseq($start); # start is a location actually
450							} elsif( !$end ) {
451	0					0	$self->throw("trunc start,end -- there was no end for $start");
452							} elsif( $end < $start ) {
453	0					0	my $msg = "start [$start] is greater than end [$end]. \n".
454							"If you want to truncated and reverse complement, \n".
455							"you must call trunc followed by revcom. Sorry.";
456	0					0	$self->throw($msg);
457							} else {
458	318					580	$str = $self->subseq($start,$end);
459							}
460
461							# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
462							# or 'Bio::Seq::LargeSeq', if not take advantage of
463							# Bio::Root::clone to get an object copy
464	320					271	my $out;
465	320	100	66			3447	if ( $self->isa('Bio::Seq::LargePrimarySeq')
			100
466							or $self->isa('Bio::Seq::LargeSeq')
467							or $self->isa('Bio::Seq::RichSeq')
468							) {
469	8					25	my ($seqclass, $opts) = $self->_setup_class;
470	8					29	$out = $seqclass->new(
471							-seq => $str,
472							-is_circular => $self->is_circular,
473							-display_id => $self->display_id,
474							-accession_number => $self->accession_number,
475							-alphabet => $self->alphabet,
476							-desc => $self->desc,
477							-verbose => $self->verbose,
478							%$opts,
479							);
480							} else {
481	312					702	$out = $self->clone;
482	312					606	$out->seq($str);
483							}
484	320					623	return $out;
485							}
486
487
488							=head2 translate
489
490							Title : translate
491							Usage : $protein_seq_obj = $dna_seq_obj->translate
492
493							Or if you expect a complete coding sequence (CDS) translation,
494							with initiator at the beginning and terminator at the end:
495
496							$protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
497
498							Or if you want translate() to find the first initiation
499							codon and return the corresponding protein:
500
501							$protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
502
503							Function: Provides the translation of the DNA sequence using full
504							IUPAC ambiguities in DNA/RNA and amino acid codes.
505
506							The complete CDS translation is identical to EMBL/TREMBL
507							database translation. Note that the trailing terminator
508							character is removed before returning the translated protein
509							object.
510
511							Note: if you set $dna_seq_obj->verbose(1) you will get a
512							warning if the first codon is not a valid initiator.
513
514							Returns : A Bio::PrimarySeqI implementing object
515							Args : -terminator
516							character for terminator, default '*'
517							-unknown
518							character for unknown, default 'X'
519							-frame
520							positive integer frame shift (in bases), default 0
521							-codontable_id
522							integer codon table id, default 1
523							-complete
524							boolean, if true, complete CDS is expected. default false
525							-complete_codons
526							boolean, if true, codons which are incomplete are translated if a
527							suitable amino acid is found. For instance, if the incomplete
528							codon is 'GG', the completed codon is 'GGN', which is glycine
529							(G). Defaults to 'false'; setting '-complete' also makes this
530							true.
531							-throw
532							boolean, throw exception if ORF not complete, default false
533							-orf
534							if 'longest', find longest ORF. other true value, find
535							first ORF. default 0
536							-codontable
537							optional L object to use for
538							translation
539							-start
540							optional three-character string to force as initiation
541							codon (e.g. 'atg'). If unset, start codons are
542							determined by the CodonTable. Case insensitive.
543							-offset
544							optional positive integer offset for fuzzy locations.
545							if set, must be either 1, 2, or 3
546
547							=head3 Notes
548
549							The -start argument only applies when -orf is set to 1. By default all
550							initiation codons found in the given codon table are used but when
551							"start" is set to some codon this codon will be used exclusively as
552							the initiation codon. Note that the default codon table (NCBI
553							"Standard") has 3 initiation codons!
554
555							By default translate() translates termination codons to the some
556							character (default is *), both internal and trailing codons. Setting
557							"-complete" to 1 tells translate() to remove the trailing character.
558
559							-offset is used for seqfeatures which contain the the \codon_start tag
560							and can be set to 1, 2, or 3. This is the offset by which the
561							sequence translation starts relative to the first base of the feature
562
563							For details on codon tables used by translate() see L.
564
565							Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
566							element in an array:
567
568							1: character for terminator (optional), defaults to '*'.
569							2: character for unknown amino acid (optional), defaults to 'X'.
570							3: frame (optional), valid values are 0, 1, 2, defaults to 0.
571							4: codon table id (optional), defaults to 1.
572							5: complete coding sequence expected, defaults to 0 (false).
573							6: boolean, throw exception if not complete coding sequence
574							(true), defaults to warning (false)
575							7: codontable, a custom Bio::Tools::CodonTable object (optional).
576
577							=cut
578
579							sub translate {
580	287			287	1	1213	my ($self,@args) = @_;
581	287					265	my ($terminator, $unknown, $frame, $codonTableId, $complete,
582							$complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
583
584							## new API with named parameters, post 1.5.1
585	287	100	100			758	if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
586	24					88	($terminator, $unknown, $frame, $codonTableId, $complete,
587							$complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
588							$self->_rearrange([qw(TERMINATOR
589							UNKNOWN
590							FRAME
591							CODONTABLE_ID
592							COMPLETE
593							COMPLETE_CODONS
594							THROW
595							CODONTABLE
596							ORF
597							START
598							OFFSET)], @args);
599							## old API, 1.5.1 and preceding versions
600							} else {
601	263					359	($terminator, $unknown, $frame, $codonTableId,
602							$complete, $throw, $codonTable, $offset) = @args;
603							}
604
605							## Initialize termination codon, unknown codon, codon table id, frame
606	287	100	66			679	$terminator = '*' unless (defined($terminator) and $terminator ne '');
607	287	100	66			563	$unknown = "X" unless (defined($unknown) and $unknown ne '');
608	287	100	66			556	$frame = 0 unless (defined($frame) and $frame ne '');
609	287	100	66			592	$codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne '');
610	287		100			1197	$complete_codons \|\|= $complete \|\| 0;
			100
611
612							## Get a CodonTable, error if custom CodonTable is invalid
613	287	100				373	if ($codonTable) {
614	1	50				4	$self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
615							unless $codonTable->isa('Bio::Tools::CodonTable');
616							} else {
617
618							# shouldn't this be cached? Seems wasteful to have a new instance
619							# every time...
620	286					1092	$codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
621							}
622
623							## Error if alphabet is "protein"
624	287	50				626	$self->throw("Can't translate an amino acid sequence.") if
625							($self->alphabet =~ /protein/i);
626
627							## Error if -start parameter isn't a valid codon
628	287	100				448	if ($start_codon) {
629	1	50				3	$self->throw("Invalid start codon: $start_codon.") if
630							( $start_codon !~ /^[A-Z]{3}$/i );
631							}
632
633	287					241	my $seq;
634	287	50				413	if ($offset) {
635	0	0				0	$self->throw("Offset must be 1, 2, or 3.") if
636							( $offset !~ /^[123]$/ );
637	0					0	my ($start, $end) = ($offset, $self->length);
638	0					0	($seq) = $self->subseq($start, $end);
639							} else {
640	287					505	($seq) = $self->seq();
641							}
642
643							## ignore frame if an ORF is supposed to be found
644	287	100				459	if ( $orf ) {
645	12	100				36	my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
646	11					24	$seq = $self->_orf_sequence( $seq, $orf_region );
647							} else {
648							## use frame, error if frame is not 0, 1 or 2
649	275	50	100			651	$self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
			66
650							unless ($frame == 0 or $frame == 1 or $frame == 2);
651	275					583	$seq = substr($seq,$frame);
652							}
653
654							## Translate it
655	286					683	my $output = $codonTable->translate($seq, $complete_codons);
656							# Use user-input terminator/unknown
657	286					1038	$output =~ s/\*/$terminator/g;
658	286					406	$output =~ s/X/$unknown/g;
659
660							## Only if we are expecting to translate a complete coding region
661	286	100				443	if ($complete) {
662	9					19	my $id = $self->display_id;
663							# remove the terminator character
664	9	50				16	if( substr($output,-1,1) eq $terminator ) {
665	9					11	chop $output;
666							} else {
667	0	0				0	$throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
668	0					0	$self->warn("Seq [$id]: Not using a valid terminator codon!");
669							}
670							# test if there are terminator characters inside the protein sequence!
671	9	100				48	if ($output =~ /\Q$terminator\E/) {
672	2		100			8	$id \|\|= '';
673	2	50				14	$throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
674	0					0	$self->warn("Seq [$id]: Terminator codon inside CDS!");
675							}
676							# if the initiator codon is not ATG, the amino acid needs to be changed to M
677	7	100				15	if ( substr($output,0,1) ne 'M' ) {
678	3	50				10	if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
		0
679	3					4	$output = 'M'. substr($output,1);
680							} elsif ($throw) {
681	0					0	$self->throw("Seq [$id]: Not using a valid initiator codon!");
682							} else {
683	0					0	$self->warn("Seq [$id]: Not using a valid initiator codon!");
684							}
685							}
686							}
687
688							# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
689							# or 'Bio::Seq::LargeSeq', if not take advantage of
690							# Bio::Root::clone to get an object copy
691	284					244	my $out;
692	284	100	100			3023	if ( $self->isa('Bio::Seq::LargePrimarySeq')
693							or $self->isa('Bio::Seq::LargeSeq')
694							) {
695	3					11	my ($seqclass, $opts) = $self->_setup_class;
696	3					12	$out = $seqclass->new(
697							-seq => $output,
698							-is_circular => $self->is_circular,
699							-display_id => $self->display_id,
700							-accession_number => $self->accession_number,
701							-alphabet => 'protein',
702							-desc => $self->desc,
703							-verbose => $self->verbose,
704							%$opts,
705							);
706							} else {
707	281					698	$out = $self->clone;
708	281					706	$out->seq($output);
709	281					519	$out->alphabet('protein');
710							}
711	284					1014	return $out;
712							}
713
714
715							=head2 transcribe()
716
717							Title : transcribe
718							Usage : $xseq = $seq->transcribe
719							Function: Convert base T to base U
720							Returns : PrimarySeqI object of alphabet 'rna' or
721							undef if $seq->alphabet ne 'dna'
722							Args :
723
724							=cut
725
726							sub transcribe {
727	2			2	1	3	my $self = shift;
728	2	100				4	return unless $self->alphabet eq 'dna';
729	1					2	my $s = $self->seq;
730	1					2	$s =~ tr/tT/uU/;
731	1		50			1	my $desc = $self->desc \|\| '';
732
733							# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
734							# or 'Bio::Seq::LargeSeq', if not take advantage of
735							# Bio::Root::clone to get an object copy
736	1					2	my $out;
737	1	50	33			10	if ( $self->isa('Bio::Seq::LargePrimarySeq')
738							or $self->isa('Bio::Seq::LargeSeq')
739							) {
740	0					0	my ($seqclass, $opts) = $self->_setup_class;
741	0					0	$out = $seqclass->new(
742							-seq => $s,
743							-is_circular => $self->is_circular,
744							-display_id => $self->display_id,
745							-accession_number => $self->accession_number,
746							-alphabet => 'rna',
747							-desc => "${desc}[TRANSCRIBED]",
748							-verbose => $self->verbose,
749							%$opts,
750							);
751							} else {
752	1					3	$out = $self->clone;
753	1					3	$out->seq($s);
754	1					3	$out->alphabet('rna');
755	1					3	$out->desc($desc . "[TRANSCRIBED]");
756							}
757	1					4	return $out;
758							}
759
760
761							=head2 rev_transcribe()
762
763							Title : rev_transcribe
764							Usage : $rtseq = $seq->rev_transcribe
765							Function: Convert base U to base T
766							Returns : PrimarySeqI object of alphabet 'dna' or
767							undef if $seq->alphabet ne 'rna'
768							Args :
769
770							=cut
771
772							sub rev_transcribe {
773	1			1	1	2	my $self = shift;
774	1	50				3	return unless $self->alphabet eq 'rna';
775	1					2	my $s = $self->seq;
776	1					2	$s =~ tr/uU/tT/;
777	1		50			3	my $desc = $self->desc \|\| '';
778
779							# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
780							# or 'Bio::Seq::LargeSeq', if not take advantage of
781							# Bio::Root::clone to get an object copy
782	1					1	my $out;
783	1	50	33			10	if ( $self->isa('Bio::Seq::LargePrimarySeq')
784							or $self->isa('Bio::Seq::LargeSeq')
785							) {
786	0					0	my ($seqclass, $opts) = $self->_setup_class;
787	0					0	$out = $seqclass->new(
788							-seq => $s,
789							-is_circular => $self->is_circular,
790							-display_id => $self->display_id,
791							-accession_number => $self->accession_number,
792							-alphabet => 'dna',
793							-desc => $self->desc . "[REVERSE TRANSCRIBED]",
794							-verbose => $self->verbose,
795							%$opts,
796							);
797							} else {
798	1					4	$out = $self->clone;
799	1					2	$out->seq($s);
800	1					2	$out->alphabet('dna');
801	1					4	$out->desc($desc . "[REVERSE TRANSCRIBED]");
802							}
803	1					3	return $out;
804							}
805
806
807							=head2 id
808
809							Title : id
810							Usage : $id = $seq->id()
811							Function: ID of the sequence. This should normally be (and actually is in
812							the implementation provided here) just a synonym for display_id().
813							Returns : A string.
814							Args :
815
816							=cut
817
818							sub id {
819	1			1	1	1	my ($self)= @_;
820	1					3	return $self->display_id();
821							}
822
823
824							=head2 length
825
826							Title : length
827							Usage : $len = $seq->length()
828							Function:
829							Returns : Integer representing the length of the sequence.
830							Args :
831
832							=cut
833
834							sub length {
835	0			0	1	0	my ($self)= @_;
836	0					0	$self->throw_not_implemented();
837							}
838
839
840							=head2 desc
841
842							Title : desc
843							Usage : $seq->desc($newval);
844							$description = $seq->desc();
845							Function: Get/set description text for a seq object
846							Returns : Value of desc
847							Args : newvalue (optional)
848
849							=cut
850
851							sub desc {
852	0			0	1	0	shift->throw_not_implemented();
853							}
854
855
856							=head2 is_circular
857
858							Title : is_circular
859							Usage : if( $obj->is_circular) { # Do something }
860							Function: Returns true if the molecule is circular
861							Returns : Boolean value
862							Args : none
863
864							=cut
865
866							sub is_circular {
867	0			0	1	0	shift->throw_not_implemented;
868							}
869
870
871							=head1 Private functions
872
873							These are some private functions for the PrimarySeqI interface. You do not
874							need to implement these functions
875
876							=head2 _find_orfs_nucleotide
877
878							Title : _find_orfs_nucleotide
879							Usage :
880							Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
881							The ORF is not required to have a termination codon.
882							Example :
883							Returns : a list of string coordinates of ORF locations (0-based half-open),
884							sorted descending by length (so that the longest is first)
885							as: [ start, end, frame, length ], [ start, end, frame, length ], ...
886							Args : Nucleotide sequence,
887							CodonTable object,
888							(optional) alternative initiation codon (e.g. 'ATA'),
889							(optional) boolean that, if true, stops after finding the
890							first available ORF
891
892							=cut
893
894							sub _find_orfs_nucleotide {
895	14			14		19	my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
896	14					18	$sequence = uc $sequence;
897	14	100				19	$start_codon = uc $start_codon if $start_codon;
898
899							my $is_start = $start_codon
900	12			12		27	? sub { shift eq $start_codon }
901	14	100		697		42	: sub { $codon_table->is_start_codon( shift ) };
	697					795
902
903							# stores the begin index of the currently-running ORF in each
904							# reading frame
905	14					22	my @current_orf_start = (-1,-1,-1);
906
907							#< stores coordinates of longest observed orf (so far) in each
908							# reading frame
909	14					8	my @orfs;
910
911							# go through each base of the sequence, and each reading frame for each base
912	14					9	my $seqlen = CORE::length $sequence;
913	14					11	my @start_frame_order;
914	14					28	for( my $j = 0; $j <= $seqlen-3; $j++ ) {
915	1239					928	my $frame = $j % 3;
916
917	1239					973	my $this_codon = substr( $sequence, $j, 3 );
918
919							# if in an orf and this is either a stop codon or the last in-frame codon in the string
920	1239	100				1396	if ( $current_orf_start[$frame] >= 0 ) {
		100
921	530	100	100			600	if ( $codon_table->is_ter_codon( $this_codon ) \|\|( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
922							# record ORF start, end (half-open), length, and frame
923	44					63	my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
924	44					47	my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
925
926	44	100	100			95	if ($first_only && $frame == $start_frame_order[0]) {
927	10	100				20	$self->warn( "Translating partial ORF "
928							.$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
929							.' from end of nucleotide sequence'
930							) if $is_last_codon_in_frame;
931	9					33	return \@this_orf;
932							}
933	34					30	push @orfs, \@this_orf;
934	34					73	$current_orf_start[$frame] = -1;
935							}
936							}
937							# if this is a start codon
938							elsif ( $is_start->($this_codon) ) {
939	44					29	$current_orf_start[$frame] = $j;
940	44					80	push @start_frame_order, $frame;
941							}
942							}
943
944	4					20	return sort { $b->[2] <=> $a->[2] } @orfs;
	88					73
945							}
946
947
948							sub _truncate_seq {
949	4			4		5	my ($self, $seq) = @_;
950	4	50				16	return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
951							}
952
953
954							sub _orf_sequence {
955	15			15		15	my ($self, $seq, $orf ) = @_;
956	15	50				22	return '' unless $orf;
957	15					39	return substr( $seq, $orf->[0], $orf->[2] )
958							}
959
960
961							=head2 _attempt_to_load_Seq
962
963							Title : _attempt_to_load_Seq
964							Usage :
965							Function:
966							Example :
967							Returns :
968							Args :
969
970							=cut
971
972							sub _attempt_to_load_Seq {
973	0			0		0	my ($self) = @_;
974
975	0	0				0	if( $main::{'Bio::PrimarySeq'} ) {
976	0					0	return 1;
977							} else {
978	0					0	eval {
979	0					0	require Bio::PrimarySeq;
980							};
981	0	0				0	if( $@ ) {
982	0					0	my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
983							"This indicates that you are using Bio::PrimarySeqI ".
984							"without Bio::PrimarySeq loaded or without providing a ".
985							"complete implementation.\nThe most likely problem is that there ".
986							"has been a misconfiguration of the bioperl environment\n".
987							"Actual exception:\n\n";
988	0					0	$self->throw("$text$@\n");
989	0					0	return 0;
990							}
991	0					0	return 1;
992							}
993							}
994
995
996							sub _setup_class {
997							# Return name of class and setup some default parameters
998	11			11		11	my ($self) = @_;
999	11					11	my $seqclass;
1000	11	50				28	if ($self->can_call_new()) {
1001	11					17	$seqclass = ref($self);
1002							} else {
1003	0					0	$seqclass = 'Bio::PrimarySeq';
1004	0					0	$self->_attempt_to_load_Seq();
1005							}
1006	11					7	my %opts;
1007	11	50				21	if ($seqclass eq 'Bio::PrimarySeq') {
1008							# Since sequence is in a Seq object, it has already been validated.
1009							# We do not need to validate its trunc(), revcom(), etc
1010	0					0	$opts{ -direct } = 1;
1011							}
1012	11					19	return $seqclass, \%opts;
1013							}
1014
1015
1016							1;