File Coverage

Bio/PrimarySeq.pm
Criterion Covered Total %
statement 172 183 93.9
branch 121 130 93.0
condition 28 32 87.5
subroutine 24 25 96.0
pod 19 20 95.0
total 364 390 93.3


line stmt bran cond sub pod time code
1             #
2             # bioperl module for Bio::PrimarySeq
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             #
8             # Copyright Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::PrimarySeq - Bioperl lightweight sequence object
17              
18             =head1 SYNOPSIS
19              
20             # Bio::SeqIO for file reading, Bio::DB::GenBank for
21             # database reading
22              
23             use Bio::Seq;
24             use Bio::SeqIO;
25             use Bio::DB::GenBank;
26              
27             # make from memory
28              
29             $seqobj = Bio::PrimarySeq->new (
30             -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
31             -id => 'GeneFragment-12',
32             -accession_number => 'X78121',
33             -alphabet => 'dna',
34             -is_circular => 1,
35             );
36             print "Sequence ", $seqobj->id(), " with accession ",
37             $seqobj->accession_number, "\n";
38              
39             # read from file
40              
41             $inputstream = Bio::SeqIO->new(
42             -file => "myseq.fa",
43             -format => 'Fasta',
44             );
45             $seqobj = $inputstream->next_seq();
46             print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
47              
48             # to get out parts of the sequence.
49              
50             print "Sequence ", $seqobj->id(), " with accession ",
51             $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
52              
53             $string = $seqobj->seq();
54             $string2 = $seqobj->subseq(1,40);
55              
56             =head1 DESCRIPTION
57              
58             PrimarySeq is a lightweight sequence object, storing the sequence, its
59             name, a computer-useful unique name, and other fundamental attributes.
60             It does not contain sequence features or other information. To have a
61             sequence with sequence features you should use the Seq object which uses
62             this object.
63              
64             Although new users will use Bio::PrimarySeq a lot, in general you will
65             be using it from the Bio::Seq object. For more information on Bio::Seq
66             see L. For interest you might like to know that
67             Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
68             to do with sequence to it (the has-a relationship lets us get out of a
69             otherwise nasty cyclical reference in Perl which would leak memory).
70              
71             Sequence objects are defined by the Bio::PrimarySeqI interface, and this
72             object is a pure Perl implementation of the interface. If that's
73             gibberish to you, don't worry. The take home message is that this
74             object is the bioperl default sequence object, but other people can
75             use their own objects as sequences if they so wish. If you are
76             interested in wrapping your own objects as compliant Bioperl sequence
77             objects, then you should read the Bio::PrimarySeqI documentation
78              
79             The documentation of this object is a merge of the Bio::PrimarySeq and
80             Bio::PrimarySeqI documentation. This allows all the methods which you can
81             call on sequence objects here.
82              
83             =head1 FEEDBACK
84              
85             =head2 Mailing Lists
86              
87             User feedback is an integral part of the evolution of this and other
88             Bioperl modules. Send your comments and suggestions preferably to one
89             of the Bioperl mailing lists. Your participation is much appreciated.
90              
91             bioperl-l@bioperl.org - General discussion
92             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
93              
94             =head2 Support
95              
96             Please direct usage questions or support issues to the mailing list:
97              
98             I
99              
100             rather than to the module maintainer directly. Many experienced and
101             reponsive experts will be able look at the problem and quickly
102             address it. Please include a thorough description of the problem
103             with code and data examples if at all possible.
104              
105             =head2 Reporting Bugs
106              
107             Report bugs to the Bioperl bug tracking system to help us keep track
108             the bugs and their resolution. Bug reports can be submitted via the
109             web:
110              
111             https://github.com/bioperl/bioperl-live/issues
112              
113             =head1 AUTHOR - Ewan Birney
114              
115             Email birney@ebi.ac.uk
116              
117             =head1 APPENDIX
118              
119             The rest of the documentation details each of the object
120             methods. Internal methods are usually preceded with a _
121              
122             =cut
123              
124              
125              
126             package Bio::PrimarySeq;
127              
128 203     203   9205 use strict;
  203         407  
  203         9701  
129              
130             our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
131             our $GAP_SYMBOLS = '-~';
132              
133 203         69635 use base qw(Bio::Root::Root Bio::PrimarySeqI
134 203     203   984 Bio::IdentifiableI Bio::DescribableI);
  203         339  
135              
136              
137             # Setup the allowed values for alphabet()
138             my %valid_type = map {$_, 1} qw( dna rna protein );
139              
140              
141             =head2 new
142              
143             Title : new
144             Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
145             -id => 'human_id',
146             -accession_number => 'AL000012',
147             );
148             Function: Returns a new primary seq object from
149             basic constructors, being a string for the sequence
150             and strings for id and accession_number.
151              
152             Note that you can provide an empty sequence string. However, in
153             this case you MUST specify the type of sequence you wish to
154             initialize by the parameter -alphabet. See alphabet() for possible
155             values.
156             Returns : a new Bio::PrimarySeq object
157             Args : -seq => sequence string
158             -ref_to_seq => ... or reference to a sequence string
159             -display_id => display id of the sequence (locus name)
160             -accession_number => accession number
161             -primary_id => primary id (Genbank id)
162             -version => version number
163             -namespace => the namespace for the accession
164             -authority => the authority for the namespace
165             -description => description text
166             -desc => alias for description
167             -alphabet => skip alphabet guess and set it to dna, rna or protein
168             -id => alias for display id
169             -is_circular => boolean to indicate that sequence is circular
170             -direct => boolean to directly set sequences. The next time -seq,
171             seq() or -ref_to_seq is use, the sequence will not be
172             validated. Be careful with this...
173             -nowarnonempty => boolean to avoid warning when sequence is empty
174              
175             =cut
176              
177             sub new {
178 14902     14902 1 52355 my ($class, @args) = @_;
179 14902         35553 my $self = $class->SUPER::new(@args);
180 14902         63847 my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
181             $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
182             $nowarnonempty) =
183             $self->_rearrange([qw(SEQ
184             DISPLAY_ID
185             ACCESSION_NUMBER
186             PRIMARY_ID
187             NAMESPACE
188             AUTHORITY
189             VERSION
190             OBJECT_ID
191             DESC
192             DESCRIPTION
193             ALPHABET
194             ID
195             IS_CIRCULAR
196             DIRECT
197             REF_TO_SEQ
198             LENGTH
199             NOWARNONEMPTY
200             )],
201             @args);
202              
203             # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
204 14902         40571 $self->{'_nowarnonempty'} = $nowarnonempty;
205 14902         21600 $self->{'_direct'} = $direct;
206              
207 14902 100 100     31238 if( defined $id && defined $given_id ) {
208 6 50       24 if( $id ne $given_id ) {
209 0         0 $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
210             }
211             }
212 14902 100       24895 if( defined $given_id ) { $id = $given_id; }
  11743         16264  
213              
214             # Bernd's idea: set ids now for more informative invalid sequence messages
215 14902 100       38606 defined $id && $self->display_id($id);
216 14902 100       23366 $acc && $self->accession_number($acc);
217 14902 100       21643 defined $pid && $self->primary_id($pid);
218              
219             # Set alphabet now to avoid guessing it later, when sequence is set
220 14902 100       33192 $alphabet && $self->alphabet($alphabet);
221              
222             # Set the length before the seq. If there is a seq, length will be updated later
223 14902   100     38898 $self->{'length'} = $len || 0;
224              
225             # Set the sequence (but also alphabet and length)
226 14902 100       23413 if ($ref_to_seq) {
227 1         3 $self->_set_seq_by_ref($ref_to_seq, $alphabet);
228             } else {
229 14901 100       22472 if (defined $seq) {
230             # Note: the sequence string may be empty
231 14189         22432 $self->seq($seq);
232             }
233             }
234              
235 14900 100       28823 $desc && $self->desc($desc);
236 14900 100       21908 $description && $self->description($description);
237 14900 100       21629 $ns && $self->namespace($ns);
238 14900 100       21017 $auth && $self->authority($auth);
239             # Any variable that can have a value "0" must be tested with defined
240             # or it will fail to be added to the new object
241 14900 100       22094 defined($v) && $self->version($v);
242 14900 50       23177 defined($oid) && $self->object_id($oid);
243 14900 100       21161 defined($is_circular) && $self->is_circular($is_circular);
244              
245 14900         46894 return $self;
246             }
247              
248              
249             =head2 seq
250              
251             Title : seq
252             Usage : $string = $seqobj->seq();
253             Function: Get or set the sequence as a string of letters. The case of
254             the letters is left up to the implementer. Suggested cases are
255             upper case for proteins and lower case for DNA sequence (IUPAC
256             standard), but you should not rely on this. An error is thrown if
257             the sequence contains invalid characters: see validate_seq().
258             Returns : A scalar
259             Args : - Optional new sequence value (a string) to set
260             - Optional alphabet (it is guessed by default)
261              
262             =cut
263              
264             sub seq {
265 168331     168331 1 253516 my ($self, @args) = @_;
266              
267 168331 100       233023 if( scalar @args == 0 ) {
268 140950         342302 return $self->{'seq'};
269             }
270              
271 27381         40587 my ($seq_str, $alphabet) = @args;
272 27381 50       41124 if (@args) {
273 27381         44455 $self->_set_seq_by_ref(\$seq_str, $alphabet);
274             }
275              
276 27378         44649 return $self->{'seq'};
277             }
278              
279              
280             sub _set_seq_by_ref {
281             # Set a sequence by reference. A reference is used to avoid the cost of
282             # copying the sequence (which can be very large) between functions.
283 27382     27382   37124 my ($self, $seq_str_ref, $alphabet) = @_;
284              
285             # Validate sequence if sequence is not empty and we are not in direct mode
286 27382 100 100     82370 if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
287 27210         45989 $self->validate_seq($$seq_str_ref, 1);
288             }
289 27379         36974 delete $self->{'_direct'}; # next sequence will have to be validated
290              
291             # Record sequence length
292 27379   100     50196 my $len = CORE::length($$seq_str_ref || '');
293 27379   100     59229 my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
294             # Note: if the new seq is empty or undef, this is not considered a change
295 27379 100       43842 delete $self->{'_freeze_length'} if $is_changed_seq;
296 27379 100       46339 $self->{'length'} = $len if not exists $self->{'_freeze_length'};
297              
298             # Set sequence
299 27379         38134 $self->{'seq'} = $$seq_str_ref;
300              
301             # Set or guess alphabet
302 27379 100 100     65173 if ($alphabet) {
    100          
303             # Alphabet specified, set it no matter what
304 18         31 $self->alphabet($alphabet);
305             } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
306             # If we changed a previous sequence to a new one or if there is no
307             # alphabet yet at all, we need to guess the (possibly new) alphabet
308 15706         25638 $self->_guess_alphabet();
309             } # else (seq not changed and alphabet was defined) do nothing
310              
311 27379         37368 return 1;
312             }
313              
314              
315             =head2 validate_seq
316              
317             Title : validate_seq
318             Usage : if(! $seqobj->validate_seq($seq_str) ) {
319             print "sequence $seq_str is not valid for an object of
320             alphabet ",$seqobj->alphabet, "\n";
321             }
322             Function: Test that the given sequence is valid, i.e. contains only valid
323             characters. The allowed characters are all letters (A-Z) and '-','.',
324             '*','?','=' and '~'. Spaces are not valid. Note that this
325             implementation does not take alphabet() into account and that empty
326             sequences are considered valid.
327             Returns : 1 if the supplied sequence string is valid, 0 otherwise.
328             Args : - Sequence string to be validated
329             - Boolean to optionally throw an error if the sequence is invalid
330              
331             =cut
332              
333             sub validate_seq {
334 23109     23109 1 33264 my ($self, $seqstr, $throw) = @_;
335 23109 100 100     134477 if ( (defined $seqstr ) &&
336             ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
337 8 100       18 if ($throw) {
338 3   50     8 $self->throw("Failed validation of sequence '".(defined($self->id) ||
339             '[unidentified sequence]')."'. Invalid characters were: " .
340             join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
341             }
342 5         25 return 0;
343             }
344 23101         37534 return 1;
345             }
346              
347              
348             =head2 subseq
349              
350             Title : subseq
351             Usage : $substring = $seqobj->subseq(10,40);
352             $substring = $seqobj->subseq(10,40,'nogap');
353             $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
354             $substring = $seqobj->subseq($location_obj);
355             $substring = $seqobj->subseq($location_obj, -nogap => 1);
356             Function: Return the subseq from start to end, where the first sequence
357             character has coordinate 1 number is inclusive, ie 1-2 are the
358             first two characters of the sequence. The given start coordinate
359             has to be larger than the end, even if the sequence is circular.
360             Returns : a string
361             Args : integer for start position
362             integer for end position
363             OR
364             Bio::LocationI location for subseq (strand honored)
365             Specify -NOGAP=>1 to return subseq with gap characters removed
366             Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
367             with $new_subseq in the sequence object
368              
369             =cut
370              
371             sub subseq {
372 10043     10043 1 10712 my $self = shift;
373 10043         12891 my @args = @_;
374 10043         24111 my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
375             END
376             NOGAP
377             REPLACE_WITH)], @args);
378              
379             # If -replace_with is specified, validate the replacement sequence
380 10043 100       18441 if (defined $replace) {
381 2 100       4 $self->validate_seq( $replace ) ||
382             $self->throw("Replacement sequence does not look valid");
383             }
384              
385 10042 100 66     29026 if( ref($start) && $start->isa('Bio::LocationI') ) {
    50 33        
386 52         60 my $loc = $start;
387 52         75 my $seq = '';
388              
389             # For Split objects if Guide Strand is negative,
390             # pass the sublocations in reverse
391 52         59 my $order = 0;
392 52 100       114 if ($loc->isa('Bio::Location::SplitLocationI')) {
393             # guide_strand can return undef, so don't compare directly
394             # to avoid 'uninitialized value' warning
395 44 100       103 my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0;
396 44 100       82 $order = ($guide_strand == -1) ? -1 : 0;
397             }
398             # Reversing order using ->each_Location(-1) does not work well for
399             # cut by origin-splits (like "complement(join(1900..END,START..50))"),
400             # so use "reverse" instead
401 52 100       128 my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location;
402 52         70 foreach my $subloc (@sublocs) {
403 120         237 my $piece = $self->subseq(-start => $subloc->start(),
404             -end => $subloc->end(),
405             -replace_with => $replace,
406             -nogap => $nogap);
407 120 100       210 $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
408              
409             # strand can return undef, so don't compare directly
410             # to avoid 'uninitialized value' warning
411 120 100       236 my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0;
412 120 100       219 if ($strand < 0) {
413 59         98 $piece = $self->_revcom_from_string($piece, $self->alphabet);
414             }
415 120         227 $seq .= $piece;
416             }
417 52         224 return $seq;
418             } elsif( defined $start && defined $end ) {
419 9990 50       13464 if( $start > $end ){
420 0         0 $self->throw("Bad start,end parameters. Start [$start] has to be ".
421             "less than end [$end]");
422             }
423 9990 50       12541 if( $start <= 0 ) {
424 0         0 $self->throw("Bad start parameter ($start). Start must be positive.");
425             }
426              
427             # Remove one from start, and then length is end-start
428 9990         9376 $start--;
429              
430 9990         8557 my $seqstr;
431 9990 100       11248 if (defined $replace) {
432 1         4 $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
433             } else {
434 9989         17443 $seqstr = substr $self->{seq}, $start, $end-$start;
435             }
436              
437              
438 9990 100       14489 if ($end > $self->length) {
439 1 50       2 if ($self->is_circular) {
440 1         2 my $start = 0;
441 1         3 my $end = $end - $self->length;
442              
443 1         2 my $appendstr;
444 1 50       2 if (defined $replace) {
445 0         0 $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
446             } else {
447 1         2 $appendstr = substr $self->{seq}, $start, $end-$start;
448             }
449              
450 1         2 $seqstr .= $appendstr;
451             } else {
452 0         0 $self->throw("Bad end parameter ($end). End must be less than ".
453             "the total length of sequence (total=".$self->length.")")
454             }
455             }
456              
457 9990 100       13400 $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
458 9990         21032 return $seqstr;
459              
460             } else {
461 0         0 $self->warn("Incorrect parameters to subseq - must be two integers or ".
462             "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
463 0         0 return;
464             }
465             }
466              
467              
468             =head2 length
469              
470             Title : length
471             Usage : $len = $seqobj->length();
472             Function: Get the stored length of the sequence in number of symbols (bases
473             or amino acids). In some circumstances, you can also set this attribute:
474              
475             1. For empty sequences, you can set the length to anything you want:
476             my $seqobj = Bio::PrimarySeq->new( -length => 123 );
477             my $len = $seqobj->len; # 123
478             2. To save memory when using very long sequences, you can set the
479             length of the sequence to the length of the sequence (and nothing
480             else):
481             my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
482             # process $seqobj... then after you're done with it
483             $seqobj->length($seqobj->length);
484             $seqobj->seq(undef); # free memory!
485             my $len = $seqobj->len; # 1 Mbp
486              
487             Note that if you set seq() to a value other than undef at any time,
488             the length attribute will be reset.
489             Returns : integer representing the length of the sequence.
490             Args : Optionally, the value on set
491              
492             =cut
493              
494             sub length {
495 30706     30706 1 43613 my ($self, $val) = @_;
496 30706 100       39800 if (defined $val) {
497 5         10 my $len = $self->{'length'};
498 5 100 100     17 if ($len && ($len != $val)) {
499 1         6 $self->throw("Can not set the length to $val, current length value is $len");
500             }
501 4         7 $self->{'length'} = $val;
502 4         8 $self->{'_freeze_length'} = undef;
503             }
504 30705         49670 return $self->{'length'};
505             }
506              
507              
508             =head2 display_id
509              
510             Title : display_id or display_name
511             Usage : $id_string = $seqobj->display_id();
512             Function: Get or set the display id, aka the common name of the sequence object.
513              
514             The semantics of this is that it is the most likely string to
515             be used as an identifier of the sequence, and likely to have
516             "human" readability. The id is equivalent to the ID field of
517             the GenBank/EMBL databanks and the id field of the
518             Swissprot/sptrembl database. In fasta format, the >(\S+) is
519             presumed to be the id, though some people overload the id to
520             embed other information. Bioperl does not use any embedded
521             information in the ID field, and people are encouraged to use
522             other mechanisms (accession field for example, or extending
523             the sequence object) to solve this.
524              
525             With the new Bio::DescribeableI interface, display_name aliases
526             to this method.
527             Returns : A string for the display ID
528             Args : Optional string for the display ID to set
529              
530             =cut
531              
532             sub display_id {
533 23738     23738 1 37244 my ($self, $value) = @_;
534 23738 100       34722 if( defined $value) {
535 13993         21316 $self->{'display_id'} = $value;
536             }
537 23738         42199 return $self->{'display_id'};
538             }
539              
540              
541             =head2 accession_number
542              
543             Title : accession_number or object_id
544             Usage : $unique_key = $seqobj->accession_number;
545             Function: Returns the unique biological id for a sequence, commonly
546             called the accession_number. For sequences from established
547             databases, the implementors should try to use the correct
548             accession number. Notice that primary_id() provides the
549             unique id for the implementation, allowing multiple objects
550             to have the same accession number in a particular implementation.
551              
552             For sequences with no accession number, this method should
553             return "unknown".
554              
555             [Note this method name is likely to change in 1.3]
556              
557             With the new Bio::IdentifiableI interface, this is aliased
558             to object_id
559             Returns : A string
560             Args : A string (optional) for setting
561              
562             =cut
563              
564             sub accession_number {
565 900     900 1 1935 my( $self, $acc ) = @_;
566 900 100       1794 if (defined $acc) {
567 654         1127 $self->{'accession_number'} = $acc;
568             } else {
569 246         417 $acc = $self->{'accession_number'};
570 246 100       557 $acc = 'unknown' unless defined $acc;
571             }
572 900         1772 return $acc;
573             }
574              
575              
576             =head2 primary_id
577              
578             Title : primary_id
579             Usage : $unique_key = $seqobj->primary_id;
580             Function: Returns the unique id for this object in this
581             implementation. This allows implementations to manage their
582             own object ids in a way the implementation can control
583             clients can expect one id to map to one object.
584              
585             For sequences with no natural primary id, this method
586             should return a stringified memory location.
587             Returns : A string
588             Args : A string (optional, for setting)
589              
590             =cut
591              
592             sub primary_id {
593 511     511 1 913 my $self = shift;
594              
595 511 100       1004 if(@_) {
596 453         717 $self->{'primary_id'} = shift;
597             }
598 511 100       1059 if( ! defined($self->{'primary_id'}) ) {
599 21         250 return "$self";
600             }
601 490         667 return $self->{'primary_id'};
602             }
603              
604              
605             =head2 alphabet
606              
607             Title : alphabet
608             Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
609             Function: Get/set the alphabet of sequence, one of
610             'dna', 'rna' or 'protein'. This is case sensitive.
611              
612             This is not called because this would cause
613             upgrade problems from the 0.5 and earlier Seq objects.
614             Returns : a string either 'dna','rna','protein'. NB - the object must
615             make a call of the type - if there is no alphabet specified it
616             has to guess.
617             Args : optional string to set : 'dna' | 'rna' | 'protein'
618              
619              
620             =cut
621              
622             sub alphabet {
623 54742     54742 1 83427 my ($self,$value) = @_;
624 54742 100       78401 if (defined $value) {
625 27849         40535 $value = lc $value;
626 27849 50       49051 unless ( $valid_type{$value} ) {
627 0         0 $self->throw("Alphabet '$value' is not a valid alphabet (".
628             join(',', map "'$_'", sort keys %valid_type) .") lowercase");
629             }
630 27849         39547 $self->{'alphabet'} = $value;
631             }
632 54742         98214 return $self->{'alphabet'};
633             }
634              
635              
636             =head2 desc
637              
638             Title : desc or description
639             Usage : $seqobj->desc($newval);
640             Function: Get/set description of the sequence.
641              
642             'description' is an alias for this for compliance with the
643             Bio::DescribeableI interface.
644             Returns : value of desc (a string)
645             Args : newvalue (a string or undef, optional)
646              
647              
648             =cut
649              
650             sub desc{
651 1711     1711 1 8288 my $self = shift;
652              
653 1711 100       4325 return $self->{'desc'} = shift if @_;
654 454         1637 return $self->{'desc'};
655             }
656              
657              
658             =head2 can_call_new
659              
660             Title : can_call_new
661             Usage :
662             Function:
663             Example :
664             Returns : true
665             Args :
666              
667             =cut
668              
669             sub can_call_new {
670 10     10 1 98 my ($self) = @_;
671              
672 10         26 return 1;
673             }
674              
675              
676             =head2 id
677              
678             Title : id
679             Usage : $id = $seqobj->id();
680             Function: This is mapped on display_id
681             Example :
682             Returns :
683             Args :
684              
685             =cut
686              
687             sub id {
688 9010     9010 1 19459 return shift->display_id(@_);
689             }
690              
691              
692             =head2 is_circular
693              
694             Title : is_circular
695             Usage : if( $seqobj->is_circular) { # Do something }
696             Function: Returns true if the molecule is circular
697             Returns : Boolean value
698             Args : none
699              
700             =cut
701              
702             sub is_circular{
703 143408     143408 1 140853 my $self = shift;
704 143408 100       189359 return $self->{'is_circular'} = shift if @_;
705 143382         272558 return $self->{'is_circular'};
706             }
707              
708              
709             =head1 Methods for Bio::IdentifiableI compliance
710              
711             =head2 object_id
712              
713             Title : object_id
714             Usage : $string = $seqobj->object_id();
715             Function: Get or set a string which represents the stable primary identifier
716             in this namespace of this object. For DNA sequences this
717             is its accession_number, similarly for protein sequences.
718              
719             This is aliased to accession_number().
720             Returns : A scalar
721             Args : Optional object ID to set.
722              
723             =cut
724              
725             sub object_id {
726 4     4 1 10 return shift->accession_number(@_);
727             }
728              
729              
730             =head2 version
731              
732             Title : version
733             Usage : $version = $seqobj->version();
734             Function: Get or set a number which differentiates between versions of
735             the same object. Higher numbers are considered to be
736             later and more relevant, but a single object described
737             the same identifier should represent the same concept.
738             Returns : A number
739             Args : Optional version to set.
740              
741             =cut
742              
743             sub version{
744 3582     3582 1 4940 my ($self,$value) = @_;
745 3582 100       5374 if( defined $value) {
746 292         761 $self->{'_version'} = $value;
747             }
748 3582         6760 return $self->{'_version'};
749             }
750              
751              
752             =head2 authority
753              
754             Title : authority
755             Usage : $authority = $seqobj->authority();
756             Function: Get or set a string which represents the organisation which
757             granted the namespace, written as the DNS name of the
758             organisation (eg, wormbase.org).
759             Returns : A scalar
760             Args : Optional authority to set.
761              
762             =cut
763              
764             sub authority {
765 91     91 1 111 my ($self, $value) = @_;
766 91 100       117 if( defined $value) {
767 86         110 $self->{'authority'} = $value;
768             }
769 91         118 return $self->{'authority'};
770             }
771              
772              
773             =head2 namespace
774              
775             Title : namespace
776             Usage : $string = $seqobj->namespace();
777             Function: Get or set a string representing the name space this identifier
778             is valid in, often the database name or the name describing the
779             collection.
780             Returns : A scalar
781             Args : Optional namespace to set.
782              
783             =cut
784              
785             sub namespace{
786 527     527 1 847 my ($self,$value) = @_;
787 527 100       837 if( defined $value) {
788 493         937 $self->{'namespace'} = $value;
789             }
790 527   100     1154 return $self->{'namespace'} || "";
791             }
792              
793              
794             =head1 Methods for Bio::DescribableI compliance
795              
796             This comprises of display_name and description.
797              
798             =head2 display_name
799              
800             Title : display_name
801             Usage : $string = $seqobj->display_name();
802             Function: Get or set a string which is what should be displayed to the user.
803             The string should have no spaces (ideally, though a cautious
804             user of this interface would not assume this) and should be
805             less than thirty characters (though again, double checking
806             this is a good idea).
807              
808             This is aliased to display_id().
809             Returns : A string for the display name
810             Args : Optional string for the display name to set.
811              
812             =cut
813              
814             sub display_name {
815 2     2 1 7 return shift->display_id(@_);
816             }
817              
818              
819             =head2 description
820              
821             Title : description
822             Usage : $string = $seqobj->description();
823             Function: Get or set a text string suitable for displaying to the user a
824             description. This string is likely to have spaces, but
825             should not have any newlines or formatting - just plain
826             text. The string should not be greater than 255 characters
827             and clients can feel justified at truncating strings at 255
828             characters for the purposes of display.
829              
830             This is aliased to desc().
831             Returns : A string for the description
832             Args : Optional string for the description to set.
833              
834             =cut
835              
836             sub description {
837 91     91 1 202 return shift->desc(@_);
838             }
839              
840              
841             =head1 Methods Inherited from Bio::PrimarySeqI
842              
843             These methods are available on Bio::PrimarySeq, although they are
844             actually implemented on Bio::PrimarySeqI
845              
846             =head2 revcom
847              
848             Title : revcom
849             Usage : $rev = $seqobj->revcom();
850             Function: Produces a new Bio::SeqI implementing object which
851             is the reversed complement of the sequence. For protein
852             sequences this throws an exception of
853             "Sequence is a protein. Cannot revcom".
854              
855             The id is the same id as the original sequence, and the
856             accession number is also identical. If someone wants to
857             track that this sequence has be reversed, it needs to
858             define its own extensions.
859              
860             To do an inplace edit of an object you can go:
861              
862             $seqobj = $seqobj->revcom();
863              
864             This of course, causes Perl to handle the garbage
865             collection of the old object, but it is roughly speaking as
866             efficient as an inplace edit.
867             Returns : A new (fresh) Bio::SeqI object
868             Args : none
869              
870             =head2 trunc
871              
872             Title : trunc
873             Usage : $subseq = $myseq->trunc(10,100);
874             Function: Provides a truncation of a sequence,
875             Returns : A fresh Bio::SeqI implementing object.
876             Args : Numbers for the start and end positions
877              
878             =head1 Internal methods
879              
880             These are internal methods to PrimarySeq
881              
882             =head2 _guess_alphabet
883              
884             Title : _guess_alphabet
885             Usage :
886             Function: Automatically guess and set the type of sequence: dna, rna, protein
887             or '' if the sequence was empty. This method first removes dots (.),
888             dashes (-) and question marks (?) before guessing the alphabet
889             using the IUPAC conventions for ambiguous residues. Since the DNA and
890             RNA characters are also valid characters for proteins, there is
891             no foolproof way of determining the right alphabet. This is our best
892             guess only!
893             Returns : string 'dna', 'rna', 'protein' or ''.
894             Args : none
895              
896             =cut
897              
898             sub _guess_alphabet {
899 15781     15781   20920 my ($self) = @_;
900             # Guess alphabet
901 15781         25791 my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
902             # Set alphabet unless it is unknown
903 15781 100       39836 $self->alphabet($alphabet) if $alphabet;
904 15781         20107 return $alphabet;
905             }
906              
907              
908             sub _guess_alphabet_from_string {
909             # Get the alphabet from a sequence string
910 18778     18778   26845 my ($self, $str, $nowarnonempty) = @_;
911              
912 18778 100       29371 $nowarnonempty = 0 if not defined $nowarnonempty;
913              
914             # Remove chars that clearly don't denote nucleic or amino acids
915 18778         59226 $str =~ s/[-.?]//gi;
916              
917             # Check for sequences without valid letters
918 18778         19706 my $alphabet;
919 18778         21524 my $total = CORE::length($str);
920 18778 100       28552 if( $total == 0 ) {
921 2 100       5 if (not $nowarnonempty) {
922 1         7 $self->warn("Got a sequence without letters. Could not guess alphabet");
923             }
924 2         5 $alphabet = '';
925             }
926              
927             # Determine alphabet now
928 18778 100       28130 if (not defined $alphabet) {
929 18776 100       35567 if ($str =~ m/[EFIJLOPQXZ]/i) {
930             # Start with a safe method to find proteins.
931             # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
932 1758         2464 $alphabet = 'protein';
933             } else {
934             # Alphabet is unsure, could still be DNA, RNA or protein
935             # DNA and RNA contain mostly A, T, U, G, C and N, but the other
936             # letters they use are also among the 15 valid letters that a
937             # protein sequence can contain at this stage. Make our best guess
938             # based on sequence composition. If it contains over 70% of ACGTUN,
939             # it is likely nucleic.
940 17018 100       40208 if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
941 16011 100       29402 if ( $str =~ m/U/i ) {
942 53         92 $alphabet = 'rna';
943             } else {
944 15958         19898 $alphabet = 'dna';
945             }
946             } else {
947 1007         1435 $alphabet = 'protein';
948             }
949             }
950             }
951              
952 18778         31618 return $alphabet;
953             }
954              
955              
956             ############################################################################
957             # aliases due to name changes or to compensate for our lack of consistency #
958             ############################################################################
959              
960             sub accession {
961 0     0 0   my $self = shift;
962              
963 0           $self->warn(ref($self)."::accession is deprecated, ".
964             "use accession_number() instead");
965 0           return $self->accession_number(@_);
966             }
967              
968             1;