File Coverage

Bio/PrimarySeq.pm
Criterion Covered Total %
statement 172 183 93.9
branch 121 130 93.0
condition 28 32 87.5
subroutine 24 25 96.0
pod 19 20 95.0
total 364 390 93.3


line stmt bran cond sub pod time code
1             #
2             # bioperl module for Bio::PrimarySeq
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             #
8             # Copyright Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::PrimarySeq - Bioperl lightweight sequence object
17              
18             =head1 SYNOPSIS
19              
20             # Bio::SeqIO for file reading, Bio::DB::GenBank for
21             # database reading
22              
23             use Bio::Seq;
24             use Bio::SeqIO;
25             use Bio::DB::GenBank;
26              
27             # make from memory
28              
29             $seqobj = Bio::PrimarySeq->new (
30             -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
31             -id => 'GeneFragment-12',
32             -accession_number => 'X78121',
33             -alphabet => 'dna',
34             -is_circular => 1,
35             );
36             print "Sequence ", $seqobj->id(), " with accession ",
37             $seqobj->accession_number, "\n";
38              
39             # read from file
40              
41             $inputstream = Bio::SeqIO->new(
42             -file => "myseq.fa",
43             -format => 'Fasta',
44             );
45             $seqobj = $inputstream->next_seq();
46             print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
47              
48             # to get out parts of the sequence.
49              
50             print "Sequence ", $seqobj->id(), " with accession ",
51             $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
52              
53             $string = $seqobj->seq();
54             $string2 = $seqobj->subseq(1,40);
55              
56             =head1 DESCRIPTION
57              
58             PrimarySeq is a lightweight sequence object, storing the sequence, its
59             name, a computer-useful unique name, and other fundamental attributes.
60             It does not contain sequence features or other information. To have a
61             sequence with sequence features you should use the Seq object which uses
62             this object.
63              
64             Although new users will use Bio::PrimarySeq a lot, in general you will
65             be using it from the Bio::Seq object. For more information on Bio::Seq
66             see L. For interest you might like to know that
67             Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
68             to do with sequence to it (the has-a relationship lets us get out of a
69             otherwise nasty cyclical reference in Perl which would leak memory).
70              
71             Sequence objects are defined by the Bio::PrimarySeqI interface, and this
72             object is a pure Perl implementation of the interface. If that's
73             gibberish to you, don't worry. The take home message is that this
74             object is the bioperl default sequence object, but other people can
75             use their own objects as sequences if they so wish. If you are
76             interested in wrapping your own objects as compliant Bioperl sequence
77             objects, then you should read the Bio::PrimarySeqI documentation
78              
79             The documentation of this object is a merge of the Bio::PrimarySeq and
80             Bio::PrimarySeqI documentation. This allows all the methods which you can
81             call on sequence objects here.
82              
83             =head1 FEEDBACK
84              
85             =head2 Mailing Lists
86              
87             User feedback is an integral part of the evolution of this and other
88             Bioperl modules. Send your comments and suggestions preferably to one
89             of the Bioperl mailing lists. Your participation is much appreciated.
90              
91             bioperl-l@bioperl.org - General discussion
92             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
93              
94             =head2 Support
95              
96             Please direct usage questions or support issues to the mailing list:
97              
98             I
99              
100             rather than to the module maintainer directly. Many experienced and
101             reponsive experts will be able look at the problem and quickly
102             address it. Please include a thorough description of the problem
103             with code and data examples if at all possible.
104              
105             =head2 Reporting Bugs
106              
107             Report bugs to the Bioperl bug tracking system to help us keep track
108             the bugs and their resolution. Bug reports can be submitted via the
109             web:
110              
111             https://github.com/bioperl/bioperl-live/issues
112              
113             =head1 AUTHOR - Ewan Birney
114              
115             Email birney@ebi.ac.uk
116              
117             =head1 APPENDIX
118              
119             The rest of the documentation details each of the object
120             methods. Internal methods are usually preceded with a _
121              
122             =cut
123              
124              
125              
126             package Bio::PrimarySeq;
127              
128 203     203   8208 use strict;
  203         345  
  203         8859  
129              
130             our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
131             our $GAP_SYMBOLS = '-~';
132              
133 203         76810 use base qw(Bio::Root::Root Bio::PrimarySeqI
134 203     203   643 Bio::IdentifiableI Bio::DescribableI);
  203         261  
135              
136              
137             # Setup the allowed values for alphabet()
138             my %valid_type = map {$_, 1} qw( dna rna protein );
139              
140              
141             =head2 new
142              
143             Title : new
144             Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
145             -id => 'human_id',
146             -accession_number => 'AL000012',
147             );
148             Function: Returns a new primary seq object from
149             basic constructors, being a string for the sequence
150             and strings for id and accession_number.
151              
152             Note that you can provide an empty sequence string. However, in
153             this case you MUST specify the type of sequence you wish to
154             initialize by the parameter -alphabet. See alphabet() for possible
155             values.
156             Returns : a new Bio::PrimarySeq object
157             Args : -seq => sequence string
158             -ref_to_seq => ... or reference to a sequence string
159             -display_id => display id of the sequence (locus name)
160             -accession_number => accession number
161             -primary_id => primary id (Genbank id)
162             -version => version number
163             -namespace => the namespace for the accession
164             -authority => the authority for the namespace
165             -description => description text
166             -desc => alias for description
167             -alphabet => skip alphabet guess and set it to dna, rna or protein
168             -id => alias for display id
169             -is_circular => boolean to indicate that sequence is circular
170             -direct => boolean to directly set sequences. The next time -seq,
171             seq() or -ref_to_seq is use, the sequence will not be
172             validated. Be careful with this...
173             -nowarnonempty => boolean to avoid warning when sequence is empty
174              
175             =cut
176              
177             sub new {
178 14995     14995 1 43767 my ($class, @args) = @_;
179 14995         30934 my $self = $class->SUPER::new(@args);
180 14995         56280 my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
181             $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
182             $nowarnonempty) =
183             $self->_rearrange([qw(SEQ
184             DISPLAY_ID
185             ACCESSION_NUMBER
186             PRIMARY_ID
187             NAMESPACE
188             AUTHORITY
189             VERSION
190             OBJECT_ID
191             DESC
192             DESCRIPTION
193             ALPHABET
194             ID
195             IS_CIRCULAR
196             DIRECT
197             REF_TO_SEQ
198             LENGTH
199             NOWARNONEMPTY
200             )],
201             @args);
202              
203             # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
204 14995         44320 $self->{'_nowarnonempty'} = $nowarnonempty;
205 14995         13084 $self->{'_direct'} = $direct;
206              
207 14995 100 100     30268 if( defined $id && defined $given_id ) {
208 6 50       15 if( $id ne $given_id ) {
209 0         0 $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
210             }
211             }
212 14995 100       21655 if( defined $given_id ) { $id = $given_id; }
  11740         10795  
213              
214             # Bernd's idea: set ids now for more informative invalid sequence messages
215 14995 100       31708 defined $id && $self->display_id($id);
216 14995 100       21859 $acc && $self->accession_number($acc);
217 14995 100       19492 defined $pid && $self->primary_id($pid);
218              
219             # Set alphabet now to avoid guessing it later, when sequence is set
220 14995 100       26411 $alphabet && $self->alphabet($alphabet);
221              
222             # Set the length before the seq. If there is a seq, length will be updated later
223 14995   100     39267 $self->{'length'} = $len || 0;
224              
225             # Set the sequence (but also alphabet and length)
226 14995 100       17746 if ($ref_to_seq) {
227 1         2 $self->_set_seq_by_ref($ref_to_seq, $alphabet);
228             } else {
229 14994 100       20710 if (defined $seq) {
230             # Note: the sequence string may be empty
231 14268         18208 $self->seq($seq);
232             }
233             }
234              
235 14993 100       20662 $desc && $self->desc($desc);
236 14993 100       18166 $description && $self->description($description);
237 14993 100       18220 $ns && $self->namespace($ns);
238 14993 100       18391 $auth && $self->authority($auth);
239             # Any variable that can have a value "0" must be tested with defined
240             # or it will fail to be added to the new object
241 14993 100       19336 defined($v) && $self->version($v);
242 14993 50       17674 defined($oid) && $self->object_id($oid);
243 14993 100       19990 defined($is_circular) && $self->is_circular($is_circular);
244              
245 14993         41106 return $self;
246             }
247              
248              
249             =head2 seq
250              
251             Title : seq
252             Usage : $string = $seqobj->seq();
253             Function: Get or set the sequence as a string of letters. The case of
254             the letters is left up to the implementer. Suggested cases are
255             upper case for proteins and lower case for DNA sequence (IUPAC
256             standard), but you should not rely on this. An error is thrown if
257             the sequence contains invalid characters: see validate_seq().
258             Returns : A scalar
259             Args : - Optional new sequence value (a string) to set
260             - Optional alphabet (it is guessed by default)
261              
262             =cut
263              
264             sub seq {
265 168450     168450 1 185967 my ($self, @args) = @_;
266              
267 168450 100       215391 if( scalar @args == 0 ) {
268 140999         298541 return $self->{'seq'};
269             }
270              
271 27451         26003 my ($seq_str, $alphabet) = @args;
272 27451 50       36294 if (@args) {
273 27451         37508 $self->_set_seq_by_ref(\$seq_str, $alphabet);
274             }
275              
276 27448         38948 return $self->{'seq'};
277             }
278              
279              
280             sub _set_seq_by_ref {
281             # Set a sequence by reference. A reference is used to avoid the cost of
282             # copying the sequence (which can be very large) between functions.
283 27452     27452   22532 my ($self, $seq_str_ref, $alphabet) = @_;
284              
285             # Validate sequence if sequence is not empty and we are not in direct mode
286 27452 100 100     82019 if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
287 27280         34142 $self->validate_seq($$seq_str_ref, 1);
288             }
289 27449         26938 delete $self->{'_direct'}; # next sequence will have to be validated
290              
291             # Record sequence length
292 27449   100     39882 my $len = CORE::length($$seq_str_ref || '');
293 27449   100     52916 my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
294             # Note: if the new seq is empty or undef, this is not considered a change
295 27449 100       35517 delete $self->{'_freeze_length'} if $is_changed_seq;
296 27449 100       39860 $self->{'length'} = $len if not exists $self->{'_freeze_length'};
297              
298             # Set sequence
299 27449         25318 $self->{'seq'} = $$seq_str_ref;
300              
301             # Set or guess alphabet
302 27449 100 100     65829 if ($alphabet) {
    100          
303             # Alphabet specified, set it no matter what
304 18         28 $self->alphabet($alphabet);
305             } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
306             # If we changed a previous sequence to a new one or if there is no
307             # alphabet yet at all, we need to guess the (possibly new) alphabet
308 15779         22449 $self->_guess_alphabet();
309             } # else (seq not changed and alphabet was defined) do nothing
310              
311 27449         25948 return 1;
312             }
313              
314              
315             =head2 validate_seq
316              
317             Title : validate_seq
318             Usage : if(! $seqobj->validate_seq($seq_str) ) {
319             print "sequence $seq_str is not valid for an object of
320             alphabet ",$seqobj->alphabet, "\n";
321             }
322             Function: Test that the given sequence is valid, i.e. contains only valid
323             characters. The allowed characters are all letters (A-Z) and '-','.',
324             '*','?','=' and '~'. Spaces are not valid. Note that this
325             implementation does not take alphabet() into account and that empty
326             sequences are considered valid.
327             Returns : 1 if the supplied sequence string is valid, 0 otherwise.
328             Args : - Sequence string to be validated
329             - Boolean to optionally throw an error if the sequence is invalid
330              
331             =cut
332              
333             sub validate_seq {
334 23098     23098 1 20944 my ($self, $seqstr, $throw) = @_;
335 23098 100 100     135349 if ( (defined $seqstr ) &&
336             ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
337 8 100       14 if ($throw) {
338 3   50     7 $self->throw("Failed validation of sequence '".(defined($self->id) ||
339             '[unidentified sequence]')."'. Invalid characters were: " .
340             join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
341             }
342 5         21 return 0;
343             }
344 23090         26677 return 1;
345             }
346              
347              
348             =head2 subseq
349              
350             Title : subseq
351             Usage : $substring = $seqobj->subseq(10,40);
352             $substring = $seqobj->subseq(10,40,'nogap');
353             $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
354             $substring = $seqobj->subseq($location_obj);
355             $substring = $seqobj->subseq($location_obj, -nogap => 1);
356             Function: Return the subseq from start to end, where the first sequence
357             character has coordinate 1 number is inclusive, ie 1-2 are the
358             first two characters of the sequence. The given start coordinate
359             has to be larger than the end, even if the sequence is circular.
360             Returns : a string
361             Args : integer for start position
362             integer for end position
363             OR
364             Bio::LocationI location for subseq (strand honored)
365             Specify -NOGAP=>1 to return subseq with gap characters removed
366             Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
367             with $new_subseq in the sequence object
368              
369             =cut
370              
371             sub subseq {
372 10039     10039 1 7102 my $self = shift;
373 10039         10179 my @args = @_;
374 10039         20487 my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
375             END
376             NOGAP
377             REPLACE_WITH)], @args);
378              
379             # If -replace_with is specified, validate the replacement sequence
380 10039 100       14972 if (defined $replace) {
381 2 100       4 $self->validate_seq( $replace ) ||
382             $self->throw("Replacement sequence does not look valid");
383             }
384              
385 10038 100 66     33518 if( ref($start) && $start->isa('Bio::LocationI') ) {
    50 33        
386 52         43 my $loc = $start;
387 52         59 my $seq = '';
388              
389             # For Split objects if Guide Strand is negative,
390             # pass the sublocations in reverse
391 52         38 my $order = 0;
392 52 100       114 if ($loc->isa('Bio::Location::SplitLocationI')) {
393             # guide_strand can return undef, so don't compare directly
394             # to avoid 'uninitialized value' warning
395 44 100       72 my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0;
396 44 100       58 $order = ($guide_strand == -1) ? -1 : 0;
397             }
398             # Reversing order using ->each_Location(-1) does not work well for
399             # cut by origin-splits (like "complement(join(1900..END,START..50))"),
400             # so use "reverse" instead
401 52 100       110 my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location;
402 52         53 foreach my $subloc (@sublocs) {
403 120         247 my $piece = $self->subseq(-start => $subloc->start(),
404             -end => $subloc->end(),
405             -replace_with => $replace,
406             -nogap => $nogap);
407 120 100       186 $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
408              
409             # strand can return undef, so don't compare directly
410             # to avoid 'uninitialized value' warning
411 120 100       175 my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0;
412 120 100       159 if ($strand < 0) {
413 59         77 $piece = $self->_revcom_from_string($piece, $self->alphabet);
414             }
415 120         167 $seq .= $piece;
416             }
417 52         201 return $seq;
418             } elsif( defined $start && defined $end ) {
419 9986 50       11744 if( $start > $end ){
420 0         0 $self->throw("Bad start,end parameters. Start [$start] has to be ".
421             "less than end [$end]");
422             }
423 9986 50       11246 if( $start <= 0 ) {
424 0         0 $self->throw("Bad start parameter ($start). Start must be positive.");
425             }
426              
427             # Remove one from start, and then length is end-start
428 9986         6453 $start--;
429              
430 9986         5994 my $seqstr;
431 9986 100       9300 if (defined $replace) {
432 1         3 $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
433             } else {
434 9985         13403 $seqstr = substr $self->{seq}, $start, $end-$start;
435             }
436              
437              
438 9986 100       11534 if ($end > $self->length) {
439 1 50       4 if ($self->is_circular) {
440 1         2 my $start = 0;
441 1         2 my $end = $end - $self->length;
442              
443 1         2 my $appendstr;
444 1 50       2 if (defined $replace) {
445 0         0 $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
446             } else {
447 1         2 $appendstr = substr $self->{seq}, $start, $end-$start;
448             }
449              
450 1         1 $seqstr .= $appendstr;
451             } else {
452 0         0 $self->throw("Bad end parameter ($end). End must be less than ".
453             "the total length of sequence (total=".$self->length.")")
454             }
455             }
456              
457 9986 100       11657 $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
458 9986         16955 return $seqstr;
459              
460             } else {
461 0         0 $self->warn("Incorrect parameters to subseq - must be two integers or ".
462             "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
463 0         0 return;
464             }
465             }
466              
467              
468             =head2 length
469              
470             Title : length
471             Usage : $len = $seqobj->length();
472             Function: Get the stored length of the sequence in number of symbols (bases
473             or amino acids). In some circumstances, you can also set this attribute:
474              
475             1. For empty sequences, you can set the length to anything you want:
476             my $seqobj = Bio::PrimarySeq->new( -length => 123 );
477             my $len = $seqobj->len; # 123
478             2. To save memory when using very long sequences, you can set the
479             length of the sequence to the length of the sequence (and nothing
480             else):
481             my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
482             # process $seqobj... then after you're done with it
483             $seqobj->length($seqobj->length);
484             $seqobj->seq(undef); # free memory!
485             my $len = $seqobj->len; # 1 Mbp
486              
487             Note that if you set seq() to a value other than undef at any time,
488             the length attribute will be reset.
489             Returns : integer representing the length of the sequence.
490             Args : Optionally, the value on set
491              
492             =cut
493              
494             sub length {
495 30692     30692 1 29034 my ($self, $val) = @_;
496 30692 100       36962 if (defined $val) {
497 5         7 my $len = $self->{'length'};
498 5 100 100     18 if ($len && ($len != $val)) {
499 1         6 $self->throw("Can not set the length to $val, current length value is $len");
500             }
501 4         5 $self->{'length'} = $val;
502 4         8 $self->{'_freeze_length'} = undef;
503             }
504 30691         43626 return $self->{'length'};
505             }
506              
507              
508             =head2 display_id
509              
510             Title : display_id or display_name
511             Usage : $id_string = $seqobj->display_id();
512             Function: Get or set the display id, aka the common name of the sequence object.
513              
514             The semantics of this is that it is the most likely string to
515             be used as an identifier of the sequence, and likely to have
516             "human" readability. The id is equivalent to the ID field of
517             the GenBank/EMBL databanks and the id field of the
518             Swissprot/sptrembl database. In fasta format, the >(\S+) is
519             presumed to be the id, though some people overload the id to
520             embed other information. Bioperl does not use any embedded
521             information in the ID field, and people are encouraged to use
522             other mechanisms (accession field for example, or extending
523             the sequence object) to solve this.
524              
525             With the new Bio::DescribeableI interface, display_name aliases
526             to this method.
527             Returns : A string for the display ID
528             Args : Optional string for the display ID to set
529              
530             =cut
531              
532             sub display_id {
533 23861     23861 1 26349 my ($self, $value) = @_;
534 23861 100       30890 if( defined $value) {
535 14090         14600 $self->{'display_id'} = $value;
536             }
537 23861         33891 return $self->{'display_id'};
538             }
539              
540              
541             =head2 accession_number
542              
543             Title : accession_number or object_id
544             Usage : $unique_key = $seqobj->accession_number;
545             Function: Returns the unique biological id for a sequence, commonly
546             called the accession_number. For sequences from established
547             databases, the implementors should try to use the correct
548             accession number. Notice that primary_id() provides the
549             unique id for the implemetation, allowing multiple objects
550             to have the same accession number in a particular implementation.
551              
552             For sequences with no accession number, this method should
553             return "unknown".
554              
555             [Note this method name is likely to change in 1.3]
556              
557             With the new Bio::IdentifiableI interface, this is aliased
558             to object_id
559             Returns : A string
560             Args : A string (optional) for setting
561              
562             =cut
563              
564             sub accession_number {
565 900     900 1 1742 my( $self, $acc ) = @_;
566 900 100       1374 if (defined $acc) {
567 654         902 $self->{'accession_number'} = $acc;
568             } else {
569 246         361 $acc = $self->{'accession_number'};
570 246 100       532 $acc = 'unknown' unless defined $acc;
571             }
572 900         1376 return $acc;
573             }
574              
575              
576             =head2 primary_id
577              
578             Title : primary_id
579             Usage : $unique_key = $seqobj->primary_id;
580             Function: Returns the unique id for this object in this
581             implementation. This allows implementations to manage their
582             own object ids in a way the implementaiton can control
583             clients can expect one id to map to one object.
584              
585             For sequences with no natural primary id, this method
586             should return a stringified memory location.
587             Returns : A string
588             Args : A string (optional, for setting)
589              
590             =cut
591              
592             sub primary_id {
593 505     505 1 774 my $self = shift;
594              
595 505 100       833 if(@_) {
596 447         560 $self->{'primary_id'} = shift;
597             }
598 505 100       910 if( ! defined($self->{'primary_id'}) ) {
599 21         143 return "$self";
600             }
601 484         564 return $self->{'primary_id'};
602             }
603              
604              
605             =head2 alphabet
606              
607             Title : alphabet
608             Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
609             Function: Get/set the alphabet of sequence, one of
610             'dna', 'rna' or 'protein'. This is case sensitive.
611              
612             This is not called because this would cause
613             upgrade problems from the 0.5 and earlier Seq objects.
614             Returns : a string either 'dna','rna','protein'. NB - the object must
615             make a call of the type - if there is no alphabet specified it
616             has to guess.
617             Args : optional string to set : 'dna' | 'rna' | 'protein'
618              
619              
620             =cut
621              
622             sub alphabet {
623 54891     54891 1 46598 my ($self,$value) = @_;
624 54891 100       71144 if (defined $value) {
625 27919         25612 $value = lc $value;
626 27919 50       41353 unless ( $valid_type{$value} ) {
627 0         0 $self->throw("Alphabet '$value' is not a valid alphabet (".
628             join(',', map "'$_'", sort keys %valid_type) .") lowercase");
629             }
630 27919         29053 $self->{'alphabet'} = $value;
631             }
632 54891         85460 return $self->{'alphabet'};
633             }
634              
635              
636             =head2 desc
637              
638             Title : desc or description
639             Usage : $seqobj->desc($newval);
640             Function: Get/set description of the sequence.
641              
642             'description' is an alias for this for compliance with the
643             Bio::DescribeableI interface.
644             Returns : value of desc (a string)
645             Args : newvalue (a string or undef, optional)
646              
647              
648             =cut
649              
650             sub desc{
651 1711     1711 1 7642 my $self = shift;
652              
653 1711 100       4079 return $self->{'desc'} = shift if @_;
654 454         1415 return $self->{'desc'};
655             }
656              
657              
658             =head2 can_call_new
659              
660             Title : can_call_new
661             Usage :
662             Function:
663             Example :
664             Returns : true
665             Args :
666              
667             =cut
668              
669             sub can_call_new {
670 10     10 1 8 my ($self) = @_;
671              
672 10         153 return 1;
673             }
674              
675              
676             =head2 id
677              
678             Title : id
679             Usage : $id = $seqobj->id();
680             Function: This is mapped on display_id
681             Example :
682             Returns :
683             Args :
684              
685             =cut
686              
687             sub id {
688 9032     9032 1 16480 return shift->display_id(@_);
689             }
690              
691              
692             =head2 is_circular
693              
694             Title : is_circular
695             Usage : if( $seqobj->is_circular) { # Do something }
696             Function: Returns true if the molecule is circular
697             Returns : Boolean value
698             Args : none
699              
700             =cut
701              
702             sub is_circular{
703 143388     143388 1 108946 my $self = shift;
704 143388 100       204957 return $self->{'is_circular'} = shift if @_;
705 143362         317482 return $self->{'is_circular'};
706             }
707              
708              
709             =head1 Methods for Bio::IdentifiableI compliance
710              
711             =head2 object_id
712              
713             Title : object_id
714             Usage : $string = $seqobj->object_id();
715             Function: Get or set a string which represents the stable primary identifier
716             in this namespace of this object. For DNA sequences this
717             is its accession_number, similarly for protein sequences.
718              
719             This is aliased to accession_number().
720             Returns : A scalar
721             Args : Optional object ID to set.
722              
723             =cut
724              
725             sub object_id {
726 4     4 1 6 return shift->accession_number(@_);
727             }
728              
729              
730             =head2 version
731              
732             Title : version
733             Usage : $version = $seqobj->version();
734             Function: Get or set a number which differentiates between versions of
735             the same object. Higher numbers are considered to be
736             later and more relevant, but a single object described
737             the same identifier should represent the same concept.
738             Returns : A number
739             Args : Optional version to set.
740              
741             =cut
742              
743             sub version{
744 3582     3582 1 2775 my ($self,$value) = @_;
745 3582 100       4620 if( defined $value) {
746 292         607 $self->{'_version'} = $value;
747             }
748 3582         5437 return $self->{'_version'};
749             }
750              
751              
752             =head2 authority
753              
754             Title : authority
755             Usage : $authority = $seqobj->authority();
756             Function: Get or set a string which represents the organisation which
757             granted the namespace, written as the DNS name of the
758             organisation (eg, wormbase.org).
759             Returns : A scalar
760             Args : Optional authority to set.
761              
762             =cut
763              
764             sub authority {
765 91     91 1 72 my ($self, $value) = @_;
766 91 100       124 if( defined $value) {
767 86         78 $self->{'authority'} = $value;
768             }
769 91         87 return $self->{'authority'};
770             }
771              
772              
773             =head2 namespace
774              
775             Title : namespace
776             Usage : $string = $seqobj->namespace();
777             Function: Get or set a string representing the name space this identifier
778             is valid in, often the database name or the name describing the
779             collection.
780             Returns : A scalar
781             Args : Optional namespace to set.
782              
783             =cut
784              
785             sub namespace{
786 527     527 1 486 my ($self,$value) = @_;
787 527 100       808 if( defined $value) {
788 493         832 $self->{'namespace'} = $value;
789             }
790 527   100     1049 return $self->{'namespace'} || "";
791             }
792              
793              
794             =head1 Methods for Bio::DescribableI compliance
795              
796             This comprises of display_name and description.
797              
798             =head2 display_name
799              
800             Title : display_name
801             Usage : $string = $seqobj->display_name();
802             Function: Get or set a string which is what should be displayed to the user.
803             The string should have no spaces (ideally, though a cautious
804             user of this interface would not assumme this) and should be
805             less than thirty characters (though again, double checking
806             this is a good idea).
807              
808             This is aliased to display_id().
809             Returns : A string for the display name
810             Args : Optional string for the display name to set.
811              
812             =cut
813              
814             sub display_name {
815 2     2 1 6 return shift->display_id(@_);
816             }
817              
818              
819             =head2 description
820              
821             Title : description
822             Usage : $string = $seqobj->description();
823             Function: Get or set a text string suitable for displaying to the user a
824             description. This string is likely to have spaces, but
825             should not have any newlines or formatting - just plain
826             text. The string should not be greater than 255 characters
827             and clients can feel justified at truncating strings at 255
828             characters for the purposes of display.
829              
830             This is aliased to desc().
831             Returns : A string for the description
832             Args : Optional string for the description to set.
833              
834             =cut
835              
836             sub description {
837 91     91 1 171 return shift->desc(@_);
838             }
839              
840              
841             =head1 Methods Inherited from Bio::PrimarySeqI
842              
843             These methods are available on Bio::PrimarySeq, although they are
844             actually implemented on Bio::PrimarySeqI
845              
846             =head2 revcom
847              
848             Title : revcom
849             Usage : $rev = $seqobj->revcom();
850             Function: Produces a new Bio::SeqI implementing object which
851             is the reversed complement of the sequence. For protein
852             sequences this throws an exception of
853             "Sequence is a protein. Cannot revcom".
854              
855             The id is the same id as the orginal sequence, and the
856             accession number is also indentical. If someone wants to
857             track that this sequence has be reversed, it needs to
858             define its own extensions.
859              
860             To do an inplace edit of an object you can go:
861              
862             $seqobj = $seqobj->revcom();
863              
864             This of course, causes Perl to handle the garbage
865             collection of the old object, but it is roughly speaking as
866             efficient as an inplace edit.
867             Returns : A new (fresh) Bio::SeqI object
868             Args : none
869              
870             =head2 trunc
871              
872             Title : trunc
873             Usage : $subseq = $myseq->trunc(10,100);
874             Function: Provides a truncation of a sequence,
875             Returns : A fresh Bio::SeqI implementing object.
876             Args : Numbers for the start and end positions
877              
878             =head1 Internal methods
879              
880             These are internal methods to PrimarySeq
881              
882             =head2 _guess_alphabet
883              
884             Title : _guess_alphabet
885             Usage :
886             Function: Automatically guess and set the type of sequence: dna, rna, protein
887             or '' if the sequence was empty. This method first removes dots (.),
888             dashes (-) and question marks (?) before guessing the alphabet
889             using the IUPAC conventions for ambiguous residues. Since the DNA and
890             RNA characters are also valid characters for proteins, there is
891             no foolproof way of determining the right alphabet. This is our best
892             guess only!
893             Returns : string 'dna', 'rna', 'protein' or ''.
894             Args : none
895              
896             =cut
897              
898             sub _guess_alphabet {
899 15854     15854   14733 my ($self) = @_;
900             # Guess alphabet
901 15854         21922 my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
902             # Set alphabet unless it is unknown
903 15854 100       31906 $self->alphabet($alphabet) if $alphabet;
904 15854         13063 return $alphabet;
905             }
906              
907              
908             sub _guess_alphabet_from_string {
909             # Get the alphabet from a sequence string
910 18851     18851   16739 my ($self, $str, $nowarnonempty) = @_;
911              
912 18851 100       26867 $nowarnonempty = 0 if not defined $nowarnonempty;
913              
914             # Remove chars that clearly don't denote nucleic or amino acids
915 18851         51437 $str =~ s/[-.?]//gi;
916              
917             # Check for sequences without valid letters
918 18851         12531 my $alphabet;
919 18851         14314 my $total = CORE::length($str);
920 18851 100       25459 if( $total == 0 ) {
921 2 100       5 if (not $nowarnonempty) {
922 1         7 $self->warn("Got a sequence without letters. Could not guess alphabet");
923             }
924 2         3 $alphabet = '';
925             }
926              
927             # Determine alphabet now
928 18851 100       24130 if (not defined $alphabet) {
929 18849 100       29041 if ($str =~ m/[EFIJLOPQXZ]/i) {
930             # Start with a safe method to find proteins.
931             # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
932 1762         1760 $alphabet = 'protein';
933             } else {
934             # Alphabet is unsure, could still be DNA, RNA or protein
935             # DNA and RNA contain mostly A, T, U, G, C and N, but the other
936             # letters they use are also among the 15 valid letters that a
937             # protein sequence can contain at this stage. Make our best guess
938             # based on sequence composition. If it contains over 70% of ACGTUN,
939             # it is likely nucleic.
940 17087 100       33320 if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
941 16080 100       24589 if ( $str =~ m/U/i ) {
942 53         76 $alphabet = 'rna';
943             } else {
944 16027         15897 $alphabet = 'dna';
945             }
946             } else {
947 1007         1205 $alphabet = 'protein';
948             }
949             }
950             }
951              
952 18851         23795 return $alphabet;
953             }
954              
955              
956             ############################################################################
957             # aliases due to name changes or to compensate for our lack of consistency #
958             ############################################################################
959              
960             sub accession {
961 0     0 0   my $self = shift;
962              
963 0           $self->warn(ref($self)."::accession is deprecated, ".
964             "use accession_number() instead");
965 0           return $self->accession_number(@_);
966             }
967              
968             1;