File Coverage

Bio/PrimarySeq.pm
Criterion Covered Total %
statement 172 183 93.9
branch 121 130 93.0
condition 28 32 87.5
subroutine 24 25 96.0
pod 19 20 95.0
total 364 390 93.3


line stmt bran cond sub pod time code
1             #
2             # bioperl module for Bio::PrimarySeq
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             #
8             # Copyright Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::PrimarySeq - Bioperl lightweight sequence object
17              
18             =head1 SYNOPSIS
19              
20             # Bio::SeqIO for file reading, Bio::DB::GenBank for
21             # database reading
22              
23             use Bio::Seq;
24             use Bio::SeqIO;
25             use Bio::DB::GenBank;
26              
27             # make from memory
28              
29             $seqobj = Bio::PrimarySeq->new (
30             -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
31             -id => 'GeneFragment-12',
32             -accession_number => 'X78121',
33             -alphabet => 'dna',
34             -is_circular => 1,
35             );
36             print "Sequence ", $seqobj->id(), " with accession ",
37             $seqobj->accession_number, "\n";
38              
39             # read from file
40              
41             $inputstream = Bio::SeqIO->new(
42             -file => "myseq.fa",
43             -format => 'Fasta',
44             );
45             $seqobj = $inputstream->next_seq();
46             print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
47              
48             # to get out parts of the sequence.
49              
50             print "Sequence ", $seqobj->id(), " with accession ",
51             $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
52              
53             $string = $seqobj->seq();
54             $string2 = $seqobj->subseq(1,40);
55              
56             =head1 DESCRIPTION
57              
58             PrimarySeq is a lightweight sequence object, storing the sequence, its
59             name, a computer-useful unique name, and other fundamental attributes.
60             It does not contain sequence features or other information. To have a
61             sequence with sequence features you should use the Seq object which uses
62             this object.
63              
64             Although new users will use Bio::PrimarySeq a lot, in general you will
65             be using it from the Bio::Seq object. For more information on Bio::Seq
66             see L. For interest you might like to know that
67             Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
68             to do with sequence to it (the has-a relationship lets us get out of a
69             otherwise nasty cyclical reference in Perl which would leak memory).
70              
71             Sequence objects are defined by the Bio::PrimarySeqI interface, and this
72             object is a pure Perl implementation of the interface. If that's
73             gibberish to you, don't worry. The take home message is that this
74             object is the bioperl default sequence object, but other people can
75             use their own objects as sequences if they so wish. If you are
76             interested in wrapping your own objects as compliant Bioperl sequence
77             objects, then you should read the Bio::PrimarySeqI documentation
78              
79             The documentation of this object is a merge of the Bio::PrimarySeq and
80             Bio::PrimarySeqI documentation. This allows all the methods which you can
81             call on sequence objects here.
82              
83             =head1 FEEDBACK
84              
85             =head2 Mailing Lists
86              
87             User feedback is an integral part of the evolution of this and other
88             Bioperl modules. Send your comments and suggestions preferably to one
89             of the Bioperl mailing lists. Your participation is much appreciated.
90              
91             bioperl-l@bioperl.org - General discussion
92             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
93              
94             =head2 Support
95              
96             Please direct usage questions or support issues to the mailing list:
97              
98             I
99              
100             rather than to the module maintainer directly. Many experienced and
101             reponsive experts will be able look at the problem and quickly
102             address it. Please include a thorough description of the problem
103             with code and data examples if at all possible.
104              
105             =head2 Reporting Bugs
106              
107             Report bugs to the Bioperl bug tracking system to help us keep track
108             the bugs and their resolution. Bug reports can be submitted via the
109             web:
110              
111             https://github.com/bioperl/bioperl-live/issues
112              
113             =head1 AUTHOR - Ewan Birney
114              
115             Email birney@ebi.ac.uk
116              
117             =head1 APPENDIX
118              
119             The rest of the documentation details each of the object
120             methods. Internal methods are usually preceded with a _
121              
122             =cut
123              
124              
125              
126             package Bio::PrimarySeq;
127              
128 203     203   8216 use strict;
  203         316  
  203         8952  
129              
130             our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
131             our $GAP_SYMBOLS = '-~';
132              
133 203         76876 use base qw(Bio::Root::Root Bio::PrimarySeqI
134 203     203   657 Bio::IdentifiableI Bio::DescribableI);
  203         282  
135              
136              
137             # Setup the allowed values for alphabet()
138             my %valid_type = map {$_, 1} qw( dna rna protein );
139              
140              
141             =head2 new
142              
143             Title : new
144             Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
145             -id => 'human_id',
146             -accession_number => 'AL000012',
147             );
148             Function: Returns a new primary seq object from
149             basic constructors, being a string for the sequence
150             and strings for id and accession_number.
151              
152             Note that you can provide an empty sequence string. However, in
153             this case you MUST specify the type of sequence you wish to
154             initialize by the parameter -alphabet. See alphabet() for possible
155             values.
156             Returns : a new Bio::PrimarySeq object
157             Args : -seq => sequence string
158             -ref_to_seq => ... or reference to a sequence string
159             -display_id => display id of the sequence (locus name)
160             -accession_number => accession number
161             -primary_id => primary id (Genbank id)
162             -version => version number
163             -namespace => the namespace for the accession
164             -authority => the authority for the namespace
165             -description => description text
166             -desc => alias for description
167             -alphabet => skip alphabet guess and set it to dna, rna or protein
168             -id => alias for display id
169             -is_circular => boolean to indicate that sequence is circular
170             -direct => boolean to directly set sequences. The next time -seq,
171             seq() or -ref_to_seq is use, the sequence will not be
172             validated. Be careful with this...
173             -nowarnonempty => boolean to avoid warning when sequence is empty
174              
175             =cut
176              
177             sub new {
178 14995     14995 1 38180 my ($class, @args) = @_;
179 14995         28824 my $self = $class->SUPER::new(@args);
180 14995         55120 my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
181             $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
182             $nowarnonempty) =
183             $self->_rearrange([qw(SEQ
184             DISPLAY_ID
185             ACCESSION_NUMBER
186             PRIMARY_ID
187             NAMESPACE
188             AUTHORITY
189             VERSION
190             OBJECT_ID
191             DESC
192             DESCRIPTION
193             ALPHABET
194             ID
195             IS_CIRCULAR
196             DIRECT
197             REF_TO_SEQ
198             LENGTH
199             NOWARNONEMPTY
200             )],
201             @args);
202              
203             # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
204 14995         44412 $self->{'_nowarnonempty'} = $nowarnonempty;
205 14995         13599 $self->{'_direct'} = $direct;
206              
207 14995 100 100     30508 if( defined $id && defined $given_id ) {
208 6 50       12 if( $id ne $given_id ) {
209 0         0 $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
210             }
211             }
212 14995 100       19768 if( defined $given_id ) { $id = $given_id; }
  11740         9756  
213              
214             # Bernd's idea: set ids now for more informative invalid sequence messages
215 14995 100       30132 defined $id && $self->display_id($id);
216 14995 100       20689 $acc && $self->accession_number($acc);
217 14995 100       19517 defined $pid && $self->primary_id($pid);
218              
219             # Set alphabet now to avoid guessing it later, when sequence is set
220 14995 100       26207 $alphabet && $self->alphabet($alphabet);
221              
222             # Set the length before the seq. If there is a seq, length will be updated later
223 14995   100     38849 $self->{'length'} = $len || 0;
224              
225             # Set the sequence (but also alphabet and length)
226 14995 100       18557 if ($ref_to_seq) {
227 1         2 $self->_set_seq_by_ref($ref_to_seq, $alphabet);
228             } else {
229 14994 100       20259 if (defined $seq) {
230             # Note: the sequence string may be empty
231 14268         19504 $self->seq($seq);
232             }
233             }
234              
235 14993 100       20389 $desc && $self->desc($desc);
236 14993 100       17842 $description && $self->description($description);
237 14993 100       17912 $ns && $self->namespace($ns);
238 14993 100       18510 $auth && $self->authority($auth);
239             # Any variable that can have a value "0" must be tested with defined
240             # or it will fail to be added to the new object
241 14993 100       18812 defined($v) && $self->version($v);
242 14993 50       19852 defined($oid) && $self->object_id($oid);
243 14993 100       17984 defined($is_circular) && $self->is_circular($is_circular);
244              
245 14993         39727 return $self;
246             }
247              
248              
249             =head2 seq
250              
251             Title : seq
252             Usage : $string = $seqobj->seq();
253             Function: Get or set the sequence as a string of letters. The case of
254             the letters is left up to the implementer. Suggested cases are
255             upper case for proteins and lower case for DNA sequence (IUPAC
256             standard), but you should not rely on this. An error is thrown if
257             the sequence contains invalid characters: see validate_seq().
258             Returns : A scalar
259             Args : - Optional new sequence value (a string) to set
260             - Optional alphabet (it is guessed by default)
261              
262             =cut
263              
264             sub seq {
265 168456     168456 1 175702 my ($self, @args) = @_;
266              
267 168456 100       208552 if( scalar @args == 0 ) {
268 141004         300461 return $self->{'seq'};
269             }
270              
271 27452         25481 my ($seq_str, $alphabet) = @args;
272 27452 50       35374 if (@args) {
273 27452         35435 $self->_set_seq_by_ref(\$seq_str, $alphabet);
274             }
275              
276 27449         37834 return $self->{'seq'};
277             }
278              
279              
280             sub _set_seq_by_ref {
281             # Set a sequence by reference. A reference is used to avoid the cost of
282             # copying the sequence (which can be very large) between functions.
283 27453     27453   22933 my ($self, $seq_str_ref, $alphabet) = @_;
284              
285             # Validate sequence if sequence is not empty and we are not in direct mode
286 27453 100 100     83254 if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
287 27281         38720 $self->validate_seq($$seq_str_ref, 1);
288             }
289 27450         28453 delete $self->{'_direct'}; # next sequence will have to be validated
290              
291             # Record sequence length
292 27450   100     39888 my $len = CORE::length($$seq_str_ref || '');
293 27450   100     53983 my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
294             # Note: if the new seq is empty or undef, this is not considered a change
295 27450 100       37237 delete $self->{'_freeze_length'} if $is_changed_seq;
296 27450 100       39525 $self->{'length'} = $len if not exists $self->{'_freeze_length'};
297              
298             # Set sequence
299 27450         27382 $self->{'seq'} = $$seq_str_ref;
300              
301             # Set or guess alphabet
302 27450 100 100     66844 if ($alphabet) {
    100          
303             # Alphabet specified, set it no matter what
304 18         24 $self->alphabet($alphabet);
305             } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
306             # If we changed a previous sequence to a new one or if there is no
307             # alphabet yet at all, we need to guess the (possibly new) alphabet
308 15780         21838 $self->_guess_alphabet();
309             } # else (seq not changed and alphabet was defined) do nothing
310              
311 27450         28287 return 1;
312             }
313              
314              
315             =head2 validate_seq
316              
317             Title : validate_seq
318             Usage : if(! $seqobj->validate_seq($seq_str) ) {
319             print "sequence $seq_str is not valid for an object of
320             alphabet ",$seqobj->alphabet, "\n";
321             }
322             Function: Test that the given sequence is valid, i.e. contains only valid
323             characters. The allowed characters are all letters (A-Z) and '-','.',
324             '*','?','=' and '~'. Spaces are not valid. Note that this
325             implementation does not take alphabet() into account and that empty
326             sequences are considered valid.
327             Returns : 1 if the supplied sequence string is valid, 0 otherwise.
328             Args : - Sequence string to be validated
329             - Boolean to optionally throw an error if the sequence is invalid
330              
331             =cut
332              
333             sub validate_seq {
334 23098     23098 1 20666 my ($self, $seqstr, $throw) = @_;
335 23098 100 100     134851 if ( (defined $seqstr ) &&
336             ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
337 8 100       14 if ($throw) {
338 3   50     7 $self->throw("Failed validation of sequence '".(defined($self->id) ||
339             '[unidentified sequence]')."'. Invalid characters were: " .
340             join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
341             }
342 5         19 return 0;
343             }
344 23090         26157 return 1;
345             }
346              
347              
348             =head2 subseq
349              
350             Title : subseq
351             Usage : $substring = $seqobj->subseq(10,40);
352             $substring = $seqobj->subseq(10,40,'nogap');
353             $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
354             $substring = $seqobj->subseq($location_obj);
355             $substring = $seqobj->subseq($location_obj, -nogap => 1);
356             Function: Return the subseq from start to end, where the first sequence
357             character has coordinate 1 number is inclusive, ie 1-2 are the
358             first two characters of the sequence. The given start coordinate
359             has to be larger than the end, even if the sequence is circular.
360             Returns : a string
361             Args : integer for start position
362             integer for end position
363             OR
364             Bio::LocationI location for subseq (strand honored)
365             Specify -NOGAP=>1 to return subseq with gap characters removed
366             Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
367             with $new_subseq in the sequence object
368              
369             =cut
370              
371             sub subseq {
372 10039     10039 1 6913 my $self = shift;
373 10039         9450 my @args = @_;
374 10039         19216 my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
375             END
376             NOGAP
377             REPLACE_WITH)], @args);
378              
379             # If -replace_with is specified, validate the replacement sequence
380 10039 100       14740 if (defined $replace) {
381 2 100       5 $self->validate_seq( $replace ) ||
382             $self->throw("Replacement sequence does not look valid");
383             }
384              
385 10038 100 66     32809 if( ref($start) && $start->isa('Bio::LocationI') ) {
    50 33        
386 52         41 my $loc = $start;
387 52         42 my $seq = '';
388              
389             # For Split objects if Guide Strand is negative,
390             # pass the sublocations in reverse
391 52         44 my $order = 0;
392 52 100       119 if ($loc->isa('Bio::Location::SplitLocationI')) {
393             # guide_strand can return undef, so don't compare directly
394             # to avoid 'uninitialized value' warning
395 44 100       70 my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0;
396 44 100       52 $order = ($guide_strand == -1) ? -1 : 0;
397             }
398             # Reversing order using ->each_Location(-1) does not work well for
399             # cut by origin-splits (like "complement(join(1900..END,START..50))"),
400             # so use "reverse" instead
401 52 100       108 my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location;
402 52         57 foreach my $subloc (@sublocs) {
403 120         244 my $piece = $self->subseq(-start => $subloc->start(),
404             -end => $subloc->end(),
405             -replace_with => $replace,
406             -nogap => $nogap);
407 120 100       177 $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
408              
409             # strand can return undef, so don't compare directly
410             # to avoid 'uninitialized value' warning
411 120 100       176 my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0;
412 120 100       178 if ($strand < 0) {
413 59         77 $piece = $self->_revcom_from_string($piece, $self->alphabet);
414             }
415 120         158 $seq .= $piece;
416             }
417 52         196 return $seq;
418             } elsif( defined $start && defined $end ) {
419 9986 50       11682 if( $start > $end ){
420 0         0 $self->throw("Bad start,end parameters. Start [$start] has to be ".
421             "less than end [$end]");
422             }
423 9986 50       11248 if( $start <= 0 ) {
424 0         0 $self->throw("Bad start parameter ($start). Start must be positive.");
425             }
426              
427             # Remove one from start, and then length is end-start
428 9986         6381 $start--;
429              
430 9986         5771 my $seqstr;
431 9986 100       9392 if (defined $replace) {
432 1         4 $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
433             } else {
434 9985         12949 $seqstr = substr $self->{seq}, $start, $end-$start;
435             }
436              
437              
438 9986 100       11061 if ($end > $self->length) {
439 1 50       3 if ($self->is_circular) {
440 1         2 my $start = 0;
441 1         2 my $end = $end - $self->length;
442              
443 1         1 my $appendstr;
444 1 50       2 if (defined $replace) {
445 0         0 $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
446             } else {
447 1         2 $appendstr = substr $self->{seq}, $start, $end-$start;
448             }
449              
450 1         1 $seqstr .= $appendstr;
451             } else {
452 0         0 $self->throw("Bad end parameter ($end). End must be less than ".
453             "the total length of sequence (total=".$self->length.")")
454             }
455             }
456              
457 9986 100       11462 $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
458 9986         16173 return $seqstr;
459              
460             } else {
461 0         0 $self->warn("Incorrect parameters to subseq - must be two integers or ".
462             "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
463 0         0 return;
464             }
465             }
466              
467              
468             =head2 length
469              
470             Title : length
471             Usage : $len = $seqobj->length();
472             Function: Get the stored length of the sequence in number of symbols (bases
473             or amino acids). In some circumstances, you can also set this attribute:
474              
475             1. For empty sequences, you can set the length to anything you want:
476             my $seqobj = Bio::PrimarySeq->new( -length => 123 );
477             my $len = $seqobj->len; # 123
478             2. To save memory when using very long sequences, you can set the
479             length of the sequence to the length of the sequence (and nothing
480             else):
481             my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
482             # process $seqobj... then after you're done with it
483             $seqobj->length($seqobj->length);
484             $seqobj->seq(undef); # free memory!
485             my $len = $seqobj->len; # 1 Mbp
486              
487             Note that if you set seq() to a value other than undef at any time,
488             the length attribute will be reset.
489             Returns : integer representing the length of the sequence.
490             Args : Optionally, the value on set
491              
492             =cut
493              
494             sub length {
495 30694     30694 1 28476 my ($self, $val) = @_;
496 30694 100       35597 if (defined $val) {
497 5         8 my $len = $self->{'length'};
498 5 100 100     19 if ($len && ($len != $val)) {
499 1         5 $self->throw("Can not set the length to $val, current length value is $len");
500             }
501 4         6 $self->{'length'} = $val;
502 4         9 $self->{'_freeze_length'} = undef;
503             }
504 30693         42354 return $self->{'length'};
505             }
506              
507              
508             =head2 display_id
509              
510             Title : display_id or display_name
511             Usage : $id_string = $seqobj->display_id();
512             Function: Get or set the display id, aka the common name of the sequence object.
513              
514             The semantics of this is that it is the most likely string to
515             be used as an identifier of the sequence, and likely to have
516             "human" readability. The id is equivalent to the ID field of
517             the GenBank/EMBL databanks and the id field of the
518             Swissprot/sptrembl database. In fasta format, the >(\S+) is
519             presumed to be the id, though some people overload the id to
520             embed other information. Bioperl does not use any embedded
521             information in the ID field, and people are encouraged to use
522             other mechanisms (accession field for example, or extending
523             the sequence object) to solve this.
524              
525             With the new Bio::DescribeableI interface, display_name aliases
526             to this method.
527             Returns : A string for the display ID
528             Args : Optional string for the display ID to set
529              
530             =cut
531              
532             sub display_id {
533 23860     23860 1 25368 my ($self, $value) = @_;
534 23860 100       30150 if( defined $value) {
535 14090         15683 $self->{'display_id'} = $value;
536             }
537 23860         34354 return $self->{'display_id'};
538             }
539              
540              
541             =head2 accession_number
542              
543             Title : accession_number or object_id
544             Usage : $unique_key = $seqobj->accession_number;
545             Function: Returns the unique biological id for a sequence, commonly
546             called the accession_number. For sequences from established
547             databases, the implementors should try to use the correct
548             accession number. Notice that primary_id() provides the
549             unique id for the implemetation, allowing multiple objects
550             to have the same accession number in a particular implementation.
551              
552             For sequences with no accession number, this method should
553             return "unknown".
554              
555             [Note this method name is likely to change in 1.3]
556              
557             With the new Bio::IdentifiableI interface, this is aliased
558             to object_id
559             Returns : A string
560             Args : A string (optional) for setting
561              
562             =cut
563              
564             sub accession_number {
565 900     900 1 1667 my( $self, $acc ) = @_;
566 900 100       1453 if (defined $acc) {
567 654         947 $self->{'accession_number'} = $acc;
568             } else {
569 246         315 $acc = $self->{'accession_number'};
570 246 100       519 $acc = 'unknown' unless defined $acc;
571             }
572 900         1540 return $acc;
573             }
574              
575              
576             =head2 primary_id
577              
578             Title : primary_id
579             Usage : $unique_key = $seqobj->primary_id;
580             Function: Returns the unique id for this object in this
581             implementation. This allows implementations to manage their
582             own object ids in a way the implementaiton can control
583             clients can expect one id to map to one object.
584              
585             For sequences with no natural primary id, this method
586             should return a stringified memory location.
587             Returns : A string
588             Args : A string (optional, for setting)
589              
590             =cut
591              
592             sub primary_id {
593 505     505 1 743 my $self = shift;
594              
595 505 100       1187 if(@_) {
596 447         665 $self->{'primary_id'} = shift;
597             }
598 505 100       880 if( ! defined($self->{'primary_id'}) ) {
599 21         172 return "$self";
600             }
601 484         547 return $self->{'primary_id'};
602             }
603              
604              
605             =head2 alphabet
606              
607             Title : alphabet
608             Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
609             Function: Get/set the alphabet of sequence, one of
610             'dna', 'rna' or 'protein'. This is case sensitive.
611              
612             This is not called because this would cause
613             upgrade problems from the 0.5 and earlier Seq objects.
614             Returns : a string either 'dna','rna','protein'. NB - the object must
615             make a call of the type - if there is no alphabet specified it
616             has to guess.
617             Args : optional string to set : 'dna' | 'rna' | 'protein'
618              
619              
620             =cut
621              
622             sub alphabet {
623 54892     54892 1 45273 my ($self,$value) = @_;
624 54892 100       67370 if (defined $value) {
625 27920         25250 $value = lc $value;
626 27920 50       41710 unless ( $valid_type{$value} ) {
627 0         0 $self->throw("Alphabet '$value' is not a valid alphabet (".
628             join(',', map "'$_'", sort keys %valid_type) .") lowercase");
629             }
630 27920         33263 $self->{'alphabet'} = $value;
631             }
632 54892         84771 return $self->{'alphabet'};
633             }
634              
635              
636             =head2 desc
637              
638             Title : desc or description
639             Usage : $seqobj->desc($newval);
640             Function: Get/set description of the sequence.
641              
642             'description' is an alias for this for compliance with the
643             Bio::DescribeableI interface.
644             Returns : value of desc (a string)
645             Args : newvalue (a string or undef, optional)
646              
647              
648             =cut
649              
650             sub desc{
651 1711     1711 1 5959 my $self = shift;
652              
653 1711 100       4125 return $self->{'desc'} = shift if @_;
654 454         1408 return $self->{'desc'};
655             }
656              
657              
658             =head2 can_call_new
659              
660             Title : can_call_new
661             Usage :
662             Function:
663             Example :
664             Returns : true
665             Args :
666              
667             =cut
668              
669             sub can_call_new {
670 10     10 1 9 my ($self) = @_;
671              
672 10         173 return 1;
673             }
674              
675              
676             =head2 id
677              
678             Title : id
679             Usage : $id = $seqobj->id();
680             Function: This is mapped on display_id
681             Example :
682             Returns :
683             Args :
684              
685             =cut
686              
687             sub id {
688 9031     9031 1 17124 return shift->display_id(@_);
689             }
690              
691              
692             =head2 is_circular
693              
694             Title : is_circular
695             Usage : if( $seqobj->is_circular) { # Do something }
696             Function: Returns true if the molecule is circular
697             Returns : Boolean value
698             Args : none
699              
700             =cut
701              
702             sub is_circular{
703 143388     143388 1 113639 my $self = shift;
704 143388 100       201734 return $self->{'is_circular'} = shift if @_;
705 143362         304910 return $self->{'is_circular'};
706             }
707              
708              
709             =head1 Methods for Bio::IdentifiableI compliance
710              
711             =head2 object_id
712              
713             Title : object_id
714             Usage : $string = $seqobj->object_id();
715             Function: Get or set a string which represents the stable primary identifier
716             in this namespace of this object. For DNA sequences this
717             is its accession_number, similarly for protein sequences.
718              
719             This is aliased to accession_number().
720             Returns : A scalar
721             Args : Optional object ID to set.
722              
723             =cut
724              
725             sub object_id {
726 4     4 1 7 return shift->accession_number(@_);
727             }
728              
729              
730             =head2 version
731              
732             Title : version
733             Usage : $version = $seqobj->version();
734             Function: Get or set a number which differentiates between versions of
735             the same object. Higher numbers are considered to be
736             later and more relevant, but a single object described
737             the same identifier should represent the same concept.
738             Returns : A number
739             Args : Optional version to set.
740              
741             =cut
742              
743             sub version{
744 3582     3582 1 3079 my ($self,$value) = @_;
745 3582 100       4761 if( defined $value) {
746 292         669 $self->{'_version'} = $value;
747             }
748 3582         5667 return $self->{'_version'};
749             }
750              
751              
752             =head2 authority
753              
754             Title : authority
755             Usage : $authority = $seqobj->authority();
756             Function: Get or set a string which represents the organisation which
757             granted the namespace, written as the DNS name of the
758             organisation (eg, wormbase.org).
759             Returns : A scalar
760             Args : Optional authority to set.
761              
762             =cut
763              
764             sub authority {
765 91     91 1 74 my ($self, $value) = @_;
766 91 100       118 if( defined $value) {
767 86         88 $self->{'authority'} = $value;
768             }
769 91         87 return $self->{'authority'};
770             }
771              
772              
773             =head2 namespace
774              
775             Title : namespace
776             Usage : $string = $seqobj->namespace();
777             Function: Get or set a string representing the name space this identifier
778             is valid in, often the database name or the name describing the
779             collection.
780             Returns : A scalar
781             Args : Optional namespace to set.
782              
783             =cut
784              
785             sub namespace{
786 527     527 1 474 my ($self,$value) = @_;
787 527 100       752 if( defined $value) {
788 493         745 $self->{'namespace'} = $value;
789             }
790 527   100     957 return $self->{'namespace'} || "";
791             }
792              
793              
794             =head1 Methods for Bio::DescribableI compliance
795              
796             This comprises of display_name and description.
797              
798             =head2 display_name
799              
800             Title : display_name
801             Usage : $string = $seqobj->display_name();
802             Function: Get or set a string which is what should be displayed to the user.
803             The string should have no spaces (ideally, though a cautious
804             user of this interface would not assumme this) and should be
805             less than thirty characters (though again, double checking
806             this is a good idea).
807              
808             This is aliased to display_id().
809             Returns : A string for the display name
810             Args : Optional string for the display name to set.
811              
812             =cut
813              
814             sub display_name {
815 2     2 1 5 return shift->display_id(@_);
816             }
817              
818              
819             =head2 description
820              
821             Title : description
822             Usage : $string = $seqobj->description();
823             Function: Get or set a text string suitable for displaying to the user a
824             description. This string is likely to have spaces, but
825             should not have any newlines or formatting - just plain
826             text. The string should not be greater than 255 characters
827             and clients can feel justified at truncating strings at 255
828             characters for the purposes of display.
829              
830             This is aliased to desc().
831             Returns : A string for the description
832             Args : Optional string for the description to set.
833              
834             =cut
835              
836             sub description {
837 91     91 1 169 return shift->desc(@_);
838             }
839              
840              
841             =head1 Methods Inherited from Bio::PrimarySeqI
842              
843             These methods are available on Bio::PrimarySeq, although they are
844             actually implemented on Bio::PrimarySeqI
845              
846             =head2 revcom
847              
848             Title : revcom
849             Usage : $rev = $seqobj->revcom();
850             Function: Produces a new Bio::SeqI implementing object which
851             is the reversed complement of the sequence. For protein
852             sequences this throws an exception of
853             "Sequence is a protein. Cannot revcom".
854              
855             The id is the same id as the orginal sequence, and the
856             accession number is also indentical. If someone wants to
857             track that this sequence has be reversed, it needs to
858             define its own extensions.
859              
860             To do an inplace edit of an object you can go:
861              
862             $seqobj = $seqobj->revcom();
863              
864             This of course, causes Perl to handle the garbage
865             collection of the old object, but it is roughly speaking as
866             efficient as an inplace edit.
867             Returns : A new (fresh) Bio::SeqI object
868             Args : none
869              
870             =head2 trunc
871              
872             Title : trunc
873             Usage : $subseq = $myseq->trunc(10,100);
874             Function: Provides a truncation of a sequence,
875             Returns : A fresh Bio::SeqI implementing object.
876             Args : Numbers for the start and end positions
877              
878             =head1 Internal methods
879              
880             These are internal methods to PrimarySeq
881              
882             =head2 _guess_alphabet
883              
884             Title : _guess_alphabet
885             Usage :
886             Function: Automatically guess and set the type of sequence: dna, rna, protein
887             or '' if the sequence was empty. This method first removes dots (.),
888             dashes (-) and question marks (?) before guessing the alphabet
889             using the IUPAC conventions for ambiguous residues. Since the DNA and
890             RNA characters are also valid characters for proteins, there is
891             no foolproof way of determining the right alphabet. This is our best
892             guess only!
893             Returns : string 'dna', 'rna', 'protein' or ''.
894             Args : none
895              
896             =cut
897              
898             sub _guess_alphabet {
899 15855     15855   13908 my ($self) = @_;
900             # Guess alphabet
901 15855         25959 my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
902             # Set alphabet unless it is unknown
903 15855 100       31327 $self->alphabet($alphabet) if $alphabet;
904 15855         13513 return $alphabet;
905             }
906              
907              
908             sub _guess_alphabet_from_string {
909             # Get the alphabet from a sequence string
910 18852     18852   16758 my ($self, $str, $nowarnonempty) = @_;
911              
912 18852 100       27465 $nowarnonempty = 0 if not defined $nowarnonempty;
913              
914             # Remove chars that clearly don't denote nucleic or amino acids
915 18852         51527 $str =~ s/[-.?]//gi;
916              
917             # Check for sequences without valid letters
918 18852         13809 my $alphabet;
919 18852         15175 my $total = CORE::length($str);
920 18852 100       25455 if( $total == 0 ) {
921 2 100       4 if (not $nowarnonempty) {
922 1         6 $self->warn("Got a sequence without letters. Could not guess alphabet");
923             }
924 2         5 $alphabet = '';
925             }
926              
927             # Determine alphabet now
928 18852 100       24499 if (not defined $alphabet) {
929 18850 100       30919 if ($str =~ m/[EFIJLOPQXZ]/i) {
930             # Start with a safe method to find proteins.
931             # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
932 1762         1887 $alphabet = 'protein';
933             } else {
934             # Alphabet is unsure, could still be DNA, RNA or protein
935             # DNA and RNA contain mostly A, T, U, G, C and N, but the other
936             # letters they use are also among the 15 valid letters that a
937             # protein sequence can contain at this stage. Make our best guess
938             # based on sequence composition. If it contains over 70% of ACGTUN,
939             # it is likely nucleic.
940 17088 100       35829 if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
941 16081 100       26603 if ( $str =~ m/U/i ) {
942 53         152 $alphabet = 'rna';
943             } else {
944 16028         15855 $alphabet = 'dna';
945             }
946             } else {
947 1007         1037 $alphabet = 'protein';
948             }
949             }
950             }
951              
952 18852         25278 return $alphabet;
953             }
954              
955              
956             ############################################################################
957             # aliases due to name changes or to compensate for our lack of consistency #
958             ############################################################################
959              
960             sub accession {
961 0     0 0   my $self = shift;
962              
963 0           $self->warn(ref($self)."::accession is deprecated, ".
964             "use accession_number() instead");
965 0           return $self->accession_number(@_);
966             }
967              
968             1;