File Coverage

Bio/SeqIO/swiss.pm
Criterion Covered Total %
statement 472 591 79.8
branch 222 330 67.2
condition 64 100 64.0
subroutine 28 32 87.5
pod 2 2 100.0
total 788 1055 74.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::swiss
3             #
4             # Copyright Elia Stupka
5             #
6             # You may distribute this module under the same terms as perl itself
7              
8             # POD documentation - main docs before the code
9              
10             =head1 NAME
11              
12             Bio::SeqIO::swiss - Swissprot sequence input/output stream
13              
14             =head1 SYNOPSIS
15              
16             It is probably best not to use this object directly, but
17             rather go through the SeqIO handler system:
18              
19             use Bio::SeqIO;
20              
21             $stream = Bio::SeqIO->new(-file => $filename,
22             -format => 'swiss');
23              
24             while ( my $seq = $stream->next_seq() ) {
25             # do something with $seq
26             }
27              
28             =head1 DESCRIPTION
29              
30             This object can transform Bio::Seq objects to and from Swiss-Pprot flat
31             file databases.
32              
33             There is a lot of flexibility here about how to dump things which needs
34             to be documented.
35              
36             =head2 GN (Gene name) line management details
37              
38             A Uniprot/Swiss-Prot entry holds information on one protein
39             sequence. If that sequence is identical across genes and species, they
40             are all merged into one entry. This creates complex needs for several
41             annotation fields in swiss-prot format.
42              
43             The latest syntax for GN line is described in the user manual:
44              
45             http://www.expasy.ch/sprot/userman.html#GN_line
46              
47             Each of the possibly multiple genes in an entry can have Name,
48             Synonyms (only if there is a name), OrderedLocusNames (names from
49             genomic sequences) and ORFNames (temporary or cosmid names). "Name"
50             here really means "symbol". This complexity is now dealt with the
51             following way:
52              
53             A new Bio::AnnotationI class was created in order to store the
54             data in tag-value pairs. This class (Bio::Annotation::TagTree)
55             is stored in the Bio::Annotation::Collection object and is
56             accessed like all other annotations. The tag name is 'gene_name'.
57              
58             There is a single Bio::Annotation::TagTree per sequence record, which
59             corresponds to the original class that stored this data
60             (Bio::Annotation::StructuredValue). Depending on how we progress
61             this may change to represent each group of gene names.
62              
63             For now, to access the gene name tree annotation, one uses the below method:
64              
65             my ($gene) = $seq->annotation->get_Annotations('gene_name');
66              
67             If you are only interested in displaying the values, value() returns a
68             string with similar formatting.
69              
70             There are several ways to get directly at the information you want if you
71             know the element (tag) for the data. For gene names all data is stored with
72             the element-tag pairs:
73              
74             "element1=tag1, tag2, tag3; element2=tag4, tag5;"
75              
76             This normally means the element will be 'Name', 'Synonyms', etc. and the
77             gene names the values. Using findval(), you can do the following:
78              
79             # grab a flattened list of all gene names
80             my @names = $ann->findval('Name');
81              
82             # or iterated through the nodes and grab the name for each group
83             for my $node ($ann->findnode('gene_name')) {
84             my @names = $node->findval('Name');
85             }
86              
87             The current method for parsing gene name data (and reconstructing gene name
88             output) is very generic. This is somewhat preemptive if, for instance, UniProt
89             decides to update and add another element name to the current ones using the
90             same formatting layout. Under those circumstances, one can iterate through the
91             tag tree in a safe way and retrieve all node data like so.
92              
93             # retrieve the gene name nodes (groups like names, synonyms, etc).
94             for my $ann ($seq->annotation->get_Annotations('gene_name')) {
95              
96             # each gene name group
97             for my $node ($ann->findnode('gene_name')) {
98             print "Gene name:\n";
99              
100             # each gene name node (tag => value pair)
101             for my $n ($node->children) {
102             print "\t".$n->element.": ".$n->children."\n";
103             }
104             }
105             }
106              
107             For more uses see Bio::Annotation::TagTree.
108              
109             Since Uniprot/Swiss-Prot format have been around for quite some time, the
110             parser is also able to read in the older GN line syntax where genes
111             are separated by AND and various symbols by OR. The first symbol is
112             taken to be the 'Name' and the remaining ones are stored as 'Synonyms'.
113              
114             Also, for UniProt output we support using other Bio::AnnotationI, but in this
115             case we only use the stringified version of the annotation. This is to allow for
116             backwards compatibility with code that previously used
117             Bio::Annotation::SimpleValue or other Bio::AnnotationI classes.
118              
119             =head2 Optional functions
120              
121             =over 3
122              
123             =item _show_dna()
124              
125             (output only) shows the dna or not
126              
127             =item _post_sort()
128              
129             (output only) provides a sorting func which is applied to the FTHelpers
130             before printing
131              
132             =item _id_generation_func()
133              
134             This is function which is called as
135              
136             print "ID ", $func($seq), "\n";
137              
138             To generate the ID line. If it is not there, it generates a sensible ID
139             line using a number of tools.
140              
141             If you want to output annotations in Swissprot format they need to be
142             stored in a Bio::Annotation::Collection object which is accessible
143             through the Bio::SeqI interface method L.
144              
145             The following are the names of the keys which are polled from a
146             L object.
147              
148             reference - Should contain Bio::Annotation::Reference objects
149             comment - Should contain Bio::Annotation::Comment objects
150             dblink - Should contain Bio::Annotation::DBLink objects
151             gene_name - Should contain Bio::Annotation::SimpleValue object
152              
153             =back
154              
155             =head1 FEEDBACK
156              
157             =head2 Mailing Lists
158              
159             User feedback is an integral part of the evolution of this
160             and other Bioperl modules. Send your comments and suggestions,
161             preferably to one of the Bioperl mailing lists.
162             Your participation is much appreciated.
163              
164             bioperl-l@bioperl.org - General discussion
165             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
166              
167             =head2 Support
168              
169             Please direct usage questions or support issues to the mailing list:
170              
171             I
172              
173             rather than to the module maintainer directly. Many experienced and
174             reponsive experts will be able look at the problem and quickly
175             address it. Please include a thorough description of the problem
176             with code and data examples if at all possible.
177              
178             =head2 Reporting Bugs
179              
180             Report bugs to the Bioperl bug tracking system to help us keep track
181             the bugs and their resolution.
182             Bug reports can be submitted via the web:
183              
184             https://github.com/bioperl/bioperl-live/issues
185              
186             =head1 AUTHOR - Elia Stupka
187              
188             Email elia@tll.org.sg
189              
190             =head1 APPENDIX
191              
192             The rest of the documentation details each of the object methods.
193             Internal methods are usually preceded with a _
194              
195             =cut
196              
197             # Let the code begin...
198              
199             package Bio::SeqIO::swiss;
200 5     5   680 use vars qw(@Unknown_names @Unknown_genus);
  5         14  
  5         333  
201 5     5   31 use strict;
  5         11  
  5         145  
202 5     5   780 use Bio::SeqIO::FTHelper;
  5         11  
  5         126  
203 5     5   27 use Bio::SeqFeature::Generic;
  5         8  
  5         94  
204 5     5   855 use Bio::Species;
  5         11  
  5         135  
205 5     5   2539 use Bio::Tools::SeqStats;
  5         17  
  5         192  
206 5     5   37 use Bio::Seq::SeqFactory;
  5         10  
  5         130  
207 5     5   24 use Bio::Annotation::Collection;
  5         8  
  5         108  
208 5     5   763 use Bio::Annotation::Comment;
  5         11  
  5         117  
209 5     5   747 use Bio::Annotation::Reference;
  5         10  
  5         119  
210 5     5   27 use Bio::Annotation::DBLink;
  5         9  
  5         117  
211 5     5   25 use Bio::Annotation::SimpleValue;
  5         9  
  5         103  
212 5     5   1166 use Bio::Annotation::TagTree;
  5         12  
  5         265  
213              
214 5     5   29 use base qw(Bio::SeqIO);
  5         63  
  5         29607  
215              
216             our $LINE_LENGTH = 76;
217              
218             # this is for doing species name parsing
219             @Unknown_names=('other', 'unidentified',
220             'unknown organism', 'not specified',
221             'not shown', 'Unspecified', 'Unknown',
222             'None', 'unclassified', 'unidentified organism',
223             'not supplied'
224             );
225             # dictionary of synonyms for taxid 32644
226             # all above can be part of valid species name
227             @Unknown_genus = qw(unknown unclassified uncultured unidentified);
228              
229             # if there are any other gene name tags, they are added to the end
230             our @GENE_NAME_ORDER = qw(Name Synonyms OrderedLocusNames ORFNames);
231              
232             sub _initialize {
233 23     23   84 my($self,@args) = @_;
234 23         142 $self->SUPER::_initialize(@args);
235             # hash for functions for decoding keys.
236 23         113 $self->{'_func_ftunit_hash'} = {};
237             # sets this to one by default. People can change it
238 23         141 $self->_show_dna(1);
239 23 50       113 if ( ! defined $self->sequence_factory ) {
240 23         95 $self->sequence_factory(Bio::Seq::SeqFactory->new
241             (-verbose => $self->verbose(),
242             -type => 'Bio::Seq::RichSeq'));
243             }
244             }
245              
246             =head2 next_seq
247              
248             Title : next_seq
249             Usage : $seq = $stream->next_seq()
250             Function: returns the next sequence in the stream
251             Returns : Bio::Seq object
252             Args :
253              
254             =cut
255              
256             sub next_seq {
257 36     36 1 4420 my ($self,@args) = @_;
258 36         111 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, $sptr,$seq_div,
259             $date,$comment,@date_arr);
260 36         77 my $genename = "";
261 36         259 my ($annotation, %params, @features) = ( Bio::Annotation::Collection->new());
262              
263 36         76 local $_;
264              
265 36   100     211 1 while defined($_ = $self->_readline) && /^\s+$/;
266 36 100 66     259 return unless defined $_ && /^ID\s/;
267              
268             # fixed to allow _DIVISION to be optional for bug #946
269             # see bug report for more information
270             #
271             # 9/6/06 Note: Swiss/TrEMBL sequences have no division acc. to UniProt
272             # release notes; this is fixed to simplify the regex parsing
273             # STANDARD (SwissProt) and PRELIMINARY (TrEMBL) added to namespace()
274 31 50       245 unless( m{^
275             ID \s+ #
276             (\S+) \s+ # $1 entryname
277             ([^\s;]+); \s+ # $2 DataClass
278             (?:PRT;)? \s+ # Molecule Type (optional)
279             [0-9]+[ ]AA \. # Sequencelength (capture?)
280             $
281             }ox ) {
282             # I couldn't find any new current UniProt sequences
283             # that matched this format:
284             # || m/^ID\s+(\S+)\s+(_([^\s_]+))? /ox ) {
285 0         0 $self->throw("swissprot stream with no ID. Not swissprot in my book");
286             }
287 31         153 ($name, $seq_div) = ($1, $2);
288 31 100 100     257 $params{'-namespace'} =
    100 100        
289             ($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ? 'Swiss-Prot' :
290             ($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ? 'TrEMBL' :
291             $seq_div;
292             # we shouldn't be setting the division, but for now...
293 31         147 my ($junk, $division) = split q(_), $name;
294 31         88 $params{'-division'} = $division;
295 31         69 $params{'-alphabet'} = 'protein';
296             # this is important to have the id for display in e.g. FTHelper, otherwise
297             # you won't know which entry caused an error
298 31         69 $params{'-display_id'} = $name;
299              
300             BEFORE_FEATURE_TABLE :
301 31         99 while ( defined($_ = $self->_readline) ) {
302             # Exit at start of Feature table and at the sequence at the
303             # latest HL 05/11/2000
304 981 100       2952 last if( /^(FT|SQ)/ );
305              
306             # Description line(s)
307 950 100       7670 if (/^DE\s+(\S.*\S)/) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    50          
308 54 100       242 $desc .= $desc ? " $1" : $1;
309             }
310             #Gene name
311             elsif (/^GN\s+(.*)/) {
312 36 100       97 $genename .= " " if $genename;
313 36         112 $genename .= $1;
314             }
315             #accession number(s)
316             elsif ( /^AC\s+(.+)/) {
317 31         164 my @accs = split(/[; ]+/, $1); # allow space in addition
318             $params{'-accession_number'} = shift @accs
319 31 50       135 unless defined $params{'-accession_number'};
320 31         64 push @{$params{'-secondary_accessions'}}, @accs;
  31         140  
321             }
322             #date and sequence version
323             elsif ( /^DT\s+(.*)/ ) {
324 90         442 my $line = $1;
325 90         281 my ($date, $version) = split(' ', $line, 2);
326 90         162 $date =~ tr/,//d; # remove comma if new version
327 90 100 100     701 if ($version =~ /\(Rel\. (\d+), Last sequence update\)/ || # old
    100 100        
328             /sequence version (\d+)/) { #new
329 30         240 my $update = Bio::Annotation::SimpleValue->new
330             (-tagname => 'seq_update',
331             -value => $1
332             );
333 30         152 $annotation->add_Annotation($update);
334             } elsif ($version =~ /\(Rel\. (\d+), Last annotation update\)/ || #old
335             /entry version (\d+)/) { #new
336 30         96 $params{'-version'} = $1;
337             }
338 90         121 push @{$params{'-dates'}}, $date;
  90         368  
339             }
340             # Evidence level
341             elsif ( /^PE\s+(.*)/ ) {
342 1         6 my $line = $1;
343 1         7 $line =~ s/;\s*//; # trim trailing semicolon and any spaces at the end of the line
344 1         12 my $evidence = Bio::Annotation::SimpleValue->new
345             (-tagname => 'evidence',
346             -value => $line
347             );
348 1         6 $annotation->add_Annotation($evidence);
349             }
350             # Organism name and phylogenetic information
351             elsif (/^O[SCG]/) {
352 30         130 my $species = $self->_read_swissprot_Species($_);
353 30         286 $params{'-species'}= $species;
354             # now we are one line ahead -- so continue without reading the next
355             # line HL 05/11/2000
356             }
357             # References
358             elsif (/^R/) {
359 30         150 my $refs = $self->_read_swissprot_References($_);
360 30         97 foreach my $r (@$refs) {
361 227         564 $annotation->add_Annotation('reference',$r);
362             }
363             }
364             # Comments
365             elsif (/^CC\s{3}(.*)/) {
366 30         148 $comment .= $1;
367 30         65 $comment .= "\n";
368 30   66     99 while (defined ($_ = $self->_readline) && /^CC\s{3}(.*)/ ) {
369 474         1560 $comment .= $1 . "\n";
370             }
371              
372             # note: don't try to process comments here -- they may contain
373             # structure. LP 07/30/2000
374              
375 30         306 my $commobj = Bio::Annotation::Comment->new(-tagname => 'comment',
376             -text => $comment);
377 30         128 $annotation->add_Annotation('comment',$commobj);
378 30         64 $comment = "";
379 30         132 $self->_pushback($_);
380             }
381             #DBLinks
382             # old regexp
383             # /^DR\s+(\S+)\;\s+(\S+)\;\s+(\S+)[\;\.](.*)$/) {
384             # new regexp from Andreas Kahari bug #1584
385             elsif (/^DR\s+(\S+)\;\s+(\S+)\;\s+([^;]+)[\;\.](.*)$/) {
386 599         2292 my ($database,$primaryid,$optional,$comment) = ($1,$2,$3,$4);
387              
388             # drop leading and training spaces and trailing .
389 599         1415 $comment =~ s/\.\s*$//;
390 599         1144 $comment =~ s/^\s+//;
391              
392 599         2226 my $dblinkobj = Bio::Annotation::DBLink->new
393             (-database => $database,
394             -primary_id => $primaryid,
395             -optional_id => $optional,
396             -comment => $comment,
397             -tagname => 'dblink',
398             );
399              
400 599         1705 $annotation->add_Annotation('dblink',$dblinkobj);
401             }
402             #keywords
403             elsif ( /^KW\s+(.*)$/ ) {
404 49         409 my @kw = split(/\s*\;\s*/,$1);
405 49 50       237 defined $kw[-1] && $kw[-1] =~ s/\.$//;
406 49         108 push @{$params{'-keywords'}}, @kw;
  49         289  
407             }
408             }
409             # process and parse the gene name line if there was one (note: we
410             # can't do this above b/c GN may be multi-line and we can't
411             # unequivocally determine whether we've seen the last GN line in
412             # the new format)
413 31 100       106 if ($genename) {
414 30         78 my @stags;
415 30 100       133 if ($genename =~ /\w=\w/) {
416             # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
417 12         68 for my $n (split(m{\s+and\s+},$genename)) {
418 14         29 my @genenames;
419 14         74 for my $section (split(m{\s*;\s*},$n)) {
420 19         76 my ($tag, $rest) = split("=",$section);
421 19   50     53 $rest ||= '';
422 19         56 for my $val (split(m{\s*,\s*},$rest)) {
423 23         70 push @genenames, [$tag => $val];
424             }
425             }
426 14         83 push @stags, ['gene_name' => \@genenames];
427             }
428             } else {
429             # old format
430 18         90 for my $section (split(/ AND /, $genename)) {
431 22         48 my @genenames;
432 22         102 $section =~ s/[\(\)\.]//g;
433 22         99 my @names = split(m{\s+OR\s+}, $section);
434 22         152 push @genenames, ['Name' => shift @names];
435 22         78 push @genenames, map {['Synonyms' => $_]} @names;
  25         70  
436 22         102 push @stags, ['gene_name' => \@genenames]
437             }
438             } #use Data::Dumper; print Dumper $gn, $genename;# exit;
439 30         371 my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
440             -value => ['gene_names' => \@stags]);
441 30         136 $annotation->add_Annotation('gene_name', $gn);
442             }
443              
444             FEATURE_TABLE :
445             # if there is no feature table, or if we've got beyond, exit loop or don't
446             # even enter HL 05/11/2000
447 31   66     271 while (defined $_ && /^FT/ ) {
448 443         1337 my $ftunit = $self->_read_FTHelper_swissprot($_);
449              
450             # process ftunit
451             # when parsing of the line fails we get undef returned
452 443 50       856 if ($ftunit) {
453             push(@features,
454             $ftunit->_generic_seqfeature($self->location_factory(),
455 443         1373 $params{'-seqid'}, "SwissProt"));
456             } else {
457             $self->warn("failed to parse feature table line for seq " .
458 0         0 $params{'-display_id'}. "\n$_");
459             }
460 443         1713 $_ = $self->_readline;
461             }
462 31   33     303 while ( defined($_) && ! /^SQ/ ) {
463 0         0 $_ = $self->_readline;
464             }
465 31         74 $seqc = "";
466 31         96 while ( defined ($_ = $self->_readline) ) {
467 242 100       563 last if m{^//};
468 211         1428 s/[^A-Za-z]//g;
469 211         673 $seqc .= uc($_);
470             }
471              
472 31         183 my $seq= $self->sequence_factory->create
473             (-verbose => $self->verbose,
474             %params,
475             -seq => $seqc,
476             -desc => $desc,
477             -features => \@features,
478             -annotation => $annotation,
479             );
480              
481             # The annotation doesn't get added by the constructor
482 31         164 $seq->annotation($annotation);
483              
484 31         416 return $seq;
485             }
486              
487             =head2 write_seq
488              
489             Title : write_seq
490             Usage : $stream->write_seq($seq)
491             Function: writes the $seq object (must be seq) to the stream
492             Returns : 1 for success and 0 for error
493             Args : array of 1 to n Bio::SeqI objects
494              
495              
496             =cut
497              
498             sub write_seq {
499 4     4 1 87 my ($self,@seqs) = @_;
500 4         15 foreach my $seq ( @seqs ) {
501 4 50       18 $self->throw("Attempting to write with no seq!") unless defined $seq;
502              
503 4 50 33     36 if ( ! ref $seq || ! $seq->isa('Bio::SeqI') ) {
504 0         0 $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!");
505             }
506              
507 4         11 my $i;
508 4         26 my $str = $seq->seq;
509              
510 4         9 my $div;
511 4   66     73 my $ns = ($seq->can('namespace')) && $seq->namespace();
512 4         23 my $len = $seq->length();
513              
514 4 100 66     45 if ( !$seq->can('division') || ! defined ($div = $seq->division()) ) {
515 1         4 $div = 'UNK';
516             }
517              
518             # namespace dictates database, takes precedent over division. Sorry!
519 4 100 66     32 if (defined($ns) && $ns ne '') {
520 3 0       17 $div = ($ns eq 'Swiss-Prot') ? 'Reviewed' :
    50          
521             ($ns eq 'TrEMBL') ? 'Unreviewed' :
522             $ns;
523             } else {
524 1         2 $ns = 'Swiss-Prot';
525             # division not reset; acts as fallback
526             }
527              
528 4 50       38 $self->warn("No whitespace allowed in SWISS-PROT display id [". $seq->display_id. "]")
529             if $seq->display_id =~ /\s/;
530              
531 4         11 my $temp_line;
532 4 50       19 if ( $self->_id_generation_func ) {
533 0         0 $temp_line = &{$self->_id_generation_func}($seq);
  0         0  
534             } else {
535             #$temp_line = sprintf ("%10s STANDARD; %3s; %d AA.",
536             # $seq->primary_id()."_".$div,$mol,$len);
537             # Reconstructing the ID relies heavily upon the input source having
538             # been in a format that is parsed as this routine expects it -- that is,
539             # by this module itself. This is bad, I think, and immediately breaks
540             # if e.g. the Bio::DB::GenPept module is used as input.
541             # Hence, switch to display_id(); _every_ sequence is supposed to have
542             # this. HL 2000/09/03
543             # Changed to reflect ID line changes in UniProt
544             # Oct 2006 - removal of molecule type - see bug 2134
545 4         12 $temp_line = sprintf ("%-24s%-12s%9d AA.",
546             $seq->display_id(), $div.';', $len);
547             }
548              
549 4         44 $self->_print( "ID $temp_line\n");
550              
551             # if there, write the accession line
552 4         25 local($^W) = 0; # suppressing warnings about uninitialized fields
553              
554 4 50       26 if ( $self->_ac_generation_func ) {
    50          
555 0         0 $temp_line = &{$self->_ac_generation_func}($seq);
  0         0  
556 0         0 $self->_print( "AC $temp_line\n");
557             }
558             elsif ($seq->can('accession_number') ) {
559 4         22 my $ac_line = $seq->accession_number;
560 4 100       30 if ($seq->can('get_secondary_accessions') ) {
561 3         16 foreach my $sacc ($seq->get_secondary_accessions) {
562 0         0 $ac_line .= "; ". $sacc;;
563             }
564 3         11 $ac_line .= ";";
565             }
566              
567 4         24 $self->_write_line_swissprot_regex("AC ","AC ",$ac_line,
568             "\\s\+\|\$",$LINE_LENGTH);
569             }
570             # otherwise - cannot print
571              
572              
573             # Date lines and sequence versions (changed 6/15/2006)
574             # This is rebuilt from scratch using the current SwissProt/UniProt format
575 4 100       37 if ( $seq->can('get_dates') ) {
576 3         21 my @dates = $seq->get_dates();
577 3         9 my $ct = 1;
578 3         15 my $seq_version = $seq->version;
579 3         13 my ($update_version) = $seq->annotation->get_Annotations("seq_update");
580 3         11 foreach my $dt (@dates) {
581 9 100       41 $self->_write_line_swissprot_regex("DT ","DT ",
582             $dt.', integrated into UniProtKB/'.$ns.'.',
583             "\\s\+\|\$",$LINE_LENGTH) if $ct == 1;
584 9 100       37 $self->_write_line_swissprot_regex("DT ","DT ",
585             $dt.", sequence version ".$update_version->display_text.'.',
586             "\\s\+\|\$",$LINE_LENGTH) if $ct == 2;
587 9 100       38 $self->_write_line_swissprot_regex("DT ","DT ",
588             $dt.", entry version $seq_version.",
589             "\\s\+\|\$",$LINE_LENGTH) if $ct == 3;
590 9         18 $ct++;
591             }
592             }
593              
594             #Definition lines
595 4         27 $self->_write_line_swissprot_regex("DE ","DE ",$seq->desc(),"\\s\+\|\$",$LINE_LENGTH);
596              
597             #Gene name; print out new format
598 4         17 foreach my $gene ( my @genes = $seq->annotation->get_Annotations('gene_name') ) {
599             # gene is a Bio::Annotation::TagTree;
600 3 100       25 if ($gene->isa('Bio::Annotation::TagTree')) {
601 2         4 my @genelines;
602 2         15 for my $node ($gene->findnode('gene_name')) {
603             # check for Name and Synonym first, then the rest get tagged on
604 2         464 my $geneline = "GN ";
605 2         21 my %genedata = $node->hash;
606 2         104 for my $tag (@GENE_NAME_ORDER) {
607 8 100       25 if (exists $genedata{$tag}) {
608             $geneline .= (ref $genedata{$tag} eq 'ARRAY') ?
609 2 50       18 "$tag=".join(', ',@{$genedata{$tag}})."; " :
  0         0  
610             "$tag=$genedata{$tag}; ";
611 2         8 delete $genedata{$tag};
612             }
613             }
614             # add rest
615 2         11 for my $tag (sort keys %genedata) {
616             $geneline .= (ref $genedata{$tag} eq 'ARRAY') ?
617 0 0       0 "$tag=".join(', ',@{$genedata{$tag}})."; " :
  0         0  
618             "$tag=$genedata{$tag}; ";
619 0         0 delete $genedata{$tag};
620             }
621 2         12 push @genelines, "$geneline\n";
622             }
623 2         14 $self->_print(join("GN and\n",@genelines));
624             } else { # fall back to getting stringified output
625 1         4 $self->_write_line_swissprot_regex("GN ","GN ",
626             $gene->display_text,
627             "\\s\+\|\$",
628             $LINE_LENGTH);
629             }
630             }
631              
632             # Organism lines
633 4 100 66     48 if ($seq->can('species') && (my $spec = $seq->species)) {
634 3         25 my @class = $spec->classification();
635 3         6 shift(@class);
636 3         17 my $species = $spec->species;
637 3         11 my $genus = $spec->genus;
638 3         20 my $OS = $spec->scientific_name;
639 3 50       17 if ($class[-1] =~ /viruses/i) {
640 0         0 $OS = $species;
641 0 0       0 $OS .= " ". $spec->sub_species if $spec->sub_species;
642             }
643 3         19 foreach (($spec->variant, $spec->common_name)) {
644 3 50       13 $OS .= " ($_)" if $_;
645             }
646 3         28 $self->_print( "OS $OS.\n");
647 3         19 my $OC = join('; ', reverse(@class)) .'.';
648 3         19 $self->_write_line_swissprot_regex("OC ","OC ",$OC,"\; \|\$",$LINE_LENGTH);
649 3 50       18 if ($spec->organelle) {
650 0         0 $self->_write_line_swissprot_regex("OG ","OG ",$spec->organelle,"\; \|\$",$LINE_LENGTH);
651             }
652 3 50       16 if ($spec->ncbi_taxid) {
653 3         13 $self->_print("OX NCBI_TaxID=".$spec->ncbi_taxid.";\n");
654             }
655             }
656              
657             # Reference lines
658 4         14 my $t = 1;
659 4         29 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
660 3         19 $self->_print( "RN [$t]\n");
661             # changed by lorenz 08/03/00
662             # j.gilbert and h.lapp agreed that the rp line in swissprot seems
663             # more like a comment than a parseable value, so print it as is
664 3 50       20 if ($ref->rp) {
665 3         17 $self->_write_line_swissprot_regex("RP ","RP ",$ref->rp,
666             "\\s\+\|\$",$LINE_LENGTH);
667             }
668 3 100       20 if ($ref->comment) {
669 2         6 $self->_write_line_swissprot_regex("RC ","RC ",$ref->comment,
670             "\\s\+\|\$",$LINE_LENGTH);
671             }
672 3 50 33     17 if ($ref->medline or $ref->pubmed or $ref->doi) {
      33        
673             # new RX format in swissprot LP 09/17/00
674             # RX line can now have a DOI, Heikki 13 Feb 2008
675              
676 0         0 my $line;
677 0 0       0 $line .= "MEDLINE=". $ref->medline. '; ' if $ref->medline;
678 0 0       0 $line .= "PubMed=". $ref->pubmed. '; ' if $ref->pubmed;
679 0 0       0 $line .= "DOI=". $ref->doi. '; ' if $ref->doi;
680 0         0 chop $line;
681              
682 0         0 $self->_write_line_swissprot_regex("RX ","RX ",
683             $line,
684             "\\s\+\|\$",$LINE_LENGTH);
685              
686             }
687 3 50       17 my $author = $ref->authors .';' if($ref->authors);
688 3 50       16 my $title = $ref->title .';' if( $ref->title);
689 3 50       16 my $rg = $ref->rg . ';' if $ref->rg;
690 3         40 $author =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
691              
692 3 50       14 $self->_write_line_swissprot_regex("RG ","RG ",$rg,"\\s\+\|\$",$LINE_LENGTH) if $rg;
693 3 50       18 $self->_write_line_swissprot_regex("RA ","RA ",$author,"\\s\+\|\$",$LINE_LENGTH) if $author;
694 3 50       11 $self->_write_line_swissprot_regex("RT ","RT ",$title,'[\s\-]+|$',$LINE_LENGTH) if $title;
695 3         16 $self->_write_line_swissprot_regex("RL ","RL ",$ref->location,"\\s\+\|\$",$LINE_LENGTH);
696 3         15 $t++;
697             }
698              
699             # Comment lines
700              
701 4         17 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
702 3         15 foreach my $cline (split ("\n", $comment->text)) {
703 48         81 while (length $cline > 74) {
704 0         0 $self->_print("CC ",(substr $cline,0,74),"\n");
705 0         0 $cline = substr $cline,74;
706             }
707 48         77 $self->_print("CC ",$cline,"\n");
708             }
709             }
710              
711             # Database xref lines
712              
713 4         18 foreach my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
714 27         56 my ($primary_id) = $dblink->primary_id;
715            
716 27 100 66     48 if (defined($dblink->comment) && ($dblink->comment) ) {
    50          
717 12         19 $self->_print("DR ",$dblink->database,"; ",$primary_id,"; ",
718             $dblink->optional_id,"; ",$dblink->comment,".\n");
719             } elsif ($dblink->optional_id) {
720 15         39 $self->_print("DR ",$dblink->database,"; ",
721             $primary_id,"; ",
722             $dblink->optional_id,".\n");
723             } else {
724 0         0 $self->_print("DR ",$dblink->database,
725             "; ",$primary_id,"; ","-.\n");
726             }
727             }
728              
729             # Evidence lines
730              
731 4         22 foreach my $evidence ( $seq->annotation->get_Annotations('evidence') ) {
732 0         0 $self->_print("PE ",$evidence->value,";\n");
733             }
734              
735             # if there, write the kw line
736             {
737 4         9 my $kw;
  4         6  
738 4 50       19 if ( my $func = $self->_kw_generation_func ) {
    100          
739 0         0 $kw = &{$func}($seq);
  0         0  
740             } elsif ( $seq->can('keywords') ) {
741 3         15 $kw = $seq->keywords;
742 3 50       24 if ( ref($kw) =~ /ARRAY/i ) {
743 0         0 $kw = join("; ", @$kw);
744             }
745 3 50 33     31 $kw .= '.' if $kw and $kw !~ /\.$/ ;
746             }
747 4         51 $kw =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
748 4 100       23 $self->_write_line_swissprot_regex("KW ","KW ",
749             $kw, "\\s\+\|\$",$LINE_LENGTH)
750             if $kw;
751             }
752              
753             #Check if there is seqfeatures before printing the FT line
754 4 50       47 my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
755 4 100       15 if ($feats[0]) {
756 3 50       14 if ( defined $self->_post_sort ) {
757              
758             # we need to read things into an array. Process. Sort them. Print 'em
759              
760 0         0 my $post_sort_func = $self->_post_sort();
761 0         0 my @fth;
762              
763 0         0 foreach my $sf ( @feats ) {
764 0         0 push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
765             }
766 0         0 @fth = sort { &$post_sort_func($a,$b) } @fth;
  0         0  
767              
768 0         0 foreach my $fth ( @fth ) {
769 0         0 $self->_print_swissprot_FTHelper($fth);
770             }
771             } else {
772             # not post sorted. And so we can print as we get them.
773             # lower memory load...
774              
775 3         9 foreach my $sf ( @feats ) {
776 9         35 my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
777 9         18 foreach my $fth ( @fth ) {
778 9 50       32 if ( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
779 0         0 $sf->throw("Cannot process FTHelper... $fth");
780             }
781 9         32 $self->_print_swissprot_FTHelper($fth);
782             }
783             }
784             }
785              
786 3 50       14 if ( $self->_show_dna() == 0 ) {
787 0         0 return;
788             }
789             }
790             # finished printing features.
791              
792             # molecular weight
793 4         9 my $mw = ${Bio::Tools::SeqStats->get_mol_wt($seq->primary_seq)}[0];
  4         21  
794             # checksum
795             # was crc32 checksum, changed it to crc64
796 4         27 my $crc64 = $self->_crc64(\$str);
797 4         112 $self->_print( sprintf("SQ SEQUENCE %4d AA; %d MW; %16s CRC64;\n",
798             $len,$mw,$crc64));
799 4         25 $self->_print( " ");
800 4         8 my $linepos;
801 4         33 for ($i = 0; $i < length($str); $i += 10) {
802 164         389 $self->_print( " ", substr($str,$i,10));
803 164         226 $linepos += 11;
804 164 100 66     428 if ( ($i+10)%60 == 0 && (($i+10) < length($str))) {
805 24         48 $self->_print( "\n ");
806             }
807             }
808 4         18 $self->_print( "\n//\n");
809              
810 4 50 33     22 $self->flush if $self->_flush_on_write && defined $self->_fh;
811 4         73 return 1;
812             }
813             }
814              
815             # Thanks to James Gilbert for the following two. LP 08/01/2000
816              
817             =head2 _generateCRCTable
818              
819             Title : _generateCRCTable
820             Usage :
821             Function:
822             Example :
823             Returns :
824             Args :
825              
826              
827             =cut
828              
829             sub _generateCRCTable {
830             # 10001000001010010010001110000100
831             # 32
832 0     0   0 my $poly = 0xEDB88320;
833 0         0 my ($self) = shift;
834              
835 0         0 $self->{'_crcTable'} = [];
836 0         0 foreach my $i (0..255) {
837 0         0 my $crc = $i;
838 0         0 for (my $j=8; $j > 0; $j--) {
839 0 0       0 if ($crc & 1) {
840 0         0 $crc = ($crc >> 1) ^ $poly;
841             } else {
842 0         0 $crc >>= 1;
843             }
844             }
845 0         0 ${$self->{'_crcTable'}}[$i] = $crc;
  0         0  
846             }
847             }
848              
849              
850             =head2 _crc32
851              
852             Title : _crc32
853             Usage :
854             Function:
855             Example :
856             Returns :
857             Args :
858              
859              
860             =cut
861              
862             sub _crc32 {
863 0     0   0 my( $self, $str ) = @_;
864              
865 0 0       0 $self->throw("Argument to crc32() must be ref to scalar")
866             unless ref($str) eq 'SCALAR';
867              
868 0 0       0 $self->_generateCRCTable() unless exists $self->{'_crcTable'};
869              
870 0         0 my $len = length($$str);
871              
872 0         0 my $crc = 0xFFFFFFFF;
873 0         0 for (my $i = 0; $i < $len; $i++) {
874             # Get upper case value of each letter
875 0         0 my $int = ord uc substr $$str, $i, 1;
876             $crc = (($crc >> 8) & 0x00FFFFFF) ^
877 0         0 ${$self->{'_crcTable'}}[ ($crc ^ $int) & 0xFF ];
  0         0  
878             }
879 0         0 return $crc;
880             }
881              
882             =head2 _crc64
883              
884             Title : _crc64
885             Usage :
886             Function:
887             Example :
888             Returns :
889             Args :
890              
891              
892             =cut
893              
894             sub _crc64{
895 4     4   16 my ($self, $sequence) = @_;
896 4         10 my $POLY64REVh = 0xd8000000;
897 4         11 my @CRCTableh = 256;
898 4         8 my @CRCTablel = 256;
899 4         8 my $initialized;
900              
901 4         10 my $seq = $$sequence;
902              
903 4         6 my $crcl = 0;
904 4         9 my $crch = 0;
905 4 50       14 if (!$initialized) {
906 4         6 $initialized = 1;
907 4         19 for (my $i=0; $i<256; $i++) {
908 1024         1048 my $partl = $i;
909 1024         930 my $parth = 0;
910 1024         1443 for (my $j=0; $j<8; $j++) {
911 8192         8502 my $rflag = $partl & 1;
912 8192         7437 $partl >>= 1;
913 8192 50       10322 $partl |= (1 << 31) if $parth & 1;
914 8192         7402 $parth >>= 1;
915 8192 100       14621 $parth ^= $POLY64REVh if $rflag;
916             }
917 1024         1259 $CRCTableh[$i] = $parth;
918 1024         1617 $CRCTablel[$i] = $partl;
919             }
920             }
921              
922 4         476 foreach (split '', $seq) {
923 1636         1710 my $shr = ($crch & 0xFF) << 24;
924 1636         1581 my $temp1h = $crch >> 8;
925 1636         1562 my $temp1l = ($crcl >> 8) | $shr;
926 1636         2062 my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF;
927 1636         1859 $crch = $temp1h ^ $CRCTableh[$tableindex];
928 1636         2050 $crcl = $temp1l ^ $CRCTablel[$tableindex];
929             }
930 4         148 my $crc64 = sprintf("%08X%08X", $crch, $crcl);
931 4         87 return $crc64;
932             }
933              
934             =head2 _print_swissprot_FTHelper
935              
936             Title : _print_swissprot_FTHelper
937             Usage :
938             Function:
939             Example :
940             Returns :
941             Args :
942              
943              
944             =cut
945              
946             sub _print_swissprot_FTHelper {
947 9     9   22 my ($self,$fth,$always_quote) = @_;
948 9   50     118 $always_quote ||= 0;
949 9         21 my ($start,$end) = ('?', '?');
950              
951 9 50 33     53 if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
952 0         0 $fth->warn("$fth is not a FTHelper class. ".
953             "Attempting to print, but there could be tears!");
954             }
955 9         17 my $desc = "";
956              
957 9         20 for my $tag ( qw(description gene note product) ) {
958 9 50       26 if ( exists $fth->field->{$tag} ) {
959 9         15 $desc = @{$fth->field->{$tag}}[0].".";
  9         24  
960 9         20 last;
961             }
962             }
963 9         57 $desc =~ s/\.$//;
964              
965 9         21 my $ftid = "";
966 9 50       28 if ( exists $fth->field->{'FTId'} ) {
967 0         0 $ftid = @{$fth->field->{'FTId'}}[0]. '.';
  0         0  
968             }
969              
970 9         26 my $key =substr($fth->key,0,8);
971 9         23 my $loc = $fth->loc;
972 9 100       59 if ( $loc =~ /(\?|\d+|\>\d+|<\d+)?\.\.(\?|\d+|<\d+|>\d+)?/ ) {
    50          
973 6 50       27 $start = $1 if defined $1;
974 6 50       31 $end = $2 if defined $2;
975              
976             # to_FTString only returns one value when start == end, #JB955
977             # so if no match is found, assume it is both start and end #JB955
978             } elsif ( $loc =~ /join\((\d+)((?:,\d+)+)?\)/) {
979 0         0 my @y = ($1);
980 0 0       0 if ( defined( my $m = $2) ) {
981 0         0 $m =~ s/^\,//;
982 0         0 push @y, split(/,/,$m);
983             }
984 0         0 for my $x ( @y ) {
985 0         0 $self->_write_line_swissprot_regex(
986             sprintf("FT %-8s %6s %6s ",
987             $key,
988             $x ,$x),
989             "FT ",
990             $desc.'.','\s+|$',$LINE_LENGTH);
991             }
992 0         0 return;
993             } else {
994 3         13 $start = $end = $fth->loc;
995             }
996 9 50       19 if ($desc) {
997 9         86 $self->_write_line_swissprot_regex(sprintf("FT %-8s %6s %6s ",
998             $key,
999             $start ,$end),
1000             "FT ",
1001             $desc. '.', '\s+|$', $LINE_LENGTH);
1002             } else { #HELIX and STRAND do not have descriptions
1003 0         0 $self->_write_line_swissprot_regex(sprintf("FT %-8s %6s %6s",
1004             $key,
1005             $start ,$end),
1006             "FT ",
1007             ' ', '\s+|$', $LINE_LENGTH);
1008             }
1009              
1010              
1011 9 50       64 if ($ftid) {
1012 0         0 $self->_write_line_swissprot_regex("FT ",
1013             "FT ",
1014             "/FTId=$ftid",'.|$',$LINE_LENGTH);
1015              
1016             }
1017              
1018             }
1019             #'
1020              
1021             =head2 _read_swissprot_References
1022              
1023             Title : _read_swissprot_References
1024             Usage :
1025             Function: Reads references from swissprot format. Internal function really
1026             Example :
1027             Returns :
1028             Args :
1029              
1030              
1031             =cut
1032              
1033             sub _read_swissprot_References{
1034 30     30   97 my ($self,$line) = @_;
1035 30         104 my ($b1, $b2, $rp, $rg, $title, $loc, $au, $med, $com, $pubmed, $doi);
1036 30         0 my @refs;
1037 30         71 local $_ = $line;
1038 30         91 while ( defined $_ ) {
1039 1813 100 100     10990 if ( /^[^R]/ || /^RN/ ) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    50          
1040 257 100       473 if ( $rp ) {
1041 227 100       417 $rg =~ s/;\s*$//g if defined($rg);
1042 227 100       335 if (defined($au)) {
1043 223         850 $au =~ s/;\s*$//;
1044             } else {
1045 4         10 $au = $rg;
1046             }
1047 227 100       883 $title =~ s/;\s*$//g if defined($title);
1048 227         1483 push @refs, Bio::Annotation::Reference->new
1049             (-title => $title,
1050             -start => $b1,
1051             -end => $b2,
1052             -authors => $au,
1053             -location=> $loc,
1054             -medline => $med,
1055             -pubmed => $pubmed,
1056             -doi => $doi,
1057             -comment => $com,
1058             -rp => $rp,
1059             -rg => $rg,
1060             -tagname => 'reference',
1061             );
1062             # reset state for the next reference
1063 227         514 $rp = '';
1064             }
1065 257 100       658 if (index($_,'R') != 0) {
1066 30         149 $self->_pushback($_); # want this line to go back on the list
1067 30         91 last; # may be the safest exit point HL 05/11/2000
1068             }
1069             # don't forget to reset the state for the next reference
1070 227         390 $b1 = $b2 = $rg = $med = $com = $pubmed = $doi = undef;
1071 227         284 $title = $loc = $au = undef;
1072             } elsif ( /^RP\s{3}(.+? OF (\d+)-(\d+).*)/) {
1073 35         133 $rp .= $1;
1074 35         77 $b1 = $2;
1075 35         93 $b2 = $3;
1076             } elsif ( /^RP\s{3}(.*)/) {
1077 198 100       388 if ($rp) {
1078 6         23 $rp .= " ".$1;
1079             } else {
1080 192         508 $rp = $1;
1081             }
1082             } elsif (/^RX\s{3}(.*)/) { # each reference can have only one RX line
1083 177         389 my $line = $1;
1084 177 100       705 $med = $1 if $line =~ /MEDLINE=(\d+);/;
1085 177 100       639 $pubmed = $1 if $line =~ /PubMed=(\d+);/;
1086 177 100       443 $doi = $1 if $line =~ /DOI=(.+);/;
1087             } elsif ( /^RA\s{3}(.*)/ ) {
1088 382 100       1132 $au .= $au ? " $1" : $1;
1089             } elsif ( /^RG\s{3}(.*)/ ) {
1090 16 50       65 $rg .= $rg ? " $1" : $1;
1091             } elsif ( /^RT\s{3}(.*)/ ) {
1092 381 100       626 if ($title) {
1093 177         317 my $tline = $1;
1094 177 100       745 $title .= ($title =~ /[\w;,:\?!]$/) ? " $tline" : $tline;
1095             } else {
1096 204         377 $title = $1;
1097             }
1098             } elsif (/^RL\s{3}(.*)/ ) {
1099 227 50       661 $loc .= $loc ? " $1" : $1;
1100             } elsif ( /^RC\s{3}(.*)/ ) {
1101 140 50       432 $com .= $com ? " $1" : $1;
1102             }
1103 1783         3386 $_ = $self->_readline;
1104             }
1105 30         129 return \@refs;
1106             }
1107              
1108              
1109             =head2 _read_swissprot_Species
1110              
1111             Title : _read_swissprot_Species
1112             Usage :
1113             Function: Reads the swissprot Organism species and classification
1114             lines.
1115             Able to deal with unconventional species names.
1116             Example : OS Unknown prokaryotic organism
1117             $genus = undef ; $species = Unknown prokaryotic organism
1118             Returns : A Bio::Species object
1119             Args :
1120              
1121             =cut
1122              
1123             sub _read_swissprot_Species {
1124 30     30   76 my( $self,$line ) = @_;
1125 30         47 my $org;
1126 30         75 local $_ = $line;
1127              
1128 30         76 my( $sub_species, $species, $genus, $common, $variant, $ncbi_taxid, $sci_name, $class_lines, $descr );
1129 30         53 my $osline = "";
1130 30         44 my $do_genus_check = 1;
1131 30         95 while ( defined $_ ) {
1132 183 100       526 last unless /^O[SCGX]/;
1133             # believe it or not, but OS may come multiple times -- at this time
1134             # we can't capture multiple species
1135 153 100 100     978 if (/^OS\s+(\S.+)/ && (! defined($sci_name))) {
    100 100        
    50          
    100          
1136 30 50       92 $osline .= " " if $osline;
1137 30         107 $osline .= $1;
1138 30 50       291 if ($osline =~ s/(,|, and|\.)$//) {
1139             # OS lines are usually like:
1140             # Homo sapiens (human)
1141             # where we have $sci_name followed by $descr (common name) in
1142             # brackets, but we can also have:
1143             # Venerupis (Ruditapes) philippinarum
1144             # where we have brackets but they don't indicate a $descr
1145 30 50       147 if ($osline =~ /[^\(\)]+\(.+\)[^\(\)]+$/) {
1146             #*** Danger! no idea if this will pick up some syntaxes for
1147             # common names as well)
1148 0         0 $sci_name = $osline;
1149 0         0 $sci_name =~ s/\.$//;
1150 0         0 $descr = '';
1151 0         0 $do_genus_check = 0;
1152             } else {
1153 30         166 ($sci_name, $descr) = $osline =~ /(\S[^\(]+)(.*)/;
1154             }
1155 30         133 $sci_name =~ s/\s+$//;
1156              
1157 30         153 while ($descr =~ /\(([^\)]+)\)/g) {
1158 13         33 my $item = $1;
1159             # strain etc may not necessarily come first (yes, swissprot
1160             # is messy)
1161 13 50 33     323 if ((! defined($variant)) &&
    50 33        
    50          
1162             (($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) ||
1163             ($item =~ /^(biovar|pv\.|type\s+)/))) {
1164 0         0 $variant = $item;
1165             } elsif ($item =~ s/^subsp\.\s+//) {
1166 0 0       0 if (! $sub_species) {
    0          
1167 0         0 $sub_species = $item;
1168             } elsif (! $variant) {
1169 0         0 $variant = $item;
1170             }
1171             } elsif (! defined($common)) {
1172             # we're only interested in the first common name
1173 13         28 $common = $item;
1174 13 50 33     82 if ((index($common, '(') >= 0) &&
1175             (index($common, ')') < 0)) {
1176 0         0 $common .= ')';
1177             }
1178             }
1179             }
1180             }
1181             } elsif (s/^OC\s+(\S.+)$//) {
1182 62         168 $class_lines .= $1;
1183             } elsif (/^OG\s+(.*)/) {
1184 0         0 $org = $1;
1185             } elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) {
1186 29         74 my $taxstring = $1;
1187             # we only keep the first one and ignore all others
1188 29 50       150 if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) {
1189 29         78 $ncbi_taxid = $1;
1190             } else {
1191 0         0 $self->throw("$taxstring doesn't look like NCBI_TaxID");
1192             }
1193             }
1194 153         317 $_ = $self->_readline;
1195             }
1196 30         141 $self->_pushback($_); # pushback the last line because we need it
1197              
1198 30 50       98 $sci_name || return;
1199              
1200             # if the organism belongs to taxid 32644 then no Bio::Species object.
1201 30 50       89 return if grep { $_ eq $sci_name } @Unknown_names;
  330         524  
1202              
1203             # Convert data in classification lines into classification array.
1204             # Remove trailing . then split on ';' or '.;' so that classification that is 2
1205             # or more words will still get matched, use map() to remove trailing/leading/intervening
1206             # spaces
1207 30         170 $class_lines=~s/\.\s*$//;
1208 30         332 my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /[;\.]*;/, $class_lines;
  298         569  
  298         446  
  298         355  
  298         429  
1209              
1210 30 50       187 if ($class[0] =~ /viruses/i) {
    50          
1211             # viruses have different OS/OC syntax
1212 0         0 my @virusnames = split(/\s+/, $sci_name);
1213 0 0       0 $species = (@virusnames > 1) ? pop(@virusnames) : '';
1214 0         0 $genus = join(" ", @virusnames);
1215 0         0 $sub_species = $descr;
1216             } elsif ($do_genus_check) {
1217             # do we have a genus?
1218 30         80 my $possible_genus = $class[-1];
1219 30 50       125 $possible_genus .= "|$class[-2]" if $class[-2];
1220 30 50       1129 if ($sci_name =~ /^($possible_genus)/) {
1221 30         104 $genus = $1;
1222 30         374 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1223             } else {
1224 0         0 $species = $sci_name;
1225             }
1226             # is this organism of rank species or is it lower?
1227             # (doesn't catch everything, but at least the guess isn't dangerous)
1228 30 50 33     250 if ($species && $species =~ /subsp\.|var\./) {
1229 0         0 ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1230             }
1231             }
1232              
1233             # Bio::Species array needs array in Species -> Kingdom direction
1234 30 50       109 unless ($class[-1] eq $sci_name) {
1235 30         79 push(@class, $sci_name);
1236             }
1237 30         61 @class = reverse @class;
1238              
1239 30         238 my $taxon = Bio::Species->new();
1240 30         149 $taxon->scientific_name($sci_name);
1241 30         125 $taxon->classification(@class);
1242 30 100       138 $taxon->common_name($common) if $common;
1243 30 50       89 $taxon->sub_species($sub_species) if $sub_species;
1244 30 50       88 $taxon->organelle($org) if $org;
1245 30 100       186 $taxon->ncbi_taxid($ncbi_taxid) if $ncbi_taxid;
1246 30 50       80 $taxon->variant($variant) if $variant;
1247              
1248             # done
1249 30         208 return $taxon;
1250             }
1251              
1252             =head2 _filehandle
1253              
1254             Title : _filehandle
1255             Usage : $obj->_filehandle($newval)
1256             Function:
1257             Example :
1258             Returns : value of _filehandle
1259             Args : newvalue (optional)
1260              
1261              
1262             =cut
1263              
1264             # inherited from SeqIO.pm ! HL 05/11/2000
1265              
1266             =head2 _read_FTHelper_swissprot
1267              
1268             Title : _read_FTHelper_swissprot
1269             Usage : _read_FTHelper_swissprot(\$buffer)
1270             Function: reads the next FT key line
1271             Example :
1272             Returns : Bio::SeqIO::FTHelper object
1273             Args :
1274              
1275              
1276             =cut
1277              
1278             sub _read_FTHelper_swissprot {
1279 443     443   802 my ($self,$line ) = @_;
1280             # initial version implemented by HL 05/10/2000
1281             # FIXME this may not be perfect, so please review
1282             # lots of cleaning up by JES 2004/07/01, still may not be perfect =)
1283             # FTId now sepated from description as a qualifier
1284              
1285 443         720 local $_ = $line;
1286 443         640 my ($key, # The key of the feature
1287             $loc, # The location line from the feature
1288             $desc, # The descriptive text
1289             $ftid, # feature Id is like a qualifier but there can be only one of them
1290             );
1291 443 50       2154 if ( m/^FT\s{3}(\w+)\s+([\d\?\<]+)\s+([\d\?\>]+)\s*(.*)$/ox) {
1292 443         1529 $key = $1;
1293 443         713 my $loc1 = $2;
1294 443         641 my $loc2 = $3;
1295 443         922 $loc = "$loc1..$loc2";
1296 443 100 66     1913 if ($4 && (length($4) > 0)) {
1297 384         662 $desc = $4;
1298 384         725 chomp($desc);
1299             } else {
1300 59         96 $desc = "";
1301             }
1302             }
1303              
1304 443   66     1099 while ( defined($_ = $self->_readline) && /^FT\s{20,}(\S.*)$/ ) {
1305 312         951 my $continuation_line = $1;
1306 312 100       1256 if ( $continuation_line =~ /.FTId=(.*)\./ ) {
    50          
1307 272         570 $ftid=$1;
1308             }
1309             elsif ( $desc) {
1310 40         115 $desc .= " $continuation_line";
1311             } else {
1312 0         0 $desc = $continuation_line;
1313             }
1314 312         855 chomp $desc;
1315             }
1316 443         1351 $self->_pushback($_);
1317 443 50       855 unless( $key ) {
1318             # No feature key. What's this?
1319 0         0 $self->warn("No feature key in putative feature table line: $line");
1320 0         0 return;
1321             }
1322              
1323             # Make the new FTHelper object
1324 443         1109 my $out = Bio::SeqIO::FTHelper->new(-verbose => $self->verbose());
1325 443         1331 $out->key($key);
1326 443         1070 $out->loc($loc);
1327              
1328             # store the description if there is one
1329 443 100 66     1460 if ( $desc && length($desc) ) {
1330 384         1749 $desc =~ s/\.$//;
1331 384         733 push(@{$out->field->{"description"}}, $desc);
  384         842  
1332             }
1333             # Store the qualifier i.e. FTId
1334 443 100       984 if ( $ftid ) {
1335 272         340 push(@{$out->field->{"FTId"}}, $ftid);
  272         517  
1336             }
1337 443         1064 return $out;
1338             }
1339              
1340              
1341             =head2 _write_line_swissprot
1342              
1343             Title : _write_line_swissprot
1344             Usage :
1345             Function: internal function
1346             Example :
1347             Returns :
1348             Args :
1349              
1350              
1351             =cut
1352              
1353             sub _write_line_swissprot{
1354 0     0   0 my ($self,$pre1,$pre2,$line,$length) = @_;
1355              
1356 0 0       0 $length || $self->throw( "Miscalled write_line_swissprot without length. Programming error!");
1357 0         0 my $subl = $length - length $pre2;
1358 0         0 my $linel = length $line;
1359 0         0 my $i;
1360              
1361 0         0 my $sub = substr($line,0,$length - length $pre1);
1362              
1363 0         0 $self->_print( "$pre1$sub\n");
1364              
1365 0         0 for ($i= ($length - length $pre1);$i < $linel;) {
1366 0         0 $sub = substr($line,$i,($subl));
1367 0         0 $self->_print( "$pre2$sub\n");
1368 0         0 $i += $subl;
1369             }
1370              
1371             }
1372              
1373             =head2 _write_line_swissprot_regex
1374              
1375             Title : _write_line_swissprot_regex
1376             Usage :
1377             Function: internal function for writing lines of specified
1378             length, with different first and the next line
1379             left hand headers and split at specific points in the
1380             text
1381             Example :
1382             Returns : nothing
1383             Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1384              
1385              
1386             =cut
1387              
1388             sub _write_line_swissprot_regex {
1389 44     44   108 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1390              
1391             #print STDOUT "Going to print with $line!\n";
1392              
1393 44 50       88 $length || $self->throw( "Miscalled write_line_swissprot without length. Programming error!");
1394              
1395 44 50       94 if ( length $pre1 != length $pre2 ) {
1396 0         0 $self->warn( "len 1 is ". length ($pre1) . " len 2 is ". length ($pre2) . "\n");
1397 0         0 $self->throw( "Programming error - cannot called write_line_swissprot_regex with different length \npre1 ($pre1) and \npre2 ($pre2) tags!");
1398             }
1399              
1400 44         82 my $subl = $length - (length $pre1) -1 ;
1401              
1402 44         56 my $first_line = 1;
1403 44         544 while ($line =~ m/(.{1,$subl})($regex)/g) {
1404 54         170 my $s = $1.$2;
1405 54 100 100     269 $s =~ s/([\w\.])#(\w)/$1 $2/g # remove word wrap protection char '#'
1406             if $pre1 eq "RA " or $pre1 eq "KW ";
1407             # remove annoying extra spaces at the end of the wrapped lines
1408 54 100       147 substr($s, -1, 1, '') if substr($s, -1, 1) eq ' ';
1409 54 100       101 if ($first_line) {
1410 44         169 $self->_print( "$pre1$s\n");
1411 44         259 $first_line = 0;
1412             } else {
1413 10         41 $self->_print( "$pre2$s\n");
1414             }
1415              
1416             }
1417             }
1418              
1419             =head2 _post_sort
1420              
1421             Title : _post_sort
1422             Usage : $obj->_post_sort($newval)
1423             Function:
1424             Returns : value of _post_sort
1425             Args : newvalue (optional)
1426              
1427              
1428             =cut
1429              
1430             sub _post_sort{
1431 3     3   7 my $obj = shift;
1432 3 50       19 if ( @_ ) {
1433 0         0 my $value = shift;
1434 0         0 $obj->{'_post_sort'} = $value;
1435             }
1436 3         12 return $obj->{'_post_sort'};
1437              
1438             }
1439              
1440             =head2 _show_dna
1441              
1442             Title : _show_dna
1443             Usage : $obj->_show_dna($newval)
1444             Function:
1445             Returns : value of _show_dna
1446             Args : newvalue (optional)
1447              
1448              
1449             =cut
1450              
1451             sub _show_dna{
1452 26     26   61 my $obj = shift;
1453 26 100       97 if ( @_ ) {
1454 23         49 my $value = shift;
1455 23         71 $obj->{'_show_dna'} = $value;
1456             }
1457 26         74 return $obj->{'_show_dna'};
1458              
1459             }
1460              
1461             =head2 _id_generation_func
1462              
1463             Title : _id_generation_func
1464             Usage : $obj->_id_generation_func($newval)
1465             Function:
1466             Returns : value of _id_generation_func
1467             Args : newvalue (optional)
1468              
1469              
1470             =cut
1471              
1472             sub _id_generation_func{
1473 4     4   9 my $obj = shift;
1474 4 50       13 if ( @_ ) {
1475 0         0 my $value = shift;
1476 0         0 $obj->{'_id_generation_func'} = $value;
1477             }
1478 4         16 return $obj->{'_id_generation_func'};
1479              
1480             }
1481              
1482             =head2 _ac_generation_func
1483              
1484             Title : _ac_generation_func
1485             Usage : $obj->_ac_generation_func($newval)
1486             Function:
1487             Returns : value of _ac_generation_func
1488             Args : newvalue (optional)
1489              
1490              
1491             =cut
1492              
1493             sub _ac_generation_func{
1494 4     4   17 my $obj = shift;
1495 4 50       18 if ( @_ ) {
1496 0         0 my $value = shift;
1497 0         0 $obj->{'_ac_generation_func'} = $value;
1498             }
1499 4         47 return $obj->{'_ac_generation_func'};
1500              
1501             }
1502              
1503             =head2 _sv_generation_func
1504              
1505             Title : _sv_generation_func
1506             Usage : $obj->_sv_generation_func($newval)
1507             Function:
1508             Returns : value of _sv_generation_func
1509             Args : newvalue (optional)
1510              
1511              
1512             =cut
1513              
1514             sub _sv_generation_func{
1515 0     0   0 my $obj = shift;
1516 0 0       0 if ( @_ ) {
1517 0         0 my $value = shift;
1518 0         0 $obj->{'_sv_generation_func'} = $value;
1519             }
1520 0         0 return $obj->{'_sv_generation_func'};
1521              
1522             }
1523              
1524             =head2 _kw_generation_func
1525              
1526             Title : _kw_generation_func
1527             Usage : $obj->_kw_generation_func($newval)
1528             Function:
1529             Returns : value of _kw_generation_func
1530             Args : newvalue (optional)
1531              
1532              
1533             =cut
1534              
1535             sub _kw_generation_func{
1536 4     4   10 my $obj = shift;
1537 4 50       18 if ( @_ ) {
1538 0         0 my $value = shift;
1539 0         0 $obj->{'_kw_generation_func'} = $value;
1540             }
1541 4         44 return $obj->{'_kw_generation_func'};
1542              
1543             }
1544              
1545             1;