File Coverage

Bio/SeqIO/Handler/GenericRichSeqHandler.pm
Criterion Covered Total %
statement 324 411 78.8
branch 120 204 58.8
condition 37 61 60.6
subroutine 46 48 95.8
pod 11 12 91.6
total 538 736 73.1


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Chris Fields
7             #
8             # Copyright Chris Fields
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
17             data handler for GenBank/EMBL/UniProt (and other) sequence data
18              
19             =head1 SYNOPSIS
20              
21             # MyHandler is a GenericRichSeqHandler object.
22             # inside a parser (driver) constructor....
23              
24             $self->seq_handler($handler || MyHandler->new(-format => 'genbank'));
25              
26             # in next_seq() in driver...
27              
28             $hobj = $self->seqhandler();
29              
30             # roll data up into hashref chunks, pass off into Handler for processing...
31              
32             $hobj->data_handler($data);
33              
34             # or retrieve Handler methods and pass data directly to Handler methods...
35              
36             my $hmeth = $hobj->handler_methods;
37              
38             if ($hmeth->{ $data->{NAME} }) {
39             my $mth = $hmeth->{ $data->{NAME} };
40             $hobj->$mth($data);
41             }
42              
43             =head1 DESCRIPTION
44              
45             This is an experimental implementation of a sequence-based HandlerBaseI parser
46             and may change over time. It is possible (nay, likely) that the way handler
47             methods are set up will change over development to allow more flexibility.
48             Release pumpkins, please do not add this to a release until the API has settled.
49             It is also likely that write_seq() will not work properly for some data.
50              
51             Standard Developer caveats:
52              
53             Do not use for production purposes.
54             Not responsible for destroying (your data|computer|world).
55             Do not stare directly at GenericRichSeqHandler.
56             If GenericRichSeqHandler glows, back slowly away and call for help.
57              
58             Consider yourself warned!
59              
60             This class acts as a demonstration on how to handle similar data chunks derived
61             from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
62             using similar (or the same) handler methods.
63              
64             The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
65             t/swiss.t yet all use the same handler methods (the collected tests for handlers
66             can be found in t/Handler.t). Some tweaking of the methods themselves is
67             probably in order over the long run to ensure that data is consistently handled
68             for each parser. Round-trip tests are probably in order here...
69              
70             Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
71             bypassing of data based on builder slots has been implemented (yet); this is
72             planned in the near future.
73              
74             As a reminder: this is the current Annotation data chunk (via Data::Dumper):
75              
76             $VAR1 = {
77             'NAME' => 'REFERENCE',
78             'DATA' => '1 (bases 1 to 10001)'
79             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
80             'TITLE' => 'The DNA sequence of Homo sapiens'
81             'JOURNAL' => 'Unpublished (2003)'
82             };
83             ...
84              
85             This is the current SeqFeature data chunk (again via Data::Dumper):
86              
87             $VAR1 = {
88             'mol_type' => 'genomic DNA',
89             'LOCATION' => '<1..>10001',
90             'NAME' => 'FEATURES',
91             'FEATURE_KEY' => 'source',
92             'note' => 'Accession AL451081 sequenced by The Sanger Centre',
93             'db_xref' => 'taxon:9606',
94             'clone' => 'RP11-302I18',
95             'organism' => 'Homo sapiens'
96             };
97              
98             =head1 FEEDBACK
99              
100             =head2 Mailing Lists
101              
102             User feedback is an integral part of the evolution of this and other
103             Bioperl modules. Send your comments and suggestions preferably to one
104             of the Bioperl mailing lists. Your participation is much appreciated.
105              
106             bioperl-l@bioperl.org - General discussion
107             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108              
109             =head2 Support
110              
111             Please direct usage questions or support issues to the mailing list:
112              
113             I
114              
115             rather than to the module maintainer directly. Many experienced and
116             reponsive experts will be able look at the problem and quickly
117             address it. Please include a thorough description of the problem
118             with code and data examples if at all possible.
119              
120             =head2 Reporting Bugs
121              
122             Report bugs to the Bioperl bug tracking system to help us keep track
123             the bugs and their resolution. Bug reports can be submitted via the
124             web:
125              
126             https://github.com/bioperl/bioperl-live/issues
127              
128             =head1 AUTHOR - Chris Fields
129              
130             Email cjfields at bioperl dot org
131              
132             =head1 APPENDIX
133              
134             The rest of the documentation details each of the object methods. Internal
135             methods are usually preceded with a _
136              
137             =cut
138              
139             # Let the code begin...
140              
141             package Bio::SeqIO::Handler::GenericRichSeqHandler;
142 1     1   6 use strict;
  1         1  
  1         23  
143 1     1   4 use warnings;
  1         1  
  1         20  
144              
145 1     1   592 use Bio::SeqIO::FTHelper;
  1         3  
  1         24  
146 1     1   6 use Bio::Annotation::Collection;
  1         1  
  1         17  
147 1     1   706 use Bio::Annotation::DBLink;
  1         2  
  1         28  
148 1     1   589 use Bio::Annotation::Comment;
  1         2  
  1         23  
149 1     1   597 use Bio::Annotation::Reference;
  1         2  
  1         26  
150 1     1   6 use Bio::Annotation::Collection;
  1         1  
  1         15  
151 1     1   5 use Bio::Annotation::SimpleValue;
  1         2  
  1         16  
152 1     1   662 use Bio::Annotation::TagTree;
  1         2  
  1         28  
153 1     1   7 use Bio::SeqFeature::Generic;
  1         1  
  1         17  
154 1     1   624 use Bio::Species;
  1         2  
  1         25  
155 1     1   6 use Bio::Taxon;
  1         2  
  1         14  
156 1     1   4 use Bio::DB::Taxonomy;
  1         2  
  1         15  
157 1     1   4 use Bio::Factory::FTLocationFactory;
  1         2  
  1         20  
158 1     1   4 use Data::Dumper;
  1         2  
  1         47  
159              
160 1     1   4 use base qw(Bio::Root::Root Bio::HandlerBaseI);
  1         2  
  1         662  
161              
162             my %HANDLERS = (
163             'genbank' => {
164             'LOCUS' => \&_genbank_locus,
165             'DEFINITION' => \&_generic_description,
166             'ACCESSION' => \&_generic_accession,
167             'VERSION' => \&_generic_version,
168             'KEYWORDS' => \&_generic_keywords,
169             'DBSOURCE' => \&_genbank_dbsource,
170             'DBLINK' => \&_genbank_dbsource,
171             'SOURCE' => \&_generic_species,
172             'REFERENCE' => \&_generic_reference,
173             'COMMENT' => \&_generic_comment,
174             'FEATURES' => \&_generic_seqfeatures,
175             'BASE' => \&noop, # this is generated from scratch
176             'ORIGIN' => \&_generic_seq,
177             # handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
178             '_DEFAULT_' => \&_generic_simplevalue,
179             },
180             'embl' => {
181             'ID' => \&_embl_id,
182             'DT' => \&_embl_date,
183             'DR' => \&_generic_dbsource,
184             'SV' => \&_generic_version,
185             'RN' => \&_generic_reference,
186             'KW' => \&_generic_keywords,
187             'DE' => \&_generic_description,
188             'AC' => \&_generic_accession,
189             #'AH' => \&noop, # TPA data not dealt with yet...
190             #'AS' => \&noop,
191             'SQ' => \&_generic_seq,
192             'OS' => \&_generic_species,
193             'CC' => \&_generic_comment,
194             'FT' => \&_generic_seqfeatures,
195             # handles anything else (WGS, TPA, ANN...)
196             '_DEFAULT_' => \&_generic_simplevalue,
197             },
198             'swiss' => {
199             'ID' => \&_swiss_id,
200             'DT' => \&_swiss_date,
201             'GN' => \&_swiss_genename,
202             'DR' => \&_generic_dbsource,
203             'RN' => \&_generic_reference,
204             'KW' => \&_generic_keywords,
205             'DE' => \&_generic_description,
206             'AC' => \&_generic_accession,
207             'SQ' => \&_generic_seq,
208             'OS' => \&_generic_species,
209             'CC' => \&_generic_comment,
210             'FT' => \&_generic_seqfeatures,
211             # handles anything else, though I don't know what...
212             '_DEFAULT_' => \&_generic_simplevalue,
213             },
214             );
215              
216             # can we do this generically? Seems like a lot of trouble...
217             my %DBSOURCE = map {$_ => 1} qw(
218             EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
219             TIGR GO InterPro Pfam PROSITE SGD GermOnline
220             HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
221             H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
222             MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
223             UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
224             PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
225             Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
226             TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
227             PhotoList Gramene WormBase WormPep Genew ZFIN
228             PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
229             swissprot GenBank GenPept REFSEQ embl PDB UniProtKB);
230              
231             my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
232              
233             our %VALID_ALPHABET = (
234             'bp' => 'dna',
235             'aa' => 'protein',
236             'rc' => '' # rc = release candidate; file has no sequences
237             );
238              
239             =head2 new
240              
241             Title : new
242             Usage :
243             Function:
244             Returns :
245             Args : -format Sequence format to be mapped for handler methods
246             -builder Bio::Seq::SeqBuilder object (normally defined in
247             SequenceStreamI object implementation constructor)
248             Throws : On undefined '-format' sequence format parameter
249             Note : Still under heavy development
250              
251             =cut
252              
253             sub new {
254 42     42 1 185 my ($class, @args) = @_;
255 42         165 my $self = $class->SUPER::new(@args);
256 42         234 $self = {@args};
257 42         89 bless $self,$class;
258 42         170 my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
259 42 50       153 $self->throw("Must define sequence record format") if !$format;
260 42         218 $self->format($format);
261 42         171 $self->handler_methods();
262 42 50       220 $builder && $self->seqbuilder($builder);
263 42         170 $self->location_factory();
264 42         227 return $self;
265             }
266              
267             =head1 L implementing methods
268              
269             =head2 handler_methods
270              
271             Title : handler_methods
272             Usage : $handler->handler_methods('GenBank')
273             %handlers = $handler->handler_methods();
274             Function: Retrieve the handler methods used for the current format() in
275             the handler. This assumes the handler methods are already
276             described in the HandlerI-implementing class.
277             Returns : a hash reference with the data type handled and the code ref
278             associated with it.
279             Args : [optional] String representing the sequence format. If set here
280             this will also set sequence_format()
281             Throws : On unimplemented sequence format in %HANDLERS
282              
283             =cut
284              
285             sub handler_methods {
286 75     75 1 132 my $self = shift;
287 75 100       189 if (!($self->{'handlers'})) {
288             $self->throw("No handlers defined for seqformat ",$self->format)
289 42 50       123 unless exists $HANDLERS{$self->format};
290 42         121 $self->{'handlers'} = $HANDLERS{$self->format};
291             }
292 75         156 return ($self->{'handlers'});
293             }
294              
295             =head2 data_handler
296              
297             Title : data_handler
298             Usage : $handler->data_handler($data)
299             Function: Centralized method which accepts all data chunks, then distributes
300             to the appropriate methods for processing based on the chunk name
301             from within the HandlerBaseI object.
302              
303             One can also use
304             Returns : None
305             Args : an hash ref containing a data chunk.
306              
307             =cut
308              
309             sub data_handler {
310 1646     1646 1 3476 my ($self, $data) = @_;
311 1646   33     4039 my $nm = $data->{NAME} || $self->throw("No name tag defined!");
312            
313             # this should handle data on the fly w/o caching; any caching should be
314             # done in the driver!
315             my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
316 1646 50       6349 (exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
    100          
317             undef;
318 1646 50       3093 if (!$method) {
319 0         0 $self->debug("No handler defined for $nm\n");
320 0         0 return;
321             };
322 1646         4154 $self->$method($data);
323             }
324              
325             =head2 reset_parameters
326              
327             Title : reset_parameters
328             Usage : $handler->reset_parameters()
329             Function: Resets the internal cache of data (normally object parameters for
330             a builder or factory)
331             Returns : None
332             Args : None
333              
334             =cut
335              
336             sub reset_parameters {
337 58     58 1 115 my $self = shift;
338 58         592 $self->{'_params'} = undef;
339             }
340              
341             =head2 format
342              
343             Title : format
344             Usage : $handler->format('GenBank')
345             Function: Get/Set the format for the report/record being parsed. This can be
346             used to set handlers in classes which are capable of processing
347             similar data chunks from multiple driver modules.
348             Returns : String with the sequence format
349             Args : [optional] String with the sequence format
350             Note : The format may be used to set the handlers (as in the
351             current GenericRichSeqHandler implementation)
352              
353             =cut
354              
355             sub format {
356 1301     1301 1 2114 my $self = shift;
357 1301 100       2871 return $self->{'_seqformat'} = lc shift if @_;
358 1259         3472 return $self->{'_seqformat'};
359             }
360              
361             =head2 get_params
362              
363             Title : get_params
364             Usage : $handler->get_params('-species')
365             Function: Convenience method used to retrieve the specified
366             parameters from the internal parameter cache
367             Returns : Hash ref containing parameters requested and data as
368             key-value pairs. Note that some parameter values may be
369             objects, arrays, etc.
370             Args : List (array) representing the parameters requested
371              
372             =cut
373              
374             sub get_params {
375 818     818 1 1647 my ($self, @ids) = @_;
376 818         1306 my %data;
377 818         1426 for my $id (@ids) {
378 818 50       2422 if (!index($id, '-')==0) {
379 818         2103 $id = '-'.$id ;
380             }
381 818 100       3653 $data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
382             }
383 818         2380 return \%data;
384             }
385              
386             =head2 set_params
387              
388             Title : set_params
389             Usage : $handler->set_param({'-species')
390             Function: Convenience method used to set specific parameters
391             Returns : None
392             Args : Hash ref containing the data to be passed as key-value pairs
393              
394             =cut
395              
396             sub set_params {
397 0     0 1 0 shift->throw('Not implemented yet!');
398             }
399              
400             =head1 Methods unique to this implementation
401              
402             =head2 seqbuilder
403              
404             Title : seqbuilder
405             Usage :
406             Function:
407             Returns :
408             Args :
409             Throws :
410             Note :
411              
412             =cut
413              
414             sub seqbuilder {
415 104     104 1 208 my $self = shift;
416 104 100       361 return $self->{'_seqbuilder'} = shift if @_;
417 62         175 return $self->{'_seqbuilder'};
418             }
419              
420             =head2 build_sequence
421              
422             Title : build_sequence
423             Usage :
424             Function:
425             Returns :
426             Args :
427             Throws :
428             Note :
429              
430             =cut
431              
432             sub build_sequence {
433 62     62 1 146 my $self = shift;
434 62         251 my $builder = $self->seqbuilder();
435 62         109 my $seq;
436 62 100       194 if (defined($self->{'_params'})) {
437 58         111 $builder->add_slot_value(%{ $self->{'_params'} });
  58         839  
438 58         340 $seq = $builder->make_object();
439 58         321 $self->reset_parameters;
440             }
441 62 100       1149 return $seq if $seq;
442 4         25 return 0;
443             }
444              
445             =head2 location_factory
446              
447             Title : location_factory
448             Usage :
449             Function:
450             Returns :
451             Args :
452             Throws :
453             Note :
454              
455             =cut
456              
457             sub location_factory {
458 42     42 1 108 my ($self, $factory) = @_;
459 42 50       213 if ($factory) {
    50          
460 0 0 0     0 $self->throw("Must have a Bio::Factory::LocationFactoryI when ".
461             "explicitly setting factory()") unless
462             (ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
463 0         0 $self->{'_locfactory'} = $factory;
464             } elsif (!defined($self->{'_locfactory'})) {
465 42         279 $self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new()
466             }
467 42         93 return $self->{'_locfactory'};
468             }
469              
470             =head2 annotation_collection
471              
472             Title : annotation_collection
473             Usage :
474             Function:
475             Returns :
476             Args :
477             Throws :
478             Note :
479              
480             =cut
481              
482             sub annotation_collection {
483 746     746 1 1516 my ($self, $coll) = @_;
484 746 50       2781 if ($coll) {
    100          
485 0 0 0     0 $self->throw("Must have Bio::AnnotationCollectionI ".
486             "when explicitly setting collection()")
487             unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
488 0         0 $self->{'_params'}->{'-annotation'} = $coll;
489             } elsif (!exists($self->{'_params'}->{'-annotation'})) {
490 57         536 $self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new()
491             }
492 746         3127 return $self->{'_params'}->{'-annotation'};
493             }
494              
495             ####################### SEQUENCE HANDLERS #######################
496              
497             # any sequence data
498             sub _generic_seq {
499 53     53   155 my ($self, $data) = @_;
500 53         435 $self->{'_params'}->{'-seq'} = $data->{DATA};
501             }
502              
503             ####################### RAW DATA HANDLERS #######################
504              
505             # GenBank LOCUS line
506             sub _genbank_locus {
507 31     31   71 my ($self, $data) = @_;
508 31         237 my (@tokens) = split m{\s+}, $data->{DATA};
509 31         71 my $display_id = shift @tokens;
510 31         118 $self->{'_params'}->{'-display_id'} = $display_id;
511 31         64 my $seqlength = shift @tokens;
512 31 50       94 if (exists $VALID_ALPHABET{$seqlength}) {
513             # moved one token too far. No locus name?
514             $self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
515 0         0 "] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
516 0         0 $self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
517 0         0 $self->{'_params'}->{'-display_id'} = 'unknown';
518             # add token back...
519 0         0 unshift @tokens, $seqlength;
520             } else {
521 31         79 $self->{'_params'}->{'-length'} = $seqlength;
522             }
523 31         67 my $alphabet = lc(shift @tokens);
524             $self->{'_params'}->{'-alphabet'} =
525 31 50       133 (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
526             $self->warn("Unknown alphabet: $alphabet");
527 31 50 66     142 if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) {
528 31         98 $self->{'_params'}->{'-molecule'} = shift(@tokens);
529 31         71 my $circ = shift(@tokens);
530 31 100       83 if ($circ eq 'circular') {
531 2         6 $self->{'_params'}->{'-is_circular'} = 1;
532 2         7 $self->{'_params'}->{'-division'} = shift(@tokens);
533             } else {
534             # 'linear' or 'circular' may actually be omitted altogether
535 29 100       96 $self->{'_params'}->{'-division'} =
536             (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
537             }
538             } else {
539 0 0       0 $self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
540 0         0 $self->{'_params'}->{'-division'} = shift(@tokens);
541             }
542 31         103 my $date = join(' ', @tokens);
543             # maybe use Date::Time for dates?
544 31 100 66     287 if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) {
545            
546 30 50       75 if( length($date) < 11 ) {
547             # improperly formatted date
548             # But we'll be nice and fix it for them
549 0         0 my ($d,$m,$y) = ($2,$3,$4);
550 0 0       0 if( length($d) == 1 ) {
551 0         0 $d = "0$d";
552             }
553             # guess the century here
554 0 0       0 if( length($y) == 2 ) {
555 0 0       0 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
556 0         0 $y = "19$y";
557             } else {
558 0         0 $y = "20$y";
559             }
560 0         0 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
561             }
562 0         0 $self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
563             } else {
564 30         107 $self->{'_params'}->{'-dates'} = [$date];
565             }
566             }
567             }
568              
569             # EMBL ID line
570             sub _embl_id {
571 10     10   33 my ($self, $data) = @_;
572 10         21 my $alphabet;
573 10         26 my ($name, $sv, $topology, $mol, $div);
574 10         28 my $line = $data->{DATA};
575             #$self->debug("$line\n");
576 10         34 my ($idtype) = $line =~ tr/;/;/;
577 10 100       61 if ( $idtype == 6) { # New style headers contain exactly six semicolons.
    100          
578             # New style header (EMBL Release >= 87, after June 2006)
579 1         5 my $topology;
580             my $sv;
581            
582             # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
583             # This regexp comes from the new2old.pl conversion script, from EBI
584 1 50       13 if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
585 1         9 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
586             } else {
587 0         0 $self->throw("Unrecognized EMBL ID line:[$line]");
588             }
589 1 50       5 if (defined($sv)) {
590 1         6 $self->{'_params'}->{'-seq_version'} = $sv;
591 1         4 $self->{'_params'}->{'-version'} = $sv;
592             }
593              
594 1 50       4 if ($topology eq "circular") {
595 0         0 $self->{'_params'}->{'-is_circular'} = 1;
596             }
597            
598 1 50       4 if (defined $mol ) {
599 1 50       8 if ($mol =~ /DNA/) {
    50          
    0          
600 0         0 $alphabet='dna';
601             }
602             elsif ($mol =~ /RNA/) {
603 1         4 $alphabet='rna';
604             }
605             elsif ($mol =~ /AA/) {
606 0         0 $alphabet='protein';
607             }
608             }
609             } elsif ($idtype) { # has internal ';'
610             # Old style header (EMBL Release < 87, before June 2006)
611 8 50       99 if ($line =~ m{^(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;}) {
612 8         52 ($name, $mol, $div) = ($1, $2, $3);
613             #$self->debug("[$name][$mol][$div]");
614             }
615              
616 8 50       41 if($mol) {
617 8 50       33 if ( $mol =~ m{circular} ) {
618 0         0 $self->{'_params'}->{'-is_circular'} = 1;
619 0         0 $mol =~ s{circular }{};
620             }
621 8 50       37 if (defined $mol ) {
622 8 100       53 if ($mol =~ /DNA/) {
    50          
    0          
623 7         30 $alphabet='dna';
624             }
625             elsif ($mol =~ /RNA/) {
626 1         5 $alphabet='rna';
627             }
628             elsif ($mol =~ /AA/) {
629 0         0 $alphabet='protein';
630             }
631             }
632             }
633             } else {
634 1         2 $name = $data->{DATA};
635             }
636 10 50 33     85 unless( defined $name && length($name) ) {
637 0         0 $name = "unknown_id";
638             }
639 10         69 $self->{'_params'}->{'-display_id'} = $name;
640 10         35 $self->{'_params'}->{'-alphabet'} = $alphabet;
641 10 100       62 $self->{'_params'}->{'-division'} = $div if $div;
642 10 100       71 $self->{'_params'}->{'-molecule'} = $mol if $mol;
643             }
644              
645             # UniProt/SwissProt ID line
646             sub _swiss_id {
647 17     17   55 my ($self, $data) = @_;
648 17         46 my ($name, $seq_div);
649 17 50       198 if($data->{DATA} =~ m{^
650             (\S+) \s+ # $1 entryname
651             ([^\s;]+); \s+ # $2 DataClass
652             (?:PRT;)? \s+ # Molecule Type (optional)
653             [0-9]+[ ]AA \. # Sequencelength (capture?)
654             $
655             }ox ) {
656 17         93 ($name, $seq_div) = ($1, $2);
657 17 50 100     197 $self->{'_params'}->{'-namespace'} =
    100 66        
658             ($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ? 'Swiss-Prot' :
659             ($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ? 'TrEMBL' :
660             $seq_div;
661             # we shouldn't be setting the division, but for now...
662 17         113 my ($junk, $division) = split q(_), $name;
663 17         87 $self->{'_params'}->{'-division'} = $division;
664 17         53 $self->{'_params'}->{'-alphabet'} = 'protein';
665             # this is important to have the id for display in e.g. FTHelper, otherwise
666             # you won't know which entry caused an error
667 17         97 $self->{'_params'}->{'-display_id'} = $name;
668             } else {
669 0         0 $self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]");
670             }
671             }
672              
673             # UniProt/SwissProt GN line
674             sub _swiss_genename {
675 17     17   51 my ($self, $data) = @_;
676             #$self->debug(Dumper($data));
677 17         68 my $genename = $data->{DATA};
678 17         31 my $gn;
679 17 50       73 if ($genename) {
680 17         45 my @stags;
681 17 100       98 if ($genename =~ /\w=\w/) {
682             # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
683 10         57 for my $n (split(m{\s+and\s+},$genename)) {
684 12         26 my @genenames;
685 12         136 for my $section (split(m{\s*;\s*},$n)) {
686 15         60 my ($tag, $rest) = split("=",$section);
687 15   50     45 $rest ||= '';
688 15         48 for my $val (split(m{\s*,\s*},$rest)) {
689 19         73 push @genenames, [$tag => $val];
690             }
691             }
692 12         58 push @stags, ['gene_name' => \@genenames];
693             }
694             } else {
695             # old format
696 7         38 for my $section (split(/ AND /, $genename)) {
697 9         19 my @genenames;
698 9         51 $section =~ s/[\(\)\.]//g;
699 9         44 my @names = split(m{\s+OR\s+}, $section);
700 9         39 push @genenames, ['Name' => shift @names];
701 9         24 push @genenames, map {['Synonyms' => $_]} @names;
  10         23  
702 9         45 push @stags, ['gene_name' => \@genenames]
703             }
704             } #use Data::Dumper; print Dumper $gn, $genename;# exit;
705 17         251 my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
706             -value => ['gene_names' => \@stags]);
707 17         78 $self->annotation_collection->add_Annotation('gene_name', $gn);
708             }
709             }
710              
711             # GenBank VERSION line
712             # old EMBL SV line (now obsolete)
713             # UniProt/SwissProt?
714             sub _generic_version {
715 28     28   74 my ($self, $data) = @_;
716 28         131 my ($acc,$gi) = split(' ',$data->{DATA});
717 28 100       157 if($acc =~ m{^\w+\.(\d+)}xmso) {
718 27         95 $self->{'_params'}->{'-version'} = $1;
719 27         60 $self->{'_params'}->{'-seq_version'} = $1;
720             }
721 28 100 66     158 if($gi && (index($gi,"GI:") == 0)) {
722 24         90 $self->{'_params'}->{'-primary_id'} = substr($gi,3);
723             }
724             }
725              
726             # EMBL DT lines
727             sub _embl_date {
728 5     5   21 my ($self, $data) = @_;
729 5         58 while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) {
730 10         44 my ($date, $version) = ($1, $2);
731 10         25 $date =~ tr{,}{}d; # remove comma if new version
732 10 50       51 if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) {
    50          
733 0         0 my $release = Bio::Annotation::SimpleValue->new(
734             -tagname => 'creation_release',
735             -value => $1
736             );
737 0         0 $self->annotation_collection->add_Annotation($release);
738             } elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) {
739 0         0 my $release = Bio::Annotation::SimpleValue->new(
740             -tagname => 'update_release',
741             -value => $1
742             );
743 0         0 $self->annotation_collection->add_Annotation($release);
744 0         0 my $update = Bio::Annotation::SimpleValue->new(
745             -tagname => 'update_version',
746             -value => $2
747             );
748 0         0 $self->annotation_collection->add_Annotation($update);
749             }
750 10         23 push @{ $self->{'_params'}->{'-dates'} }, $date;
  10         83  
751             }
752             }
753              
754             # UniProt/SwissProt DT lines
755             sub _swiss_date {
756 17     17   71 my ($self, $data) = @_;
757             # swissprot
758 17         101 my @dls = split m{\n}, $data->{DATA};
759 17         66 for my $dl (@dls) {
760 51         191 my ($date, $version) = split(' ', $dl, 2);
761 51         121 $date =~ tr{,}{}d; # remove comma if new version
762 51 100 100     718 if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old
    100 100        
763             $version =~ m{sequence version (\d+)\.}) { #new
764 17         214 my $update = Bio::Annotation::SimpleValue->new(
765             -tagname => 'seq_update',
766             -value => $1
767             );
768 17         91 $self->annotation_collection->add_Annotation($update);
769             } elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old
770             $version =~ m{entry version (\d+)\.}) { #new
771 17         112 $self->{'_params'}->{'-version'} = $1;
772 17         74 $self->{'_params'}->{'-seq_version'} = $1;
773             }
774 51         92 push @{ $self->{'_params'}->{'-dates'} }, $date;
  51         233  
775             }
776             }
777              
778             # GenBank KEYWORDS line
779             # EMBL KW line
780             # UniProt/SwissProt KW line
781             sub _generic_keywords {
782 55     55   151 my ($self, $data) = @_;
783 55         368 $data->{DATA} =~ s{\.$}{};
784 55         562 my @kw = split m{\s*\;\s*}xo ,$data->{DATA};
785 55         246 $self->{'_params'}->{'-keywords'} = \@kw;
786             }
787              
788             # GenBank DEFINITION line
789             # EMBL DE line
790             # UniProt/SwissProt DE line
791             sub _generic_description {
792 57     57   159 my ($self, $data) = @_;
793 57         216 $self->{'_params'}->{'-desc'} = $data->{DATA};
794             }
795              
796             # GenBank ACCESSION line
797             # EMBL AC line
798             # UniProt/SwissProt AC line
799             sub _generic_accession {
800 56     56   163 my ($self, $data) = @_;
801 56         370 my @accs = split m{[\s;]+}, $data->{DATA};
802 56         214 $self->{'_params'}->{'-accession_number'} = shift @accs;
803 56 100       253 $self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
804             }
805              
806             ####################### SPECIES HANDLERS #######################
807              
808             # uses Bio::Species
809             # GenBank SOURCE, ORGANISM lines
810             # EMBL O* lines
811             # UniProt/SwissProt O* lines
812             sub _generic_species {
813 56     56   149 my ($self, $data) = @_;
814            
815 56         195 my $seqformat = $self->format;
816             # if data is coming in from GenBank parser...
817 56 100 66     456 if ($seqformat eq 'genbank' &&
818             $data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
819 30         135 ($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
820             }
821            
822             # SwissProt stuff...
823             # hybrid names in swissprot files are no longer valid per intergration into
824             # UniProt. Files containing these have been split into separate entries, so
825             # it is probably a good idea to update if one has these lingering around...
826              
827 56         137 my $taxid;
828 56 100       172 if ($seqformat eq 'swiss') {
829 17 50       135 if ($data->{DATA} =~ m{^([^,]+)}ox) {
830 17         98 $data->{DATA} = $1;
831             }
832 17 100 66     170 if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
833 16         61 $taxid = $1;
834             }
835             }
836            
837             my ($sl, $class, $sci_name) = ($data->{DATA},
838             $data->{CLASSIFICATION},
839 56   100     334 $data->{ORGANISM} || '');
840 56         109 my ($organelle,$abbr_name, $common);
841 56         708 my @class = reverse split m{\s*;\s*}, $class;
842             # have to treat swiss different from everything else...
843 56 50       749 if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? # GenBank format
844             \s*(.*?)
845             \s*(?: \( (.*?) \) )?\.?$
846             }xmso ){
847 56         272 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
848             } else {
849 0         0 $abbr_name = $sl; # nothing caught; this is a backup!
850             }
851             # there is no 'abbreviated name' for EMBL
852 56 100       208 $sci_name = $abbr_name if $seqformat ne 'genbank';
853 56   100     325 $organelle ||= '';
854 56   100     195 $common ||= '';
855 56 50       118 $sci_name || return;
856 56         150 unshift @class, $sci_name;
857             # no genus/species parsing here; moving to Bio::Taxon-based taxonomy
858 56         532 my $make = Bio::Species->new();
859 56         294 $make->scientific_name($sci_name);
860 56 50       344 $make->classification(@class) if @class > 0;
861 56 100       302 $common && $make->common_name( $common );
862 56 50       334 $abbr_name && $make->name('abbreviated', $abbr_name);
863 56 100       247 $organelle && $make->organelle($organelle);
864 56 100       287 $taxid && $make->ncbi_taxid($taxid);
865 56         668 $self->{'_params'}->{'-species'} = $make;
866             }
867              
868             ####################### ANNOTATION HANDLERS #######################
869              
870             # GenBank DBSOURCE line
871             sub _genbank_dbsource {
872 1     1   3 my ($self, $data) = @_;
873 1         3 my $dbsource = $data->{DATA};
874 1         3 my $annotation = $self->annotation_collection;
875             # deal with swissprot dbsources
876             # we could possibly parcel these out to subhandlers...
877 1 50       4 if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
878 0         0 $annotation->add_Annotation
879             ('dblink',
880             Bio::Annotation::DBLink->new
881             (-primary_id => $2,
882             -database => $1,
883             -tagname => 'dblink'));
884 0 0       0 if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
885 0         0 $annotation->add_Annotation
886             ('swissprot_dates',
887             Bio::Annotation::SimpleValue->new
888             (-tagname => 'date_created',
889             -value => $1));
890             }
891 0         0 while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
892 0         0 $annotation->add_Annotation
893             ('swissprot_dates',
894             Bio::Annotation::SimpleValue->new
895             (-tagname => 'date_updated',
896             -value => $1));
897             }
898 0         0 $dbsource =~ s/\n/ /g;
899 0 0       0 if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
    0          
900             # will use $i to determine even or odd
901             # for swissprot the accessions are paired
902 0         0 my $i = 0;
903 0         0 for my $dbsrc ( split(/,\s+/,$1) ) {
904 0 0 0     0 if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) {
905 0         0 my ($id,$version) = ($1,$2);
906 0 0       0 $version ='' unless defined $version;
907 0         0 my $db;
908 0 0       0 if( $id =~ /^\d\S{3}/) {
909 0         0 $db = 'PDB';
910             } else {
911 0 0       0 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
912             }
913 0         0 $annotation->add_Annotation
914             ('dblink',
915             Bio::Annotation::DBLink->new
916             (-primary_id => $id,
917             -version => $version,
918             -database => $db,
919             -tagname => 'dblink'));
920             }
921             }
922             } elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
923             # download screwed up and ncbi didn't put acc in for gi numbers
924 0         0 my $i = 0;
925 0         0 for my $id ( split(/\,\s+/,$1) ) {
926 0         0 my ($acc,$db);
927 0 0       0 if( $id =~ /gi:\s+(\d+)/ ) {
    0          
928 0         0 $acc= $1;
929 0 0       0 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
930             } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
931 0         0 $acc= $1;
932 0         0 $db = 'PDB';
933             } else {
934 0         0 $acc= $id;
935 0         0 $db = '';
936             }
937 0         0 $annotation->add_Annotation
938             ('dblink',
939             Bio::Annotation::DBLink->new
940             (-primary_id => $acc,
941             -database => $db,
942             -tagname => 'dblink'));
943             }
944             } else {
945 0         0 $self->warn("Cannot match $dbsource\n");
946             }
947 0 0       0 if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+
948             ((?:\S+,\s+)+\S+)//x ) {
949 0         0 for my $id ( split(/\,\s+/,$1) ) {
950 0         0 my $db;
951             # this is because GenBank dropped the spaces!!!
952             # I'm sure we're not going to get this right
953             ##if( $id =~ s/^://i ) {
954             ## $db = $1;
955             ##}
956 0         0 $db = substr($id,0,index($id,':'));
957 0 0       0 if (! exists $DBSOURCE{ $db }) {
958 0         0 $db = ''; # do we want 'GenBank' here?
959             }
960 0         0 $id = substr($id,index($id,':')+1);
961 0         0 $annotation->add_Annotation
962             ('dblink',Bio::Annotation::DBLink->new
963             (-primary_id => $id,
964             -database => $db,
965             -tagname => 'dblink'));
966             }
967             }
968             } else {
969 1 50       7 if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
    0          
970 1         4 my ($db,$id,$version) = ($1,$2,$3);
971 1   50     11 $annotation->add_Annotation
972             ('dblink',
973             Bio::Annotation::DBLink->new
974             (-primary_id => $id,
975             -version => $version,
976             -database => $db || 'GenBank',
977             -tagname => 'dblink'));
978             } elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
979 0         0 my ($id, $db, $version);
980 0 0       0 if ($2 eq ':') {
981 0         0 ($db, $id) = ($1, $3);
982             } else {
983 0         0 ($db, $id, $version) = ('GenBank', $1, $3);
984             }
985 0         0 $annotation->add_Annotation('dblink',
986             Bio::Annotation::DBLink->new(
987             -primary_id => $id,
988             -version => $version,
989             -database => $db,
990             -tagname => 'dblink')
991             );
992             } else {
993 0         0 $self->warn("Unrecognized DBSOURCE data: $dbsource\n");
994             }
995             }
996             }
997              
998             # EMBL DR lines
999             # UniProt/SwissProt DR lines
1000             sub _generic_dbsource {
1001 20     20   76 my ($self, $data) = @_;
1002             #$self->debug(Dumper($data));
1003 20         222 while ($data->{DATA} =~ m{([^\n]+)}og) {
1004 351         1061 my $dblink = $1;
1005 351         1515 $dblink =~ s{\.$}{};
1006 351         585 my $link;
1007 351         1079 my @linkdata = split '; ',$dblink;
1008 351 50       1899 if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1009             #if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1010 351         1323 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011 351         1388 $link = Bio::Annotation::DBLink->new(-database => $databse,
1012             -primary_id => $prim_id,
1013             -optional_id => $sec_id);
1014             } else {
1015 0         0 $self->warn("No match for $dblink");
1016             }
1017 351         1010 $self->annotation_collection->add_Annotation('dblink', $link);
1018             }
1019             }
1020              
1021              
1022             # GenBank REFERENCE and related lines
1023             # EMBL R* lines
1024             # UniProt/SwissProt R* lines
1025             sub _generic_reference {
1026 301     301   594 my ($self, $data) = @_;
1027 301         707 my $seqformat = $self->format;
1028 301         526 my ($start, $end);
1029             # get these in EMBL/Swiss
1030 301 100       803 if ($data->{CROSSREF}) {
1031 128         1322 while ($data->{CROSSREF} =~ m{(pubmed|doi|medline)(?:=|;\s+)(\S+)}oig) {
1032 199         838 my ($db, $ref) = (uc $1, $2);
1033 199         872 $ref =~ s{[;.]+$}{};
1034 199         1015 $data->{$db} = $ref;
1035             }
1036             }
1037             # run some cleanup for swissprot
1038 301 100       639 if ($seqformat eq 'swiss') {
1039 109         165 for my $val (values %{ $data }) {
  109         623  
1040 959         2027 $val =~ s{;$}{};
1041 959         1776 $val =~ s{(\w-)\s}{$1};
1042             }
1043             }
1044 301 100       692 if ( $data->{POSITION} ) {
1045 127 100       628 if ($seqformat eq 'embl') {
    100          
1046 18         110 ($start, $end) = split '-', $data->{POSITION},2;
1047             } elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048 23         94 ($start, $end) = ($1, $2);
1049             }
1050             }
1051 301 100       925 if ($data->{DATA} =~ m{^\d+\s+\([a-z]+\s+(\d+)\s+to\s+(\d+)\)}xmso) {
1052 59         161 ($start, $end) = ($1, $2);
1053             }
1054             my $ref = Bio::Annotation::Reference->new(
1055             -comment => $data->{REMARK},
1056             -location => $data->{JOURNAL},
1057             -pubmed => $data->{PUBMED},
1058             -consortium => $data->{CONSRTM},
1059             -title => $data->{TITLE},
1060             -authors => $data->{AUTHORS},
1061             -medline => $data->{MEDLINE},
1062             -doi => $data->{DOI},
1063             -rp => $data->{POSITION}, # JIC...
1064 301         3272 -start => $start,
1065             -end => $end,
1066             );
1067 301 100       1780 if ($data->{DATA} =~ m{^\d+\s+\((.*)\)}xmso) {
1068 59         185 $ref->gb_reference($1);
1069             }
1070 301         925 $self->annotation_collection->add_Annotation('reference', $ref);
1071             }
1072              
1073             # GenBank COMMENT lines
1074             # EMBL CC lines
1075             # UniProt/SwissProt CC lines
1076             sub _generic_comment {
1077 43     43   137 my ($self, $data) = @_;
1078             $self->annotation_collection->add_Annotation('comment',
1079 43         185 Bio::Annotation::Comment->new( -text => $data->{DATA} ));
1080             }
1081              
1082             ####################### SEQFEATURE HANDLER #######################
1083              
1084             # GenBank Feature Table
1085             sub _generic_seqfeatures {
1086 848     848   1542 my ($self, $data) = @_;
1087 848 100       1951 return if $data->{FEATURE_KEY} eq 'FEATURES';
1088 818         1479 my $primary_tag = $data->{FEATURE_KEY};
1089            
1090             # grab the NCBI taxon ID from the source SF
1091 818 100 100     1879 if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092 36 100 66     290 if ( $self->{'_params'}->{'-species'} &&
1093             $data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094 35         188 $self->{'_params'}->{'-species'}->ncbi_taxid($1);
1095             }
1096             }
1097 818         1903 my $source = $self->format;
1098            
1099 818         1214 my $seqid = ${ $self->get_params('accession_number') }{'accession_number'};
  818         1930  
1100            
1101 818         1651 my $loc;
1102 818         1519 eval {
1103 818         3376 $loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1104             };
1105 818 50       1987 if(! $loc) {
1106             $self->warn("exception while parsing location line [" .
1107             $data->{'LOCATION'} .
1108             "] in reading $source, ignoring feature " .
1109 0         0 $data->{'primary_tag'}.
1110             " (seqid=" . $seqid . "): " . $@);
1111 0         0 return;
1112             }
1113 818 50 33     1747 if($seqid && (! $loc->is_remote())) {
1114 0         0 $loc->seq_id($seqid); # propagates if it is a split location
1115             }
1116 818         3299 my $sf = Bio::SeqFeature::Generic->direct_new();
1117 818         2458 $sf->location($loc);
1118 818         2680 $sf->primary_tag($primary_tag);
1119 818         2441 $sf->seq_id($seqid);
1120 818         2149 $sf->source_tag($source);
1121 818         1813 delete $data->{'FEATURE_KEY'};
1122 818         1444 delete $data->{'LOCATION'};
1123 818         1220 delete $data->{'NAME'};
1124 818         1113 delete $data->{'DATA'};
1125 818         2673 $sf->set_attributes(-tag => $data);
1126 818         1546 push @{ $self->{'_params'}->{'-features'} }, $sf;
  818         3767  
1127             }
1128              
1129             ####################### ODDS AND ENDS #######################
1130              
1131             # Those things that don't fit anywhere else. If a specific name
1132             # maps to the below table, that class and method are used, otherwise
1133             # it goes into a SimpleValue (I think this is a good argument for why
1134             # we need a generic mechanism for storing annotation)
1135              
1136             sub _generic_simplevalue {
1137 16     16   36 my ($self, $data) = @_;
1138             $self->annotation_collection->add_Annotation(
1139             Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
1140             -value => $data->{DATA})
1141 16         49 );
1142             }
1143              
1144       15 0   sub noop {}
1145              
1146             sub _debug {
1147 0     0     my ($self, $data) = @_;
1148 0           $self->debug(Dumper($data));
1149             }
1150              
1151              
1152             1;