File Coverage

Bio/SeqIO/Handler/GenericRichSeqHandler.pm
Criterion Covered Total %
statement 324 411 78.8
branch 120 204 58.8
condition 36 61 59.0
subroutine 46 48 95.8
pod 11 12 91.6
total 537 736 72.9


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Chris Fields
7             #
8             # Copyright Chris Fields
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
17             data handler for GenBank/EMBL/UniProt (and other) sequence data
18              
19             =head1 SYNOPSIS
20              
21             # MyHandler is a GenericRichSeqHandler object.
22             # inside a parser (driver) constructor....
23              
24             $self->seq_handler($handler || MyHandler->new(-format => 'genbank'));
25              
26             # in next_seq() in driver...
27              
28             $hobj = $self->seqhandler();
29              
30             # roll data up into hashref chunks, pass off into Handler for processing...
31              
32             $hobj->data_handler($data);
33              
34             # or retrieve Handler methods and pass data directly to Handler methods...
35              
36             my $hmeth = $hobj->handler_methods;
37              
38             if ($hmeth->{ $data->{NAME} }) {
39             my $mth = $hmeth->{ $data->{NAME} };
40             $hobj->$mth($data);
41             }
42              
43             =head1 DESCRIPTION
44              
45             This is an experimental implementation of a sequence-based HandlerBaseI parser
46             and may change over time. It is possible (nay, likely) that the way handler
47             methods are set up will change over development to allow more flexibility.
48             Release pumpkins, please do not add this to a release until the API has settled.
49             It is also likely that write_seq() will not work properly for some data.
50              
51             Standard Developer caveats:
52              
53             Do not use for production purposes.
54             Not responsible for destroying (your data|computer|world).
55             Do not stare directly at GenericRichSeqHandler.
56             If GenericRichSeqHandler glows, back slowly away and call for help.
57              
58             Consider yourself warned!
59              
60             This class acts as a demonstration on how to handle similar data chunks derived
61             from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
62             using similar (or the same) handler methods.
63              
64             The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
65             t/swiss.t yet all use the same handler methods (the collected tests for handlers
66             can be found in t/Handler.t). Some tweaking of the methods themselves is
67             probably in order over the long run to ensure that data is consistently handled
68             for each parser. Round-trip tests are probably in order here...
69              
70             Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
71             bypassing of data based on builder slots has been implemented (yet); this is
72             planned in the near future.
73              
74             As a reminder: this is the current Annotation data chunk (via Data::Dumper):
75              
76             $VAR1 = {
77             'NAME' => 'REFERENCE',
78             'DATA' => '1 (bases 1 to 10001)'
79             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
80             'TITLE' => 'The DNA sequence of Homo sapiens'
81             'JOURNAL' => 'Unpublished (2003)'
82             };
83             ...
84              
85             This is the current SeqFeature data chunk (again via Data::Dumper):
86              
87             $VAR1 = {
88             'mol_type' => 'genomic DNA',
89             'LOCATION' => '<1..>10001',
90             'NAME' => 'FEATURES',
91             'FEATURE_KEY' => 'source',
92             'note' => 'Accession AL451081 sequenced by The Sanger Centre',
93             'db_xref' => 'taxon:9606',
94             'clone' => 'RP11-302I18',
95             'organism' => 'Homo sapiens'
96             };
97              
98             =head1 FEEDBACK
99              
100             =head2 Mailing Lists
101              
102             User feedback is an integral part of the evolution of this and other
103             Bioperl modules. Send your comments and suggestions preferably to one
104             of the Bioperl mailing lists. Your participation is much appreciated.
105              
106             bioperl-l@bioperl.org - General discussion
107             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108              
109             =head2 Support
110              
111             Please direct usage questions or support issues to the mailing list:
112              
113             I
114              
115             rather than to the module maintainer directly. Many experienced and
116             reponsive experts will be able look at the problem and quickly
117             address it. Please include a thorough description of the problem
118             with code and data examples if at all possible.
119              
120             =head2 Reporting Bugs
121              
122             Report bugs to the Bioperl bug tracking system to help us keep track
123             the bugs and their resolution. Bug reports can be submitted via the
124             web:
125              
126             https://github.com/bioperl/bioperl-live/issues
127              
128             =head1 AUTHOR - Chris Fields
129              
130             Email cjfields at bioperl dot org
131              
132             =head1 APPENDIX
133              
134             The rest of the documentation details each of the object methods. Internal
135             methods are usually preceded with a _
136              
137             =cut
138              
139             # Let the code begin...
140              
141             package Bio::SeqIO::Handler::GenericRichSeqHandler;
142 1     1   3 use strict;
  1         1  
  1         21  
143 1     1   3 use warnings;
  1         1  
  1         16  
144              
145 1     1   687 use Bio::SeqIO::FTHelper;
  1         2  
  1         23  
146 1     1   4 use Bio::Annotation::Collection;
  1         1  
  1         16  
147 1     1   701 use Bio::Annotation::DBLink;
  1         2  
  1         20  
148 1     1   627 use Bio::Annotation::Comment;
  1         2  
  1         20  
149 1     1   675 use Bio::Annotation::Reference;
  1         1  
  1         25  
150 1     1   4 use Bio::Annotation::Collection;
  1         1  
  1         15  
151 1     1   3 use Bio::Annotation::SimpleValue;
  1         1  
  1         13  
152 1     1   690 use Bio::Annotation::TagTree;
  1         2  
  1         24  
153 1     1   4 use Bio::SeqFeature::Generic;
  1         1  
  1         15  
154 1     1   688 use Bio::Species;
  1         2  
  1         21  
155 1     1   4 use Bio::Taxon;
  1         2  
  1         12  
156 1     1   3 use Bio::DB::Taxonomy;
  1         1  
  1         14  
157 1     1   3 use Bio::Factory::FTLocationFactory;
  1         1  
  1         18  
158 1     1   3 use Data::Dumper;
  1         1  
  1         44  
159              
160 1     1   3 use base qw(Bio::Root::Root Bio::HandlerBaseI);
  1         1  
  1         742  
161              
162             my %HANDLERS = (
163             'genbank' => {
164             'LOCUS' => \&_genbank_locus,
165             'DEFINITION' => \&_generic_description,
166             'ACCESSION' => \&_generic_accession,
167             'VERSION' => \&_generic_version,
168             'KEYWORDS' => \&_generic_keywords,
169             'DBSOURCE' => \&_genbank_dbsource,
170             'DBLINK' => \&_genbank_dbsource,
171             'SOURCE' => \&_generic_species,
172             'REFERENCE' => \&_generic_reference,
173             'COMMENT' => \&_generic_comment,
174             'FEATURES' => \&_generic_seqfeatures,
175             'BASE' => \&noop, # this is generated from scratch
176             'ORIGIN' => \&_generic_seq,
177             # handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
178             '_DEFAULT_' => \&_generic_simplevalue,
179             },
180             'embl' => {
181             'ID' => \&_embl_id,
182             'DT' => \&_embl_date,
183             'DR' => \&_generic_dbsource,
184             'SV' => \&_generic_version,
185             'RN' => \&_generic_reference,
186             'KW' => \&_generic_keywords,
187             'DE' => \&_generic_description,
188             'AC' => \&_generic_accession,
189             #'AH' => \&noop, # TPA data not dealt with yet...
190             #'AS' => \&noop,
191             'SQ' => \&_generic_seq,
192             'OS' => \&_generic_species,
193             'CC' => \&_generic_comment,
194             'FT' => \&_generic_seqfeatures,
195             # handles anything else (WGS, TPA, ANN...)
196             '_DEFAULT_' => \&_generic_simplevalue,
197             },
198             'swiss' => {
199             'ID' => \&_swiss_id,
200             'DT' => \&_swiss_date,
201             'GN' => \&_swiss_genename,
202             'DR' => \&_generic_dbsource,
203             'RN' => \&_generic_reference,
204             'KW' => \&_generic_keywords,
205             'DE' => \&_generic_description,
206             'AC' => \&_generic_accession,
207             'SQ' => \&_generic_seq,
208             'OS' => \&_generic_species,
209             'CC' => \&_generic_comment,
210             'FT' => \&_generic_seqfeatures,
211             # handles anything else, though I don't know what...
212             '_DEFAULT_' => \&_generic_simplevalue,
213             },
214             );
215              
216             # can we do this generically? Seems like a lot of trouble...
217             my %DBSOURCE = map {$_ => 1} qw(
218             EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
219             TIGR GO InterPro Pfam PROSITE SGD GermOnline
220             HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
221             H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
222             MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
223             UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
224             PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
225             Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
226             TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
227             PhotoList Gramene WormBase WormPep Genew ZFIN
228             PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
229             swissprot GenBank GenPept REFSEQ embl PDB UniProtKB);
230              
231             my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
232              
233             our %VALID_ALPHABET = (
234             'bp' => 'dna',
235             'aa' => 'protein',
236             'rc' => '' # rc = release candidate; file has no sequences
237             );
238              
239             =head2 new
240              
241             Title : new
242             Usage :
243             Function:
244             Returns :
245             Args : -format Sequence format to be mapped for handler methods
246             -builder Bio::Seq::SeqBuilder object (normally defined in
247             SequenceStreamI object implementation constructor)
248             Throws : On undefined '-format' sequence format parameter
249             Note : Still under heavy development
250              
251             =cut
252              
253             sub new {
254 42     42 1 123 my ($class, @args) = @_;
255 42         170 my $self = $class->SUPER::new(@args);
256 42         154 $self = {@args};
257 42         209 bless $self,$class;
258 42         188 my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
259 42 50       155 $self->throw("Must define sequence record format") if !$format;
260 42         170 $self->format($format);
261 42         128 $self->handler_methods();
262 42 50       183 $builder && $self->seqbuilder($builder);
263 42         138 $self->location_factory();
264 42         210 return $self;
265             }
266              
267             =head1 L implementing methods
268              
269             =head2 handler_methods
270              
271             Title : handler_methods
272             Usage : $handler->handler_methods('GenBank')
273             %handlers = $handler->handler_methods();
274             Function: Retrieve the handler methods used for the current format() in
275             the handler. This assumes the handler methods are already
276             described in the HandlerI-implementing class.
277             Returns : a hash reference with the data type handled and the code ref
278             associated with it.
279             Args : [optional] String representing the sequence format. If set here
280             this will also set sequence_format()
281             Throws : On unimplemented sequence format in %HANDLERS
282              
283             =cut
284              
285             sub handler_methods {
286 75     75 1 103 my $self = shift;
287 75 100       187 if (!($self->{'handlers'})) {
288             $self->throw("No handlers defined for seqformat ",$self->format)
289 42 50       119 unless exists $HANDLERS{$self->format};
290 42         113 $self->{'handlers'} = $HANDLERS{$self->format};
291             }
292 75         143 return ($self->{'handlers'});
293             }
294              
295             =head2 data_handler
296              
297             Title : data_handler
298             Usage : $handler->data_handler($data)
299             Function: Centralized method which accepts all data chunks, then distributes
300             to the appropriate methods for processing based on the chunk name
301             from within the HandlerBaseI object.
302              
303             One can also use
304             Returns : None
305             Args : an hash ref containing a data chunk.
306              
307             =cut
308              
309             sub data_handler {
310 1646     1646 1 1977 my ($self, $data) = @_;
311 1646   33     3629 my $nm = $data->{NAME} || $self->throw("No name tag defined!");
312            
313             # this should handle data on the fly w/o caching; any caching should be
314             # done in the driver!
315             my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
316 1646 50       4123 (exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
    100          
317             undef;
318 1646 50       2620 if (!$method) {
319 0         0 $self->debug("No handler defined for $nm\n");
320 0         0 return;
321             };
322 1646         3156 $self->$method($data);
323             }
324              
325             =head2 reset_parameters
326              
327             Title : reset_parameters
328             Usage : $handler->reset_parameters()
329             Function: Resets the internal cache of data (normally object parameters for
330             a builder or factory)
331             Returns : None
332             Args : None
333              
334             =cut
335              
336             sub reset_parameters {
337 58     58 1 115 my $self = shift;
338 58         150 $self->{'_params'} = undef;
339             }
340              
341             =head2 format
342              
343             Title : format
344             Usage : $handler->format('GenBank')
345             Function: Get/Set the format for the report/record being parsed. This can be
346             used to set handlers in classes which are capable of processing
347             similar data chunks from multiple driver modules.
348             Returns : String with the sequence format
349             Args : [optional] String with the sequence format
350             Note : The format may be used to set the handlers (as in the
351             current GenericRichSeqHandler implementation)
352              
353             =cut
354              
355             sub format {
356 1301     1301 1 1351 my $self = shift;
357 1301 100       2312 return $self->{'_seqformat'} = lc shift if @_;
358 1259         2782 return $self->{'_seqformat'};
359             }
360              
361             =head2 get_params
362              
363             Title : get_params
364             Usage : $handler->get_params('-species')
365             Function: Convenience method used to retrieve the specified
366             parameters from the internal parameter cache
367             Returns : Hash ref containing parameters requested and data as
368             key-value pairs. Note that some parameter values may be
369             objects, arrays, etc.
370             Args : List (array) representing the parameters requested
371              
372             =cut
373              
374             sub get_params {
375 818     818 1 1299 my ($self, @ids) = @_;
376 818         668 my %data;
377 818         1389 for my $id (@ids) {
378 818 50       2085 if (!index($id, '-')==0) {
379 818         1800 $id = '-'.$id ;
380             }
381 818 100       3709 $data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
382             }
383 818         1870 return \%data;
384             }
385              
386             =head2 set_params
387              
388             Title : set_params
389             Usage : $handler->set_param({'-species')
390             Function: Convenience method used to set specific parameters
391             Returns : None
392             Args : Hash ref containing the data to be passed as key-value pairs
393              
394             =cut
395              
396             sub set_params {
397 0     0 1 0 shift->throw('Not implemented yet!');
398             }
399              
400             =head1 Methods unique to this implementation
401              
402             =head2 seqbuilder
403              
404             Title : seqbuilder
405             Usage :
406             Function:
407             Returns :
408             Args :
409             Throws :
410             Note :
411              
412             =cut
413              
414             sub seqbuilder {
415 104     104 1 138 my $self = shift;
416 104 100       370 return $self->{'_seqbuilder'} = shift if @_;
417 62         137 return $self->{'_seqbuilder'};
418             }
419              
420             =head2 build_sequence
421              
422             Title : build_sequence
423             Usage :
424             Function:
425             Returns :
426             Args :
427             Throws :
428             Note :
429              
430             =cut
431              
432             sub build_sequence {
433 62     62 1 120 my $self = shift;
434 62         228 my $builder = $self->seqbuilder();
435 62         155 my $seq;
436 62 100       189 if (defined($self->{'_params'})) {
437 58         75 $builder->add_slot_value(%{ $self->{'_params'} });
  58         783  
438 58         243 $seq = $builder->make_object();
439 58         229 $self->reset_parameters;
440             }
441 62 100       1073 return $seq if $seq;
442 4         22 return 0;
443             }
444              
445             =head2 location_factory
446              
447             Title : location_factory
448             Usage :
449             Function:
450             Returns :
451             Args :
452             Throws :
453             Note :
454              
455             =cut
456              
457             sub location_factory {
458 42     42 1 69 my ($self, $factory) = @_;
459 42 50       193 if ($factory) {
    50          
460 0 0 0     0 $self->throw("Must have a Bio::Factory::LocationFactoryI when ".
461             "explicitly setting factory()") unless
462             (ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
463 0         0 $self->{'_locfactory'} = $factory;
464             } elsif (!defined($self->{'_locfactory'})) {
465 42         250 $self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new()
466             }
467 42         71 return $self->{'_locfactory'};
468             }
469              
470             =head2 annotation_collection
471              
472             Title : annotation_collection
473             Usage :
474             Function:
475             Returns :
476             Args :
477             Throws :
478             Note :
479              
480             =cut
481              
482             sub annotation_collection {
483 746     746 1 957 my ($self, $coll) = @_;
484 746 50       1997 if ($coll) {
    100          
485 0 0 0     0 $self->throw("Must have Bio::AnnotationCollectionI ".
486             "when explicitly setting collection()")
487             unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
488 0         0 $self->{'_params'}->{'-annotation'} = $coll;
489             } elsif (!exists($self->{'_params'}->{'-annotation'})) {
490 57         538 $self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new()
491             }
492 746         2578 return $self->{'_params'}->{'-annotation'};
493             }
494              
495             ####################### SEQUENCE HANDLERS #######################
496              
497             # any sequence data
498             sub _generic_seq {
499 53     53   87 my ($self, $data) = @_;
500 53         409 $self->{'_params'}->{'-seq'} = $data->{DATA};
501             }
502              
503             ####################### RAW DATA HANDLERS #######################
504              
505             # GenBank LOCUS line
506             sub _genbank_locus {
507 31     31   51 my ($self, $data) = @_;
508 31         266 my (@tokens) = split m{\s+}, $data->{DATA};
509 31         57 my $display_id = shift @tokens;
510 31         120 $self->{'_params'}->{'-display_id'} = $display_id;
511 31         58 my $seqlength = shift @tokens;
512 31 50       92 if (exists $VALID_ALPHABET{$seqlength}) {
513             # moved one token too far. No locus name?
514             $self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
515 0         0 "] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
516 0         0 $self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
517 0         0 $self->{'_params'}->{'-display_id'} = 'unknown';
518             # add token back...
519 0         0 unshift @tokens, $seqlength;
520             } else {
521 31         91 $self->{'_params'}->{'-length'} = $seqlength;
522             }
523 31         108 my $alphabet = lc(shift @tokens);
524             $self->{'_params'}->{'-alphabet'} =
525 31 50       107 (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
526             $self->warn("Unknown alphabet: $alphabet");
527 31 50 66     157 if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) {
528 31         71 $self->{'_params'}->{'-molecule'} = shift(@tokens);
529 31         67 my $circ = shift(@tokens);
530 31 100       85 if ($circ eq 'circular') {
531 2         5 $self->{'_params'}->{'-is_circular'} = 1;
532 2         4 $self->{'_params'}->{'-division'} = shift(@tokens);
533             } else {
534             # 'linear' or 'circular' may actually be omitted altogether
535 29 100       98 $self->{'_params'}->{'-division'} =
536             (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
537             }
538             } else {
539 0 0       0 $self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
540 0         0 $self->{'_params'}->{'-division'} = shift(@tokens);
541             }
542 31         170 my $date = join(' ', @tokens);
543             # maybe use Date::Time for dates?
544 31 100 66     346 if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) {
545            
546 30 50       76 if( length($date) < 11 ) {
547             # improperly formatted date
548             # But we'll be nice and fix it for them
549 0         0 my ($d,$m,$y) = ($2,$3,$4);
550 0 0       0 if( length($d) == 1 ) {
551 0         0 $d = "0$d";
552             }
553             # guess the century here
554 0 0       0 if( length($y) == 2 ) {
555 0 0       0 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
556 0         0 $y = "19$y";
557             } else {
558 0         0 $y = "20$y";
559             }
560 0         0 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
561             }
562 0         0 $self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
563             } else {
564 30         144 $self->{'_params'}->{'-dates'} = [$date];
565             }
566             }
567             }
568              
569             # EMBL ID line
570             sub _embl_id {
571 10     10   22 my ($self, $data) = @_;
572 10         20 my $alphabet;
573 10         19 my ($name, $sv, $topology, $mol, $div);
574 10         32 my $line = $data->{DATA};
575             #$self->debug("$line\n");
576 10         35 my ($idtype) = $line =~ tr/;/;/;
577 10 100       44 if ( $idtype == 6) { # New style headers contain exactly six semicolons.
    100          
578             # New style header (EMBL Release >= 87, after June 2006)
579 1         4 my $topology;
580             my $sv;
581            
582             # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
583             # This regexp comes from the new2old.pl conversion script, from EBI
584 1 50       13 if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
585 1         11 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
586             } else {
587 0         0 $self->throw("Unrecognized EMBL ID line:[$line]");
588             }
589 1 50       6 if (defined($sv)) {
590 1         7 $self->{'_params'}->{'-seq_version'} = $sv;
591 1         3 $self->{'_params'}->{'-version'} = $sv;
592             }
593              
594 1 50       5 if ($topology eq "circular") {
595 0         0 $self->{'_params'}->{'-is_circular'} = 1;
596             }
597            
598 1 50       4 if (defined $mol ) {
599 1 50       8 if ($mol =~ /DNA/) {
    50          
    0          
600 0         0 $alphabet='dna';
601             }
602             elsif ($mol =~ /RNA/) {
603 1         3 $alphabet='rna';
604             }
605             elsif ($mol =~ /AA/) {
606 0         0 $alphabet='protein';
607             }
608             }
609             } elsif ($idtype) { # has internal ';'
610             # Old style header (EMBL Release < 87, before June 2006)
611 8 50       97 if ($line =~ m{^(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;}) {
612 8         51 ($name, $mol, $div) = ($1, $2, $3);
613             #$self->debug("[$name][$mol][$div]");
614             }
615              
616 8 50       27 if($mol) {
617 8 50       30 if ( $mol =~ m{circular} ) {
618 0         0 $self->{'_params'}->{'-is_circular'} = 1;
619 0         0 $mol =~ s{circular }{};
620             }
621 8 50       26 if (defined $mol ) {
622 8 100       46 if ($mol =~ /DNA/) {
    50          
    0          
623 7         26 $alphabet='dna';
624             }
625             elsif ($mol =~ /RNA/) {
626 1         3 $alphabet='rna';
627             }
628             elsif ($mol =~ /AA/) {
629 0         0 $alphabet='protein';
630             }
631             }
632             }
633             } else {
634 1         5 $name = $data->{DATA};
635             }
636 10 50 33     79 unless( defined $name && length($name) ) {
637 0         0 $name = "unknown_id";
638             }
639 10         55 $self->{'_params'}->{'-display_id'} = $name;
640 10         27 $self->{'_params'}->{'-alphabet'} = $alphabet;
641 10 100       45 $self->{'_params'}->{'-division'} = $div if $div;
642 10 100       65 $self->{'_params'}->{'-molecule'} = $mol if $mol;
643             }
644              
645             # UniProt/SwissProt ID line
646             sub _swiss_id {
647 17     17   29 my ($self, $data) = @_;
648 17         27 my ($name, $seq_div);
649 17 50       181 if($data->{DATA} =~ m{^
650             (\S+) \s+ # $1 entryname
651             ([^\s;]+); \s+ # $2 DataClass
652             (?:PRT;)? \s+ # Molecule Type (optional)
653             [0-9]+[ ]AA \. # Sequencelength (capture?)
654             $
655             }ox ) {
656 17         78 ($name, $seq_div) = ($1, $2);
657 17 50 100     166 $self->{'_params'}->{'-namespace'} =
    100 66        
658             ($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ? 'Swiss-Prot' :
659             ($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ? 'TrEMBL' :
660             $seq_div;
661             # we shouldn't be setting the division, but for now...
662 17         76 my ($junk, $division) = split q(_), $name;
663 17         57 $self->{'_params'}->{'-division'} = $division;
664 17         45 $self->{'_params'}->{'-alphabet'} = 'protein';
665             # this is important to have the id for display in e.g. FTHelper, otherwise
666             # you won't know which entry caused an error
667 17         72 $self->{'_params'}->{'-display_id'} = $name;
668             } else {
669 0         0 $self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]");
670             }
671             }
672              
673             # UniProt/SwissProt GN line
674             sub _swiss_genename {
675 17     17   34 my ($self, $data) = @_;
676             #$self->debug(Dumper($data));
677 17         45 my $genename = $data->{DATA};
678 17         26 my $gn;
679 17 50       51 if ($genename) {
680 17         24 my @stags;
681 17 100       86 if ($genename =~ /\w=\w/) {
682             # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
683 10         45 for my $n (split(m{\s+and\s+},$genename)) {
684 12         30 my @genenames;
685 12         62 for my $section (split(m{\s*;\s*},$n)) {
686 15         42 my ($tag, $rest) = split("=",$section);
687 15   50     42 $rest ||= '';
688 15         33 for my $val (split(m{\s*,\s*},$rest)) {
689 19         63 push @genenames, [$tag => $val];
690             }
691             }
692 12         42 push @stags, ['gene_name' => \@genenames];
693             }
694             } else {
695             # old format
696 7         37 for my $section (split(/ AND /, $genename)) {
697 9         15 my @genenames;
698 9         49 $section =~ s/[\(\)\.]//g;
699 9         49 my @names = split(m{\s+OR\s+}, $section);
700 9         34 push @genenames, ['Name' => shift @names];
701 9         24 push @genenames, map {['Synonyms' => $_]} @names;
  10         31  
702 9         44 push @stags, ['gene_name' => \@genenames]
703             }
704             } #use Data::Dumper; print Dumper $gn, $genename;# exit;
705 17         205 my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
706             -value => ['gene_names' => \@stags]);
707 17         60 $self->annotation_collection->add_Annotation('gene_name', $gn);
708             }
709             }
710              
711             # GenBank VERSION line
712             # old EMBL SV line (now obsolete)
713             # UniProt/SwissProt?
714             sub _generic_version {
715 28     28   70 my ($self, $data) = @_;
716 28         194 my ($acc,$gi) = split(' ',$data->{DATA});
717 28 100       172 if($acc =~ m{^\w+\.(\d+)}xmso) {
718 27         81 $self->{'_params'}->{'-version'} = $1;
719 27         65 $self->{'_params'}->{'-seq_version'} = $1;
720             }
721 28 100 66     160 if($gi && (index($gi,"GI:") == 0)) {
722 24         106 $self->{'_params'}->{'-primary_id'} = substr($gi,3);
723             }
724             }
725              
726             # EMBL DT lines
727             sub _embl_date {
728 5     5   11 my ($self, $data) = @_;
729 5         59 while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) {
730 10         36 my ($date, $version) = ($1, $2);
731 10         21 $date =~ tr{,}{}d; # remove comma if new version
732 10 50       49 if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) {
    50          
733 0         0 my $release = Bio::Annotation::SimpleValue->new(
734             -tagname => 'creation_release',
735             -value => $1
736             );
737 0         0 $self->annotation_collection->add_Annotation($release);
738             } elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) {
739 0         0 my $release = Bio::Annotation::SimpleValue->new(
740             -tagname => 'update_release',
741             -value => $1
742             );
743 0         0 $self->annotation_collection->add_Annotation($release);
744 0         0 my $update = Bio::Annotation::SimpleValue->new(
745             -tagname => 'update_version',
746             -value => $2
747             );
748 0         0 $self->annotation_collection->add_Annotation($update);
749             }
750 10         13 push @{ $self->{'_params'}->{'-dates'} }, $date;
  10         79  
751             }
752             }
753              
754             # UniProt/SwissProt DT lines
755             sub _swiss_date {
756 17     17   30 my ($self, $data) = @_;
757             # swissprot
758 17         75 my @dls = split m{\n}, $data->{DATA};
759 17         49 for my $dl (@dls) {
760 51         133 my ($date, $version) = split(' ', $dl, 2);
761 51         89 $date =~ tr{,}{}d; # remove comma if new version
762 51 100 100     547 if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old
    100 100        
763             $version =~ m{sequence version (\d+)\.}) { #new
764 17         187 my $update = Bio::Annotation::SimpleValue->new(
765             -tagname => 'seq_update',
766             -value => $1
767             );
768 17         70 $self->annotation_collection->add_Annotation($update);
769             } elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old
770             $version =~ m{entry version (\d+)\.}) { #new
771 17         87 $self->{'_params'}->{'-version'} = $1;
772 17         56 $self->{'_params'}->{'-seq_version'} = $1;
773             }
774 51         55 push @{ $self->{'_params'}->{'-dates'} }, $date;
  51         229  
775             }
776             }
777              
778             # GenBank KEYWORDS line
779             # EMBL KW line
780             # UniProt/SwissProt KW line
781             sub _generic_keywords {
782 55     55   126 my ($self, $data) = @_;
783 55         286 $data->{DATA} =~ s{\.$}{};
784 55         518 my @kw = split m{\s*\;\s*}xo ,$data->{DATA};
785 55         213 $self->{'_params'}->{'-keywords'} = \@kw;
786             }
787              
788             # GenBank DEFINITION line
789             # EMBL DE line
790             # UniProt/SwissProt DE line
791             sub _generic_description {
792 57     57   121 my ($self, $data) = @_;
793 57         233 $self->{'_params'}->{'-desc'} = $data->{DATA};
794             }
795              
796             # GenBank ACCESSION line
797             # EMBL AC line
798             # UniProt/SwissProt AC line
799             sub _generic_accession {
800 56     56   104 my ($self, $data) = @_;
801 56         337 my @accs = split m{[\s;]+}, $data->{DATA};
802 56         186 $self->{'_params'}->{'-accession_number'} = shift @accs;
803 56 100       245 $self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
804             }
805              
806             ####################### SPECIES HANDLERS #######################
807              
808             # uses Bio::Species
809             # GenBank SOURCE, ORGANISM lines
810             # EMBL O* lines
811             # UniProt/SwissProt O* lines
812             sub _generic_species {
813 56     56   94 my ($self, $data) = @_;
814            
815 56         180 my $seqformat = $self->format;
816             # if data is coming in from GenBank parser...
817 56 100 66     506 if ($seqformat eq 'genbank' &&
818             $data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
819 30         200 ($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
820             }
821            
822             # SwissProt stuff...
823             # hybrid names in swissprot files are no longer valid per intergration into
824             # UniProt. Files containing these have been split into separate entries, so
825             # it is probably a good idea to update if one has these lingering around...
826              
827 56         93 my $taxid;
828 56 100       143 if ($seqformat eq 'swiss') {
829 17 50       116 if ($data->{DATA} =~ m{^([^,]+)}ox) {
830 17         60 $data->{DATA} = $1;
831             }
832 17 100 66     153 if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
833 16         39 $taxid = $1;
834             }
835             }
836            
837             my ($sl, $class, $sci_name) = ($data->{DATA},
838             $data->{CLASSIFICATION},
839 56   100     320 $data->{ORGANISM} || '');
840 56         85 my ($organelle,$abbr_name, $common);
841 56         986 my @class = reverse split m{\s*;\s*}, $class;
842             # have to treat swiss different from everything else...
843 56 50       703 if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? # GenBank format
844             \s*(.*?)
845             \s*(?: \( (.*?) \) )?\.?$
846             }xmso ){
847 56         364 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
848             } else {
849 0         0 $abbr_name = $sl; # nothing caught; this is a backup!
850             }
851             # there is no 'abbreviated name' for EMBL
852 56 100       177 $sci_name = $abbr_name if $seqformat ne 'genbank';
853 56   100     225 $organelle ||= '';
854 56   100     186 $common ||= '';
855 56 50       127 $sci_name || return;
856 56         128 unshift @class, $sci_name;
857             # no genus/species parsing here; moving to Bio::Taxon-based taxonomy
858 56         559 my $make = Bio::Species->new();
859 56         217 $make->scientific_name($sci_name);
860 56 50       454 $make->classification(@class) if @class > 0;
861 56 100       260 $common && $make->common_name( $common );
862 56 50       297 $abbr_name && $make->name('abbreviated', $abbr_name);
863 56 100       135 $organelle && $make->organelle($organelle);
864 56 100       219 $taxid && $make->ncbi_taxid($taxid);
865 56         643 $self->{'_params'}->{'-species'} = $make;
866             }
867              
868             ####################### ANNOTATION HANDLERS #######################
869              
870             # GenBank DBSOURCE line
871             sub _genbank_dbsource {
872 1     1   2 my ($self, $data) = @_;
873 1         2 my $dbsource = $data->{DATA};
874 1         3 my $annotation = $self->annotation_collection;
875             # deal with swissprot dbsources
876             # we could possibly parcel these out to subhandlers...
877 1 50       3 if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
878 0         0 $annotation->add_Annotation
879             ('dblink',
880             Bio::Annotation::DBLink->new
881             (-primary_id => $2,
882             -database => $1,
883             -tagname => 'dblink'));
884 0 0       0 if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
885 0         0 $annotation->add_Annotation
886             ('swissprot_dates',
887             Bio::Annotation::SimpleValue->new
888             (-tagname => 'date_created',
889             -value => $1));
890             }
891 0         0 while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
892 0         0 $annotation->add_Annotation
893             ('swissprot_dates',
894             Bio::Annotation::SimpleValue->new
895             (-tagname => 'date_updated',
896             -value => $1));
897             }
898 0         0 $dbsource =~ s/\n/ /g;
899 0 0       0 if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
    0          
900             # will use $i to determine even or odd
901             # for swissprot the accessions are paired
902 0         0 my $i = 0;
903 0         0 for my $dbsrc ( split(/,\s+/,$1) ) {
904 0 0 0     0 if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) {
905 0         0 my ($id,$version) = ($1,$2);
906 0 0       0 $version ='' unless defined $version;
907 0         0 my $db;
908 0 0       0 if( $id =~ /^\d\S{3}/) {
909 0         0 $db = 'PDB';
910             } else {
911 0 0       0 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
912             }
913 0         0 $annotation->add_Annotation
914             ('dblink',
915             Bio::Annotation::DBLink->new
916             (-primary_id => $id,
917             -version => $version,
918             -database => $db,
919             -tagname => 'dblink'));
920             }
921             }
922             } elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
923             # download screwed up and ncbi didn't put acc in for gi numbers
924 0         0 my $i = 0;
925 0         0 for my $id ( split(/\,\s+/,$1) ) {
926 0         0 my ($acc,$db);
927 0 0       0 if( $id =~ /gi:\s+(\d+)/ ) {
    0          
928 0         0 $acc= $1;
929 0 0       0 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
930             } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
931 0         0 $acc= $1;
932 0         0 $db = 'PDB';
933             } else {
934 0         0 $acc= $id;
935 0         0 $db = '';
936             }
937 0         0 $annotation->add_Annotation
938             ('dblink',
939             Bio::Annotation::DBLink->new
940             (-primary_id => $acc,
941             -database => $db,
942             -tagname => 'dblink'));
943             }
944             } else {
945 0         0 $self->warn("Cannot match $dbsource\n");
946             }
947 0 0       0 if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+
948             ((?:\S+,\s+)+\S+)//x ) {
949 0         0 for my $id ( split(/\,\s+/,$1) ) {
950 0         0 my $db;
951             # this is because GenBank dropped the spaces!!!
952             # I'm sure we're not going to get this right
953             ##if( $id =~ s/^://i ) {
954             ## $db = $1;
955             ##}
956 0         0 $db = substr($id,0,index($id,':'));
957 0 0       0 if (! exists $DBSOURCE{ $db }) {
958 0         0 $db = ''; # do we want 'GenBank' here?
959             }
960 0         0 $id = substr($id,index($id,':')+1);
961 0         0 $annotation->add_Annotation
962             ('dblink',Bio::Annotation::DBLink->new
963             (-primary_id => $id,
964             -database => $db,
965             -tagname => 'dblink'));
966             }
967             }
968             } else {
969 1 50       5 if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
    0          
970 1         3 my ($db,$id,$version) = ($1,$2,$3);
971 1   50     12 $annotation->add_Annotation
972             ('dblink',
973             Bio::Annotation::DBLink->new
974             (-primary_id => $id,
975             -version => $version,
976             -database => $db || 'GenBank',
977             -tagname => 'dblink'));
978             } elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
979 0         0 my ($id, $db, $version);
980 0 0       0 if ($2 eq ':') {
981 0         0 ($db, $id) = ($1, $3);
982             } else {
983 0         0 ($db, $id, $version) = ('GenBank', $1, $3);
984             }
985 0         0 $annotation->add_Annotation('dblink',
986             Bio::Annotation::DBLink->new(
987             -primary_id => $id,
988             -version => $version,
989             -database => $db,
990             -tagname => 'dblink')
991             );
992             } else {
993 0         0 $self->warn("Unrecognized DBSOURCE data: $dbsource\n");
994             }
995             }
996             }
997              
998             # EMBL DR lines
999             # UniProt/SwissProt DR lines
1000             sub _generic_dbsource {
1001 20     20   61 my ($self, $data) = @_;
1002             #$self->debug(Dumper($data));
1003 20         131 while ($data->{DATA} =~ m{([^\n]+)}og) {
1004 351         710 my $dblink = $1;
1005 351         919 $dblink =~ s{\.$}{};
1006 351         338 my $link;
1007 351         744 my @linkdata = split '; ',$dblink;
1008 351 50       1319 if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1009             #if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1010 351         695 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011 351         1247 $link = Bio::Annotation::DBLink->new(-database => $databse,
1012             -primary_id => $prim_id,
1013             -optional_id => $sec_id);
1014             } else {
1015 0         0 $self->warn("No match for $dblink");
1016             }
1017 351         635 $self->annotation_collection->add_Annotation('dblink', $link);
1018             }
1019             }
1020              
1021              
1022             # GenBank REFERENCE and related lines
1023             # EMBL R* lines
1024             # UniProt/SwissProt R* lines
1025             sub _generic_reference {
1026 301     301   281 my ($self, $data) = @_;
1027 301         601 my $seqformat = $self->format;
1028 301         293 my ($start, $end);
1029             # get these in EMBL/Swiss
1030 301 100       526 if ($data->{CROSSREF}) {
1031 128         805 while ($data->{CROSSREF} =~ m{(pubmed|doi|medline)(?:=|;\s+)(\S+)}oig) {
1032 199         473 my ($db, $ref) = (uc $1, $2);
1033 199         489 $ref =~ s{[;.]+$}{};
1034 199         666 $data->{$db} = $ref;
1035             }
1036             }
1037             # run some cleanup for swissprot
1038 301 100       716 if ($seqformat eq 'swiss') {
1039 109         140 for my $val (values %{ $data }) {
  109         300  
1040 959         1055 $val =~ s{;$}{};
1041 959         980 $val =~ s{(\w-)\s}{$1};
1042             }
1043             }
1044 301 100       525 if ( $data->{POSITION} ) {
1045 127 100       507 if ($seqformat eq 'embl') {
    100          
1046 18         207 ($start, $end) = split '-', $data->{POSITION},2;
1047             } elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048 23         56 ($start, $end) = ($1, $2);
1049             }
1050             }
1051 301 100       883 if ($data->{DATA} =~ m{^\d+\s+\([a-z]+\s+(\d+)\s+to\s+(\d+)\)}xmso) {
1052 59         158 ($start, $end) = ($1, $2);
1053             }
1054             my $ref = Bio::Annotation::Reference->new(
1055             -comment => $data->{REMARK},
1056             -location => $data->{JOURNAL},
1057             -pubmed => $data->{PUBMED},
1058             -consortium => $data->{CONSRTM},
1059             -title => $data->{TITLE},
1060             -authors => $data->{AUTHORS},
1061             -medline => $data->{MEDLINE},
1062             -doi => $data->{DOI},
1063             -rp => $data->{POSITION}, # JIC...
1064 301         3419 -start => $start,
1065             -end => $end,
1066             );
1067 301 100       1418 if ($data->{DATA} =~ m{^\d+\s+\((.*)\)}xmso) {
1068 59         177 $ref->gb_reference($1);
1069             }
1070 301         690 $self->annotation_collection->add_Annotation('reference', $ref);
1071             }
1072              
1073             # GenBank COMMENT lines
1074             # EMBL CC lines
1075             # UniProt/SwissProt CC lines
1076             sub _generic_comment {
1077 43     43   76 my ($self, $data) = @_;
1078             $self->annotation_collection->add_Annotation('comment',
1079 43         122 Bio::Annotation::Comment->new( -text => $data->{DATA} ));
1080             }
1081              
1082             ####################### SEQFEATURE HANDLER #######################
1083              
1084             # GenBank Feature Table
1085             sub _generic_seqfeatures {
1086 848     848   889 my ($self, $data) = @_;
1087 848 100       1771 return if $data->{FEATURE_KEY} eq 'FEATURES';
1088 818         1021 my $primary_tag = $data->{FEATURE_KEY};
1089            
1090             # grab the NCBI taxon ID from the source SF
1091 818 100 66     1458 if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092 36 100 66     332 if ( $self->{'_params'}->{'-species'} &&
1093             $data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094 35         164 $self->{'_params'}->{'-species'}->ncbi_taxid($1);
1095             }
1096             }
1097 818         1526 my $source = $self->format;
1098            
1099 818         851 my $seqid = ${ $self->get_params('accession_number') }{'accession_number'};
  818         1519  
1100            
1101 818         1097 my $loc;
1102 818         1048 eval {
1103 818         3152 $loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1104             };
1105 818 50       1810 if(! $loc) {
1106             $self->warn("exception while parsing location line [" .
1107             $data->{'LOCATION'} .
1108             "] in reading $source, ignoring feature " .
1109 0         0 $data->{'primary_tag'}.
1110             " (seqid=" . $seqid . "): " . $@);
1111 0         0 return;
1112             }
1113 818 50 33     1652 if($seqid && (! $loc->is_remote())) {
1114 0         0 $loc->seq_id($seqid); # propagates if it is a split location
1115             }
1116 818         2697 my $sf = Bio::SeqFeature::Generic->direct_new();
1117 818         1872 $sf->location($loc);
1118 818         2030 $sf->primary_tag($primary_tag);
1119 818         1697 $sf->seq_id($seqid);
1120 818         1598 $sf->source_tag($source);
1121 818         1207 delete $data->{'FEATURE_KEY'};
1122 818         905 delete $data->{'LOCATION'};
1123 818         973 delete $data->{'NAME'};
1124 818         850 delete $data->{'DATA'};
1125 818         2490 $sf->set_attributes(-tag => $data);
1126 818         1040 push @{ $self->{'_params'}->{'-features'} }, $sf;
  818         3440  
1127             }
1128              
1129             ####################### ODDS AND ENDS #######################
1130              
1131             # Those things that don't fit anywhere else. If a specific name
1132             # maps to the below table, that class and method are used, otherwise
1133             # it goes into a SimpleValue (I think this is a good argument for why
1134             # we need a generic mechanism for storing annotation)
1135              
1136             sub _generic_simplevalue {
1137 16     16   29 my ($self, $data) = @_;
1138             $self->annotation_collection->add_Annotation(
1139             Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
1140             -value => $data->{DATA})
1141 16         61 );
1142             }
1143              
1144       15 0   sub noop {}
1145              
1146             sub _debug {
1147 0     0     my ($self, $data) = @_;
1148 0           $self->debug(Dumper($data));
1149             }
1150              
1151              
1152             1;