File Coverage

Bio/SeqIO/Handler/GenericRichSeqHandler.pm
Criterion Covered Total %
statement 324 411 78.8
branch 120 204 58.8
condition 36 61 59.0
subroutine 46 48 95.8
pod 11 12 91.6
total 537 736 72.9


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Chris Fields
7             #
8             # Copyright Chris Fields
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
17             data handler for GenBank/EMBL/UniProt (and other) sequence data
18              
19             =head1 SYNOPSIS
20              
21             # MyHandler is a GenericRichSeqHandler object.
22             # inside a parser (driver) constructor....
23              
24             $self->seq_handler($handler || MyHandler->new(-format => 'genbank'));
25              
26             # in next_seq() in driver...
27              
28             $hobj = $self->seqhandler();
29              
30             # roll data up into hashref chunks, pass off into Handler for processing...
31              
32             $hobj->data_handler($data);
33              
34             # or retrieve Handler methods and pass data directly to Handler methods...
35              
36             my $hmeth = $hobj->handler_methods;
37              
38             if ($hmeth->{ $data->{NAME} }) {
39             my $mth = $hmeth->{ $data->{NAME} };
40             $hobj->$mth($data);
41             }
42              
43             =head1 DESCRIPTION
44              
45             This is an experimental implementation of a sequence-based HandlerBaseI parser
46             and may change over time. It is possible (nay, likely) that the way handler
47             methods are set up will change over development to allow more flexibility.
48             Release pumpkins, please do not add this to a release until the API has settled.
49             It is also likely that write_seq() will not work properly for some data.
50              
51             Standard Developer caveats:
52              
53             Do not use for production purposes.
54             Not responsible for destroying (your data|computer|world).
55             Do not stare directly at GenericRichSeqHandler.
56             If GenericRichSeqHandler glows, back slowly away and call for help.
57              
58             Consider yourself warned!
59              
60             This class acts as a demonstration on how to handle similar data chunks derived
61             from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
62             using similar (or the same) handler methods.
63              
64             The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
65             t/swiss.t yet all use the same handler methods (the collected tests for handlers
66             can be found in t/Handler.t). Some tweaking of the methods themselves is
67             probably in order over the long run to ensure that data is consistently handled
68             for each parser. Round-trip tests are probably in order here...
69              
70             Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
71             bypassing of data based on builder slots has been implemented (yet); this is
72             planned in the near future.
73              
74             As a reminder: this is the current Annotation data chunk (via Data::Dumper):
75              
76             $VAR1 = {
77             'NAME' => 'REFERENCE',
78             'DATA' => '1 (bases 1 to 10001)'
79             'AUTHORS' => 'International Human Genome Sequencing Consortium.'
80             'TITLE' => 'The DNA sequence of Homo sapiens'
81             'JOURNAL' => 'Unpublished (2003)'
82             };
83             ...
84              
85             This is the current SeqFeature data chunk (again via Data::Dumper):
86              
87             $VAR1 = {
88             'mol_type' => 'genomic DNA',
89             'LOCATION' => '<1..>10001',
90             'NAME' => 'FEATURES',
91             'FEATURE_KEY' => 'source',
92             'note' => 'Accession AL451081 sequenced by The Sanger Centre',
93             'db_xref' => 'taxon:9606',
94             'clone' => 'RP11-302I18',
95             'organism' => 'Homo sapiens'
96             };
97              
98             =head1 FEEDBACK
99              
100             =head2 Mailing Lists
101              
102             User feedback is an integral part of the evolution of this and other
103             Bioperl modules. Send your comments and suggestions preferably to one
104             of the Bioperl mailing lists. Your participation is much appreciated.
105              
106             bioperl-l@bioperl.org - General discussion
107             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108              
109             =head2 Support
110              
111             Please direct usage questions or support issues to the mailing list:
112              
113             I
114              
115             rather than to the module maintainer directly. Many experienced and
116             reponsive experts will be able look at the problem and quickly
117             address it. Please include a thorough description of the problem
118             with code and data examples if at all possible.
119              
120             =head2 Reporting Bugs
121              
122             Report bugs to the Bioperl bug tracking system to help us keep track
123             the bugs and their resolution. Bug reports can be submitted via the
124             web:
125              
126             https://github.com/bioperl/bioperl-live/issues
127              
128             =head1 AUTHOR - Chris Fields
129              
130             Email cjfields at bioperl dot org
131              
132             =head1 APPENDIX
133              
134             The rest of the documentation details each of the object methods. Internal
135             methods are usually preceded with a _
136              
137             =cut
138              
139             # Let the code begin...
140              
141             package Bio::SeqIO::Handler::GenericRichSeqHandler;
142 1     1   3 use strict;
  1         1  
  1         23  
143 1     1   3 use warnings;
  1         0  
  1         18  
144              
145 1     1   653 use Bio::SeqIO::FTHelper;
  1         2  
  1         21  
146 1     1   5 use Bio::Annotation::Collection;
  1         1  
  1         16  
147 1     1   716 use Bio::Annotation::DBLink;
  1         2  
  1         19  
148 1     1   656 use Bio::Annotation::Comment;
  1         2  
  1         30  
149 1     1   724 use Bio::Annotation::Reference;
  1         3  
  1         23  
150 1     1   5 use Bio::Annotation::Collection;
  1         1  
  1         15  
151 1     1   3 use Bio::Annotation::SimpleValue;
  1         0  
  1         14  
152 1     1   717 use Bio::Annotation::TagTree;
  1         1  
  1         27  
153 1     1   4 use Bio::SeqFeature::Generic;
  1         1  
  1         14  
154 1     1   732 use Bio::Species;
  1         1  
  1         22  
155 1     1   4 use Bio::Taxon;
  1         2  
  1         12  
156 1     1   3 use Bio::DB::Taxonomy;
  1         2  
  1         13  
157 1     1   3 use Bio::Factory::FTLocationFactory;
  1         1  
  1         17  
158 1     1   3 use Data::Dumper;
  1         1  
  1         43  
159              
160 1     1   3 use base qw(Bio::Root::Root Bio::HandlerBaseI);
  1         1  
  1         731  
161              
162             my %HANDLERS = (
163             'genbank' => {
164             'LOCUS' => \&_genbank_locus,
165             'DEFINITION' => \&_generic_description,
166             'ACCESSION' => \&_generic_accession,
167             'VERSION' => \&_generic_version,
168             'KEYWORDS' => \&_generic_keywords,
169             'DBSOURCE' => \&_genbank_dbsource,
170             'DBLINK' => \&_genbank_dbsource,
171             'SOURCE' => \&_generic_species,
172             'REFERENCE' => \&_generic_reference,
173             'COMMENT' => \&_generic_comment,
174             'FEATURES' => \&_generic_seqfeatures,
175             'BASE' => \&noop, # this is generated from scratch
176             'ORIGIN' => \&_generic_seq,
177             # handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
178             '_DEFAULT_' => \&_generic_simplevalue,
179             },
180             'embl' => {
181             'ID' => \&_embl_id,
182             'DT' => \&_embl_date,
183             'DR' => \&_generic_dbsource,
184             'SV' => \&_generic_version,
185             'RN' => \&_generic_reference,
186             'KW' => \&_generic_keywords,
187             'DE' => \&_generic_description,
188             'AC' => \&_generic_accession,
189             #'AH' => \&noop, # TPA data not dealt with yet...
190             #'AS' => \&noop,
191             'SQ' => \&_generic_seq,
192             'OS' => \&_generic_species,
193             'CC' => \&_generic_comment,
194             'FT' => \&_generic_seqfeatures,
195             # handles anything else (WGS, TPA, ANN...)
196             '_DEFAULT_' => \&_generic_simplevalue,
197             },
198             'swiss' => {
199             'ID' => \&_swiss_id,
200             'DT' => \&_swiss_date,
201             'GN' => \&_swiss_genename,
202             'DR' => \&_generic_dbsource,
203             'RN' => \&_generic_reference,
204             'KW' => \&_generic_keywords,
205             'DE' => \&_generic_description,
206             'AC' => \&_generic_accession,
207             'SQ' => \&_generic_seq,
208             'OS' => \&_generic_species,
209             'CC' => \&_generic_comment,
210             'FT' => \&_generic_seqfeatures,
211             # handles anything else, though I don't know what...
212             '_DEFAULT_' => \&_generic_simplevalue,
213             },
214             );
215              
216             # can we do this generically? Seems like a lot of trouble...
217             my %DBSOURCE = map {$_ => 1} qw(
218             EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
219             TIGR GO InterPro Pfam PROSITE SGD GermOnline
220             HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
221             H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
222             MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
223             UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
224             PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
225             Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
226             TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
227             PhotoList Gramene WormBase WormPep Genew ZFIN
228             PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
229             swissprot GenBank GenPept REFSEQ embl PDB UniProtKB);
230              
231             my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
232              
233             our %VALID_ALPHABET = (
234             'bp' => 'dna',
235             'aa' => 'protein',
236             'rc' => '' # rc = release candidate; file has no sequences
237             );
238              
239             =head2 new
240              
241             Title : new
242             Usage :
243             Function:
244             Returns :
245             Args : -format Sequence format to be mapped for handler methods
246             -builder Bio::Seq::SeqBuilder object (normally defined in
247             SequenceStreamI object implementation constructor)
248             Throws : On undefined '-format' sequence format parameter
249             Note : Still under heavy development
250              
251             =cut
252              
253             sub new {
254 42     42 1 89 my ($class, @args) = @_;
255 42         108 my $self = $class->SUPER::new(@args);
256 42         107 $self = {@args};
257 42         110 bless $self,$class;
258 42         160 my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
259 42 50       112 $self->throw("Must define sequence record format") if !$format;
260 42         109 $self->format($format);
261 42         111 $self->handler_methods();
262 42 50       129 $builder && $self->seqbuilder($builder);
263 42         113 $self->location_factory();
264 42         155 return $self;
265             }
266              
267             =head1 L implementing methods
268              
269             =head2 handler_methods
270              
271             Title : handler_methods
272             Usage : $handler->handler_methods('GenBank')
273             %handlers = $handler->handler_methods();
274             Function: Retrieve the handler methods used for the current format() in
275             the handler. This assumes the handler methods are already
276             described in the HandlerI-implementing class.
277             Returns : a hash reference with the data type handled and the code ref
278             associated with it.
279             Args : [optional] String representing the sequence format. If set here
280             this will also set sequence_format()
281             Throws : On unimplemented sequence format in %HANDLERS
282              
283             =cut
284              
285             sub handler_methods {
286 75     75 1 81 my $self = shift;
287 75 100       164 if (!($self->{'handlers'})) {
288             $self->throw("No handlers defined for seqformat ",$self->format)
289 42 50       87 unless exists $HANDLERS{$self->format};
290 42         84 $self->{'handlers'} = $HANDLERS{$self->format};
291             }
292 75         91 return ($self->{'handlers'});
293             }
294              
295             =head2 data_handler
296              
297             Title : data_handler
298             Usage : $handler->data_handler($data)
299             Function: Centralized method which accepts all data chunks, then distributes
300             to the appropriate methods for processing based on the chunk name
301             from within the HandlerBaseI object.
302              
303             One can also use
304             Returns : None
305             Args : an hash ref containing a data chunk.
306              
307             =cut
308              
309             sub data_handler {
310 1646     1646 1 1582 my ($self, $data) = @_;
311 1646   33     2594 my $nm = $data->{NAME} || $self->throw("No name tag defined!");
312            
313             # this should handle data on the fly w/o caching; any caching should be
314             # done in the driver!
315             my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
316 1646 50       3250 (exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
    100          
317             undef;
318 1646 50       2361 if (!$method) {
319 0         0 $self->debug("No handler defined for $nm\n");
320 0         0 return;
321             };
322 1646         2501 $self->$method($data);
323             }
324              
325             =head2 reset_parameters
326              
327             Title : reset_parameters
328             Usage : $handler->reset_parameters()
329             Function: Resets the internal cache of data (normally object parameters for
330             a builder or factory)
331             Returns : None
332             Args : None
333              
334             =cut
335              
336             sub reset_parameters {
337 58     58 1 82 my $self = shift;
338 58         89 $self->{'_params'} = undef;
339             }
340              
341             =head2 format
342              
343             Title : format
344             Usage : $handler->format('GenBank')
345             Function: Get/Set the format for the report/record being parsed. This can be
346             used to set handlers in classes which are capable of processing
347             similar data chunks from multiple driver modules.
348             Returns : String with the sequence format
349             Args : [optional] String with the sequence format
350             Note : The format may be used to set the handlers (as in the
351             current GenericRichSeqHandler implementation)
352              
353             =cut
354              
355             sub format {
356 1301     1301 1 1070 my $self = shift;
357 1301 100       2059 return $self->{'_seqformat'} = lc shift if @_;
358 1259         1858 return $self->{'_seqformat'};
359             }
360              
361             =head2 get_params
362              
363             Title : get_params
364             Usage : $handler->get_params('-species')
365             Function: Convenience method used to retrieve the specified
366             parameters from the internal parameter cache
367             Returns : Hash ref containing parameters requested and data as
368             key-value pairs. Note that some parameter values may be
369             objects, arrays, etc.
370             Args : List (array) representing the parameters requested
371              
372             =cut
373              
374             sub get_params {
375 818     818 1 891 my ($self, @ids) = @_;
376 818         583 my %data;
377 818         901 for my $id (@ids) {
378 818 50       1472 if (!index($id, '-')==0) {
379 818         1413 $id = '-'.$id ;
380             }
381 818 100       2820 $data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
382             }
383 818         1426 return \%data;
384             }
385              
386             =head2 set_params
387              
388             Title : set_params
389             Usage : $handler->set_param({'-species')
390             Function: Convenience method used to set specific parameters
391             Returns : None
392             Args : Hash ref containing the data to be passed as key-value pairs
393              
394             =cut
395              
396             sub set_params {
397 0     0 1 0 shift->throw('Not implemented yet!');
398             }
399              
400             =head1 Methods unique to this implementation
401              
402             =head2 seqbuilder
403              
404             Title : seqbuilder
405             Usage :
406             Function:
407             Returns :
408             Args :
409             Throws :
410             Note :
411              
412             =cut
413              
414             sub seqbuilder {
415 104     104 1 118 my $self = shift;
416 104 100       224 return $self->{'_seqbuilder'} = shift if @_;
417 62         99 return $self->{'_seqbuilder'};
418             }
419              
420             =head2 build_sequence
421              
422             Title : build_sequence
423             Usage :
424             Function:
425             Returns :
426             Args :
427             Throws :
428             Note :
429              
430             =cut
431              
432             sub build_sequence {
433 62     62 1 87 my $self = shift;
434 62         126 my $builder = $self->seqbuilder();
435 62         58 my $seq;
436 62 100       119 if (defined($self->{'_params'})) {
437 58         69 $builder->add_slot_value(%{ $self->{'_params'} });
  58         448  
438 58         172 $seq = $builder->make_object();
439 58         131 $self->reset_parameters;
440             }
441 62 100       639 return $seq if $seq;
442 4         14 return 0;
443             }
444              
445             =head2 location_factory
446              
447             Title : location_factory
448             Usage :
449             Function:
450             Returns :
451             Args :
452             Throws :
453             Note :
454              
455             =cut
456              
457             sub location_factory {
458 42     42 1 49 my ($self, $factory) = @_;
459 42 50       145 if ($factory) {
    50          
460 0 0 0     0 $self->throw("Must have a Bio::Factory::LocationFactoryI when ".
461             "explicitly setting factory()") unless
462             (ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
463 0         0 $self->{'_locfactory'} = $factory;
464             } elsif (!defined($self->{'_locfactory'})) {
465 42         180 $self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new()
466             }
467 42         54 return $self->{'_locfactory'};
468             }
469              
470             =head2 annotation_collection
471              
472             Title : annotation_collection
473             Usage :
474             Function:
475             Returns :
476             Args :
477             Throws :
478             Note :
479              
480             =cut
481              
482             sub annotation_collection {
483 746     746 1 644 my ($self, $coll) = @_;
484 746 50       1611 if ($coll) {
    100          
485 0 0 0     0 $self->throw("Must have Bio::AnnotationCollectionI ".
486             "when explicitly setting collection()")
487             unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
488 0         0 $self->{'_params'}->{'-annotation'} = $coll;
489             } elsif (!exists($self->{'_params'}->{'-annotation'})) {
490 57         276 $self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new()
491             }
492 746         2205 return $self->{'_params'}->{'-annotation'};
493             }
494              
495             ####################### SEQUENCE HANDLERS #######################
496              
497             # any sequence data
498             sub _generic_seq {
499 53     53   72 my ($self, $data) = @_;
500 53         313 $self->{'_params'}->{'-seq'} = $data->{DATA};
501             }
502              
503             ####################### RAW DATA HANDLERS #######################
504              
505             # GenBank LOCUS line
506             sub _genbank_locus {
507 31     31   33 my ($self, $data) = @_;
508 31         189 my (@tokens) = split m{\s+}, $data->{DATA};
509 31         45 my $display_id = shift @tokens;
510 31         86 $self->{'_params'}->{'-display_id'} = $display_id;
511 31         37 my $seqlength = shift @tokens;
512 31 50       62 if (exists $VALID_ALPHABET{$seqlength}) {
513             # moved one token too far. No locus name?
514             $self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
515 0         0 "] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
516 0         0 $self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
517 0         0 $self->{'_params'}->{'-display_id'} = 'unknown';
518             # add token back...
519 0         0 unshift @tokens, $seqlength;
520             } else {
521 31         52 $self->{'_params'}->{'-length'} = $seqlength;
522             }
523 31         39 my $alphabet = lc(shift @tokens);
524             $self->{'_params'}->{'-alphabet'} =
525 31 50       78 (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
526             $self->warn("Unknown alphabet: $alphabet");
527 31 50 66     103 if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) {
528 31         58 $self->{'_params'}->{'-molecule'} = shift(@tokens);
529 31         43 my $circ = shift(@tokens);
530 31 100       54 if ($circ eq 'circular') {
531 2         6 $self->{'_params'}->{'-is_circular'} = 1;
532 2         6 $self->{'_params'}->{'-division'} = shift(@tokens);
533             } else {
534             # 'linear' or 'circular' may actually be omitted altogether
535 29 100       65 $self->{'_params'}->{'-division'} =
536             (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
537             }
538             } else {
539 0 0       0 $self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
540 0         0 $self->{'_params'}->{'-division'} = shift(@tokens);
541             }
542 31         64 my $date = join(' ', @tokens);
543             # maybe use Date::Time for dates?
544 31 100 66     246 if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) {
545            
546 30 50       55 if( length($date) < 11 ) {
547             # improperly formatted date
548             # But we'll be nice and fix it for them
549 0         0 my ($d,$m,$y) = ($2,$3,$4);
550 0 0       0 if( length($d) == 1 ) {
551 0         0 $d = "0$d";
552             }
553             # guess the century here
554 0 0       0 if( length($y) == 2 ) {
555 0 0       0 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
556 0         0 $y = "19$y";
557             } else {
558 0         0 $y = "20$y";
559             }
560 0         0 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
561             }
562 0         0 $self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
563             } else {
564 30         94 $self->{'_params'}->{'-dates'} = [$date];
565             }
566             }
567             }
568              
569             # EMBL ID line
570             sub _embl_id {
571 10     10   11 my ($self, $data) = @_;
572 10         13 my $alphabet;
573 10         14 my ($name, $sv, $topology, $mol, $div);
574 10         17 my $line = $data->{DATA};
575             #$self->debug("$line\n");
576 10         18 my ($idtype) = $line =~ tr/;/;/;
577 10 100       27 if ( $idtype == 6) { # New style headers contain exactly six semicolons.
    100          
578             # New style header (EMBL Release >= 87, after June 2006)
579 1         3 my $topology;
580             my $sv;
581            
582             # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
583             # This regexp comes from the new2old.pl conversion script, from EBI
584 1 50       7 if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
585 1         6 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
586             } else {
587 0         0 $self->throw("Unrecognized EMBL ID line:[$line]");
588             }
589 1 50       3 if (defined($sv)) {
590 1         3 $self->{'_params'}->{'-seq_version'} = $sv;
591 1         3 $self->{'_params'}->{'-version'} = $sv;
592             }
593              
594 1 50       2 if ($topology eq "circular") {
595 0         0 $self->{'_params'}->{'-is_circular'} = 1;
596             }
597            
598 1 50       3 if (defined $mol ) {
599 1 50       4 if ($mol =~ /DNA/) {
    50          
    0          
600 0         0 $alphabet='dna';
601             }
602             elsif ($mol =~ /RNA/) {
603 1         2 $alphabet='rna';
604             }
605             elsif ($mol =~ /AA/) {
606 0         0 $alphabet='protein';
607             }
608             }
609             } elsif ($idtype) { # has internal ';'
610             # Old style header (EMBL Release < 87, before June 2006)
611 8 50       55 if ($line =~ m{^(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;}) {
612 8         30 ($name, $mol, $div) = ($1, $2, $3);
613             #$self->debug("[$name][$mol][$div]");
614             }
615              
616 8 50       15 if($mol) {
617 8 50       20 if ( $mol =~ m{circular} ) {
618 0         0 $self->{'_params'}->{'-is_circular'} = 1;
619 0         0 $mol =~ s{circular }{};
620             }
621 8 50       19 if (defined $mol ) {
622 8 100       26 if ($mol =~ /DNA/) {
    50          
    0          
623 7         13 $alphabet='dna';
624             }
625             elsif ($mol =~ /RNA/) {
626 1         2 $alphabet='rna';
627             }
628             elsif ($mol =~ /AA/) {
629 0         0 $alphabet='protein';
630             }
631             }
632             }
633             } else {
634 1         2 $name = $data->{DATA};
635             }
636 10 50 33     52 unless( defined $name && length($name) ) {
637 0         0 $name = "unknown_id";
638             }
639 10         36 $self->{'_params'}->{'-display_id'} = $name;
640 10         16 $self->{'_params'}->{'-alphabet'} = $alphabet;
641 10 100       26 $self->{'_params'}->{'-division'} = $div if $div;
642 10 100       40 $self->{'_params'}->{'-molecule'} = $mol if $mol;
643             }
644              
645             # UniProt/SwissProt ID line
646             sub _swiss_id {
647 17     17   19 my ($self, $data) = @_;
648 17         18 my ($name, $seq_div);
649 17 50       112 if($data->{DATA} =~ m{^
650             (\S+) \s+ # $1 entryname
651             ([^\s;]+); \s+ # $2 DataClass
652             (?:PRT;)? \s+ # Molecule Type (optional)
653             [0-9]+[ ]AA \. # Sequencelength (capture?)
654             $
655             }ox ) {
656 17         47 ($name, $seq_div) = ($1, $2);
657 17 50 100     127 $self->{'_params'}->{'-namespace'} =
    100 66        
658             ($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ? 'Swiss-Prot' :
659             ($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ? 'TrEMBL' :
660             $seq_div;
661             # we shouldn't be setting the division, but for now...
662 17         52 my ($junk, $division) = split q(_), $name;
663 17         44 $self->{'_params'}->{'-division'} = $division;
664 17         28 $self->{'_params'}->{'-alphabet'} = 'protein';
665             # this is important to have the id for display in e.g. FTHelper, otherwise
666             # you won't know which entry caused an error
667 17         43 $self->{'_params'}->{'-display_id'} = $name;
668             } else {
669 0         0 $self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]");
670             }
671             }
672              
673             # UniProt/SwissProt GN line
674             sub _swiss_genename {
675 17     17   22 my ($self, $data) = @_;
676             #$self->debug(Dumper($data));
677 17         38 my $genename = $data->{DATA};
678 17         17 my $gn;
679 17 50       35 if ($genename) {
680 17         19 my @stags;
681 17 100       56 if ($genename =~ /\w=\w/) {
682             # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
683 10         35 for my $n (split(m{\s+and\s+},$genename)) {
684 12         10 my @genenames;
685 12         50 for my $section (split(m{\s*;\s*},$n)) {
686 15         43 my ($tag, $rest) = split("=",$section);
687 15   50     35 $rest ||= '';
688 15         49 for my $val (split(m{\s*,\s*},$rest)) {
689 19         47 push @genenames, [$tag => $val];
690             }
691             }
692 12         37 push @stags, ['gene_name' => \@genenames];
693             }
694             } else {
695             # old format
696 7         22 for my $section (split(/ AND /, $genename)) {
697 9         9 my @genenames;
698 9         30 $section =~ s/[\(\)\.]//g;
699 9         25 my @names = split(m{\s+OR\s+}, $section);
700 9         37 push @genenames, ['Name' => shift @names];
701 9         16 push @genenames, map {['Synonyms' => $_]} @names;
  10         14  
702 9         45 push @stags, ['gene_name' => \@genenames]
703             }
704             } #use Data::Dumper; print Dumper $gn, $genename;# exit;
705 17         129 my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
706             -value => ['gene_names' => \@stags]);
707 17         41 $self->annotation_collection->add_Annotation('gene_name', $gn);
708             }
709             }
710              
711             # GenBank VERSION line
712             # old EMBL SV line (now obsolete)
713             # UniProt/SwissProt?
714             sub _generic_version {
715 28     28   33 my ($self, $data) = @_;
716 28         83 my ($acc,$gi) = split(' ',$data->{DATA});
717 28 100       106 if($acc =~ m{^\w+\.(\d+)}xmso) {
718 27         64 $self->{'_params'}->{'-version'} = $1;
719 27         55 $self->{'_params'}->{'-seq_version'} = $1;
720             }
721 28 100 66     124 if($gi && (index($gi,"GI:") == 0)) {
722 24         80 $self->{'_params'}->{'-primary_id'} = substr($gi,3);
723             }
724             }
725              
726             # EMBL DT lines
727             sub _embl_date {
728 5     5   8 my ($self, $data) = @_;
729 5         35 while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) {
730 10         24 my ($date, $version) = ($1, $2);
731 10         11 $date =~ tr{,}{}d; # remove comma if new version
732 10 50       54 if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) {
    50          
733 0         0 my $release = Bio::Annotation::SimpleValue->new(
734             -tagname => 'creation_release',
735             -value => $1
736             );
737 0         0 $self->annotation_collection->add_Annotation($release);
738             } elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) {
739 0         0 my $release = Bio::Annotation::SimpleValue->new(
740             -tagname => 'update_release',
741             -value => $1
742             );
743 0         0 $self->annotation_collection->add_Annotation($release);
744 0         0 my $update = Bio::Annotation::SimpleValue->new(
745             -tagname => 'update_version',
746             -value => $2
747             );
748 0         0 $self->annotation_collection->add_Annotation($update);
749             }
750 10         10 push @{ $self->{'_params'}->{'-dates'} }, $date;
  10         51  
751             }
752             }
753              
754             # UniProt/SwissProt DT lines
755             sub _swiss_date {
756 17     17   27 my ($self, $data) = @_;
757             # swissprot
758 17         49 my @dls = split m{\n}, $data->{DATA};
759 17         44 for my $dl (@dls) {
760 51         104 my ($date, $version) = split(' ', $dl, 2);
761 51         75 $date =~ tr{,}{}d; # remove comma if new version
762 51 100 100     369 if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old
    100 100        
763             $version =~ m{sequence version (\d+)\.}) { #new
764 17         96 my $update = Bio::Annotation::SimpleValue->new(
765             -tagname => 'seq_update',
766             -value => $1
767             );
768 17         44 $self->annotation_collection->add_Annotation($update);
769             } elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old
770             $version =~ m{entry version (\d+)\.}) { #new
771 17         58 $self->{'_params'}->{'-version'} = $1;
772 17         28 $self->{'_params'}->{'-seq_version'} = $1;
773             }
774 51         46 push @{ $self->{'_params'}->{'-dates'} }, $date;
  51         146  
775             }
776             }
777              
778             # GenBank KEYWORDS line
779             # EMBL KW line
780             # UniProt/SwissProt KW line
781             sub _generic_keywords {
782 55     55   73 my ($self, $data) = @_;
783 55         184 $data->{DATA} =~ s{\.$}{};
784 55         366 my @kw = split m{\s*\;\s*}xo ,$data->{DATA};
785 55         131 $self->{'_params'}->{'-keywords'} = \@kw;
786             }
787              
788             # GenBank DEFINITION line
789             # EMBL DE line
790             # UniProt/SwissProt DE line
791             sub _generic_description {
792 57     57   63 my ($self, $data) = @_;
793 57         140 $self->{'_params'}->{'-desc'} = $data->{DATA};
794             }
795              
796             # GenBank ACCESSION line
797             # EMBL AC line
798             # UniProt/SwissProt AC line
799             sub _generic_accession {
800 56     56   70 my ($self, $data) = @_;
801 56         231 my @accs = split m{[\s;]+}, $data->{DATA};
802 56         129 $self->{'_params'}->{'-accession_number'} = shift @accs;
803 56 100       170 $self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
804             }
805              
806             ####################### SPECIES HANDLERS #######################
807              
808             # uses Bio::Species
809             # GenBank SOURCE, ORGANISM lines
810             # EMBL O* lines
811             # UniProt/SwissProt O* lines
812             sub _generic_species {
813 56     56   63 my ($self, $data) = @_;
814            
815 56         102 my $seqformat = $self->format;
816             # if data is coming in from GenBank parser...
817 56 100 66     364 if ($seqformat eq 'genbank' &&
818             $data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
819 30         110 ($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
820             }
821            
822             # SwissProt stuff...
823             # hybrid names in swissprot files are no longer valid per intergration into
824             # UniProt. Files containing these have been split into separate entries, so
825             # it is probably a good idea to update if one has these lingering around...
826              
827 56         67 my $taxid;
828 56 100       128 if ($seqformat eq 'swiss') {
829 17 50       79 if ($data->{DATA} =~ m{^([^,]+)}ox) {
830 17         54 $data->{DATA} = $1;
831             }
832 17 100 66     107 if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
833 16         31 $taxid = $1;
834             }
835             }
836            
837             my ($sl, $class, $sci_name) = ($data->{DATA},
838             $data->{CLASSIFICATION},
839 56   100     200 $data->{ORGANISM} || '');
840 56         66 my ($organelle,$abbr_name, $common);
841 56         550 my @class = reverse split m{\s*;\s*}, $class;
842             # have to treat swiss different from everything else...
843 56 50       527 if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? # GenBank format
844             \s*(.*?)
845             \s*(?: \( (.*?) \) )?\.?$
846             }xmso ){
847 56         157 ($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
848             } else {
849 0         0 $abbr_name = $sl; # nothing caught; this is a backup!
850             }
851             # there is no 'abbreviated name' for EMBL
852 56 100       141 $sci_name = $abbr_name if $seqformat ne 'genbank';
853 56   100     189 $organelle ||= '';
854 56   100     119 $common ||= '';
855 56 50       93 $sci_name || return;
856 56         81 unshift @class, $sci_name;
857             # no genus/species parsing here; moving to Bio::Taxon-based taxonomy
858 56         267 my $make = Bio::Species->new();
859 56         149 $make->scientific_name($sci_name);
860 56 50       219 $make->classification(@class) if @class > 0;
861 56 100       139 $common && $make->common_name( $common );
862 56 50       185 $abbr_name && $make->name('abbreviated', $abbr_name);
863 56 100       108 $organelle && $make->organelle($organelle);
864 56 100       109 $taxid && $make->ncbi_taxid($taxid);
865 56         281 $self->{'_params'}->{'-species'} = $make;
866             }
867              
868             ####################### ANNOTATION HANDLERS #######################
869              
870             # GenBank DBSOURCE line
871             sub _genbank_dbsource {
872 1     1   2 my ($self, $data) = @_;
873 1         2 my $dbsource = $data->{DATA};
874 1         2 my $annotation = $self->annotation_collection;
875             # deal with swissprot dbsources
876             # we could possibly parcel these out to subhandlers...
877 1 50       4 if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
878 0         0 $annotation->add_Annotation
879             ('dblink',
880             Bio::Annotation::DBLink->new
881             (-primary_id => $2,
882             -database => $1,
883             -tagname => 'dblink'));
884 0 0       0 if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
885 0         0 $annotation->add_Annotation
886             ('swissprot_dates',
887             Bio::Annotation::SimpleValue->new
888             (-tagname => 'date_created',
889             -value => $1));
890             }
891 0         0 while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
892 0         0 $annotation->add_Annotation
893             ('swissprot_dates',
894             Bio::Annotation::SimpleValue->new
895             (-tagname => 'date_updated',
896             -value => $1));
897             }
898 0         0 $dbsource =~ s/\n/ /g;
899 0 0       0 if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
    0          
900             # will use $i to determine even or odd
901             # for swissprot the accessions are paired
902 0         0 my $i = 0;
903 0         0 for my $dbsrc ( split(/,\s+/,$1) ) {
904 0 0 0     0 if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) {
905 0         0 my ($id,$version) = ($1,$2);
906 0 0       0 $version ='' unless defined $version;
907 0         0 my $db;
908 0 0       0 if( $id =~ /^\d\S{3}/) {
909 0         0 $db = 'PDB';
910             } else {
911 0 0       0 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
912             }
913 0         0 $annotation->add_Annotation
914             ('dblink',
915             Bio::Annotation::DBLink->new
916             (-primary_id => $id,
917             -version => $version,
918             -database => $db,
919             -tagname => 'dblink'));
920             }
921             }
922             } elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
923             # download screwed up and ncbi didn't put acc in for gi numbers
924 0         0 my $i = 0;
925 0         0 for my $id ( split(/\,\s+/,$1) ) {
926 0         0 my ($acc,$db);
927 0 0       0 if( $id =~ /gi:\s+(\d+)/ ) {
    0          
928 0         0 $acc= $1;
929 0 0       0 $db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
930             } elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
931 0         0 $acc= $1;
932 0         0 $db = 'PDB';
933             } else {
934 0         0 $acc= $id;
935 0         0 $db = '';
936             }
937 0         0 $annotation->add_Annotation
938             ('dblink',
939             Bio::Annotation::DBLink->new
940             (-primary_id => $acc,
941             -database => $db,
942             -tagname => 'dblink'));
943             }
944             } else {
945 0         0 $self->warn("Cannot match $dbsource\n");
946             }
947 0 0       0 if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+
948             ((?:\S+,\s+)+\S+)//x ) {
949 0         0 for my $id ( split(/\,\s+/,$1) ) {
950 0         0 my $db;
951             # this is because GenBank dropped the spaces!!!
952             # I'm sure we're not going to get this right
953             ##if( $id =~ s/^://i ) {
954             ## $db = $1;
955             ##}
956 0         0 $db = substr($id,0,index($id,':'));
957 0 0       0 if (! exists $DBSOURCE{ $db }) {
958 0         0 $db = ''; # do we want 'GenBank' here?
959             }
960 0         0 $id = substr($id,index($id,':')+1);
961 0         0 $annotation->add_Annotation
962             ('dblink',Bio::Annotation::DBLink->new
963             (-primary_id => $id,
964             -database => $db,
965             -tagname => 'dblink'));
966             }
967             }
968             } else {
969 1 50       6 if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) {
    0          
970 1         2 my ($db,$id,$version) = ($1,$2,$3);
971 1   50     11 $annotation->add_Annotation
972             ('dblink',
973             Bio::Annotation::DBLink->new
974             (-primary_id => $id,
975             -version => $version,
976             -database => $db || 'GenBank',
977             -tagname => 'dblink'));
978             } elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
979 0         0 my ($id, $db, $version);
980 0 0       0 if ($2 eq ':') {
981 0         0 ($db, $id) = ($1, $3);
982             } else {
983 0         0 ($db, $id, $version) = ('GenBank', $1, $3);
984             }
985 0         0 $annotation->add_Annotation('dblink',
986             Bio::Annotation::DBLink->new(
987             -primary_id => $id,
988             -version => $version,
989             -database => $db,
990             -tagname => 'dblink')
991             );
992             } else {
993 0         0 $self->warn("Unrecognized DBSOURCE data: $dbsource\n");
994             }
995             }
996             }
997              
998             # EMBL DR lines
999             # UniProt/SwissProt DR lines
1000             sub _generic_dbsource {
1001 20     20   25 my ($self, $data) = @_;
1002             #$self->debug(Dumper($data));
1003 20         124 while ($data->{DATA} =~ m{([^\n]+)}og) {
1004 351         599 my $dblink = $1;
1005 351         800 $dblink =~ s{\.$}{};
1006 351         308 my $link;
1007 351         649 my @linkdata = split '; ',$dblink;
1008 351 50       1221 if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1009             #if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) {
1010 351         631 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011 351         990 $link = Bio::Annotation::DBLink->new(-database => $databse,
1012             -primary_id => $prim_id,
1013             -optional_id => $sec_id);
1014             } else {
1015 0         0 $self->warn("No match for $dblink");
1016             }
1017 351         591 $self->annotation_collection->add_Annotation('dblink', $link);
1018             }
1019             }
1020              
1021              
1022             # GenBank REFERENCE and related lines
1023             # EMBL R* lines
1024             # UniProt/SwissProt R* lines
1025             sub _generic_reference {
1026 301     301   252 my ($self, $data) = @_;
1027 301         400 my $seqformat = $self->format;
1028 301         262 my ($start, $end);
1029             # get these in EMBL/Swiss
1030 301 100       448 if ($data->{CROSSREF}) {
1031 128         615 while ($data->{CROSSREF} =~ m{(pubmed|doi|medline)(?:=|;\s+)(\S+)}oig) {
1032 199         407 my ($db, $ref) = (uc $1, $2);
1033 199         407 $ref =~ s{[;.]+$}{};
1034 199         548 $data->{$db} = $ref;
1035             }
1036             }
1037             # run some cleanup for swissprot
1038 301 100       457 if ($seqformat eq 'swiss') {
1039 109         144 for my $val (values %{ $data }) {
  109         261  
1040 959         983 $val =~ s{;$}{};
1041 959         956 $val =~ s{(\w-)\s}{$1};
1042             }
1043             }
1044 301 100       430 if ( $data->{POSITION} ) {
1045 127 100       421 if ($seqformat eq 'embl') {
    100          
1046 18         53 ($start, $end) = split '-', $data->{POSITION},2;
1047             } elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048 23         42 ($start, $end) = ($1, $2);
1049             }
1050             }
1051 301 100       649 if ($data->{DATA} =~ m{^\d+\s+\([a-z]+\s+(\d+)\s+to\s+(\d+)\)}xmso) {
1052 59         98 ($start, $end) = ($1, $2);
1053             }
1054             my $ref = Bio::Annotation::Reference->new(
1055             -comment => $data->{REMARK},
1056             -location => $data->{JOURNAL},
1057             -pubmed => $data->{PUBMED},
1058             -consortium => $data->{CONSRTM},
1059             -title => $data->{TITLE},
1060             -authors => $data->{AUTHORS},
1061             -medline => $data->{MEDLINE},
1062             -doi => $data->{DOI},
1063             -rp => $data->{POSITION}, # JIC...
1064 301         2455 -start => $start,
1065             -end => $end,
1066             );
1067 301 100       1093 if ($data->{DATA} =~ m{^\d+\s+\((.*)\)}xmso) {
1068 59         147 $ref->gb_reference($1);
1069             }
1070 301         503 $self->annotation_collection->add_Annotation('reference', $ref);
1071             }
1072              
1073             # GenBank COMMENT lines
1074             # EMBL CC lines
1075             # UniProt/SwissProt CC lines
1076             sub _generic_comment {
1077 43     43   53 my ($self, $data) = @_;
1078             $self->annotation_collection->add_Annotation('comment',
1079 43         84 Bio::Annotation::Comment->new( -text => $data->{DATA} ));
1080             }
1081              
1082             ####################### SEQFEATURE HANDLER #######################
1083              
1084             # GenBank Feature Table
1085             sub _generic_seqfeatures {
1086 848     848   690 my ($self, $data) = @_;
1087 848 100       1377 return if $data->{FEATURE_KEY} eq 'FEATURES';
1088 818         720 my $primary_tag = $data->{FEATURE_KEY};
1089            
1090             # grab the NCBI taxon ID from the source SF
1091 818 100 66     1310 if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092 36 100 66     220 if ( $self->{'_params'}->{'-species'} &&
1093             $data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094 35         112 $self->{'_params'}->{'-species'}->ncbi_taxid($1);
1095             }
1096             }
1097 818         1161 my $source = $self->format;
1098            
1099 818         654 my $seqid = ${ $self->get_params('accession_number') }{'accession_number'};
  818         1026  
1100            
1101 818         878 my $loc;
1102 818         838 eval {
1103 818         2061 $loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1104             };
1105 818 50       1399 if(! $loc) {
1106             $self->warn("exception while parsing location line [" .
1107             $data->{'LOCATION'} .
1108             "] in reading $source, ignoring feature " .
1109 0         0 $data->{'primary_tag'}.
1110             " (seqid=" . $seqid . "): " . $@);
1111 0         0 return;
1112             }
1113 818 50 33     1282 if($seqid && (! $loc->is_remote())) {
1114 0         0 $loc->seq_id($seqid); # propagates if it is a split location
1115             }
1116 818         1919 my $sf = Bio::SeqFeature::Generic->direct_new();
1117 818         1208 $sf->location($loc);
1118 818         1231 $sf->primary_tag($primary_tag);
1119 818         1222 $sf->seq_id($seqid);
1120 818         1106 $sf->source_tag($source);
1121 818         977 delete $data->{'FEATURE_KEY'};
1122 818         808 delete $data->{'LOCATION'};
1123 818         835 delete $data->{'NAME'};
1124 818         597 delete $data->{'DATA'};
1125 818         1788 $sf->set_attributes(-tag => $data);
1126 818         902 push @{ $self->{'_params'}->{'-features'} }, $sf;
  818         2462  
1127             }
1128              
1129             ####################### ODDS AND ENDS #######################
1130              
1131             # Those things that don't fit anywhere else. If a specific name
1132             # maps to the below table, that class and method are used, otherwise
1133             # it goes into a SimpleValue (I think this is a good argument for why
1134             # we need a generic mechanism for storing annotation)
1135              
1136             sub _generic_simplevalue {
1137 16     16   16 my ($self, $data) = @_;
1138             $self->annotation_collection->add_Annotation(
1139             Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
1140             -value => $data->{DATA})
1141 16         41 );
1142             }
1143              
1144       15 0   sub noop {}
1145              
1146             sub _debug {
1147 0     0     my ($self, $data) = @_;
1148 0           $self->debug(Dumper($data));
1149             }
1150              
1151              
1152             1;