File Coverage

Bio/SeqIO/Handler/GenericRichSeqHandler.pm

Criterion	Covered	Total	%
statement	324	411	78.8
branch	120	204	58.8
condition	36	61	59.0
subroutine	46	48	95.8
pod	11	12	91.6
total	537	736	72.9

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Chris Fields
7							#
8							# Copyright Chris Fields
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
17							data handler for GenBank/EMBL/UniProt (and other) sequence data
18
19							=head1 SYNOPSIS
20
21							# MyHandler is a GenericRichSeqHandler object.
22							# inside a parser (driver) constructor....
23
24							$self->seq_handler($handler \|\| MyHandler->new(-format => 'genbank'));
25
26							# in next_seq() in driver...
27
28							$hobj = $self->seqhandler();
29
30							# roll data up into hashref chunks, pass off into Handler for processing...
31
32							$hobj->data_handler($data);
33
34							# or retrieve Handler methods and pass data directly to Handler methods...
35
36							my $hmeth = $hobj->handler_methods;
37
38							if ($hmeth->{ $data->{NAME} }) {
39							my $mth = $hmeth->{ $data->{NAME} };
40							$hobj->$mth($data);
41							}
42
43							=head1 DESCRIPTION
44
45							This is an experimental implementation of a sequence-based HandlerBaseI parser
46							and may change over time. It is possible (nay, likely) that the way handler
47							methods are set up will change over development to allow more flexibility.
48							Release pumpkins, please do not add this to a release until the API has settled.
49							It is also likely that write_seq() will not work properly for some data.
50
51							Standard Developer caveats:
52
53							Do not use for production purposes.
54							Not responsible for destroying (your data\|computer\|world).
55							Do not stare directly at GenericRichSeqHandler.
56							If GenericRichSeqHandler glows, back slowly away and call for help.
57
58							Consider yourself warned!
59
60							This class acts as a demonstration on how to handle similar data chunks derived
61							from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
62							using similar (or the same) handler methods.
63
64							The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
65							t/swiss.t yet all use the same handler methods (the collected tests for handlers
66							can be found in t/Handler.t). Some tweaking of the methods themselves is
67							probably in order over the long run to ensure that data is consistently handled
68							for each parser. Round-trip tests are probably in order here...
69
70							Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
71							bypassing of data based on builder slots has been implemented (yet); this is
72							planned in the near future.
73
74							As a reminder: this is the current Annotation data chunk (via Data::Dumper):
75
76							$VAR1 = {
77							'NAME' => 'REFERENCE',
78							'DATA' => '1 (bases 1 to 10001)'
79							'AUTHORS' => 'International Human Genome Sequencing Consortium.'
80							'TITLE' => 'The DNA sequence of Homo sapiens'
81							'JOURNAL' => 'Unpublished (2003)'
82							};
83							...
84
85							This is the current SeqFeature data chunk (again via Data::Dumper):
86
87							$VAR1 = {
88							'mol_type' => 'genomic DNA',
89							'LOCATION' => '<1..>10001',
90							'NAME' => 'FEATURES',
91							'FEATURE_KEY' => 'source',
92							'note' => 'Accession AL451081 sequenced by The Sanger Centre',
93							'db_xref' => 'taxon:9606',
94							'clone' => 'RP11-302I18',
95							'organism' => 'Homo sapiens'
96							};
97
98							=head1 FEEDBACK
99
100							=head2 Mailing Lists
101
102							User feedback is an integral part of the evolution of this and other
103							Bioperl modules. Send your comments and suggestions preferably to one
104							of the Bioperl mailing lists. Your participation is much appreciated.
105
106							bioperl-l@bioperl.org - General discussion
107							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108
109							=head2 Support
110
111							Please direct usage questions or support issues to the mailing list:
112
113							I
114
115							rather than to the module maintainer directly. Many experienced and
116							reponsive experts will be able look at the problem and quickly
117							address it. Please include a thorough description of the problem
118							with code and data examples if at all possible.
119
120							=head2 Reporting Bugs
121
122							Report bugs to the Bioperl bug tracking system to help us keep track
123							the bugs and their resolution. Bug reports can be submitted via the
124							web:
125
126							https://github.com/bioperl/bioperl-live/issues
127
128							=head1 AUTHOR - Chris Fields
129
130							Email cjfields at bioperl dot org
131
132							=head1 APPENDIX
133
134							The rest of the documentation details each of the object methods. Internal
135							methods are usually preceded with a _
136
137							=cut
138
139							# Let the code begin...
140
141							package Bio::SeqIO::Handler::GenericRichSeqHandler;
142	1			1		3	use strict;
	1					1
	1					21
143	1			1		3	use warnings;
	1					1
	1					16
144
145	1			1		687	use Bio::SeqIO::FTHelper;
	1					2
	1					23
146	1			1		4	use Bio::Annotation::Collection;
	1					1
	1					16
147	1			1		701	use Bio::Annotation::DBLink;
	1					2
	1					20
148	1			1		627	use Bio::Annotation::Comment;
	1					2
	1					20
149	1			1		675	use Bio::Annotation::Reference;
	1					1
	1					25
150	1			1		4	use Bio::Annotation::Collection;
	1					1
	1					15
151	1			1		3	use Bio::Annotation::SimpleValue;
	1					1
	1					13
152	1			1		690	use Bio::Annotation::TagTree;
	1					2
	1					24
153	1			1		4	use Bio::SeqFeature::Generic;
	1					1
	1					15
154	1			1		688	use Bio::Species;
	1					2
	1					21
155	1			1		4	use Bio::Taxon;
	1					2
	1					12
156	1			1		3	use Bio::DB::Taxonomy;
	1					1
	1					14
157	1			1		3	use Bio::Factory::FTLocationFactory;
	1					1
	1					18
158	1			1		3	use Data::Dumper;
	1					1
	1					44
159
160	1			1		3	use base qw(Bio::Root::Root Bio::HandlerBaseI);
	1					1
	1					742
161
162							my %HANDLERS = (
163							'genbank' => {
164							'LOCUS' => \&_genbank_locus,
165							'DEFINITION' => \&_generic_description,
166							'ACCESSION' => \&_generic_accession,
167							'VERSION' => \&_generic_version,
168							'KEYWORDS' => \&_generic_keywords,
169							'DBSOURCE' => \&_genbank_dbsource,
170							'DBLINK' => \&_genbank_dbsource,
171							'SOURCE' => \&_generic_species,
172							'REFERENCE' => \&_generic_reference,
173							'COMMENT' => \&_generic_comment,
174							'FEATURES' => \&_generic_seqfeatures,
175							'BASE' => \&noop, # this is generated from scratch
176							'ORIGIN' => \&_generic_seq,
177							# handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
178							'_DEFAULT_' => \&_generic_simplevalue,
179							},
180							'embl' => {
181							'ID' => \&_embl_id,
182							'DT' => \&_embl_date,
183							'DR' => \&_generic_dbsource,
184							'SV' => \&_generic_version,
185							'RN' => \&_generic_reference,
186							'KW' => \&_generic_keywords,
187							'DE' => \&_generic_description,
188							'AC' => \&_generic_accession,
189							#'AH' => \&noop, # TPA data not dealt with yet...
190							#'AS' => \&noop,
191							'SQ' => \&_generic_seq,
192							'OS' => \&_generic_species,
193							'CC' => \&_generic_comment,
194							'FT' => \&_generic_seqfeatures,
195							# handles anything else (WGS, TPA, ANN...)
196							'_DEFAULT_' => \&_generic_simplevalue,
197							},
198							'swiss' => {
199							'ID' => \&_swiss_id,
200							'DT' => \&_swiss_date,
201							'GN' => \&_swiss_genename,
202							'DR' => \&_generic_dbsource,
203							'RN' => \&_generic_reference,
204							'KW' => \&_generic_keywords,
205							'DE' => \&_generic_description,
206							'AC' => \&_generic_accession,
207							'SQ' => \&_generic_seq,
208							'OS' => \&_generic_species,
209							'CC' => \&_generic_comment,
210							'FT' => \&_generic_seqfeatures,
211							# handles anything else, though I don't know what...
212							'_DEFAULT_' => \&_generic_simplevalue,
213							},
214							);
215
216							# can we do this generically? Seems like a lot of trouble...
217							my %DBSOURCE = map {$_ => 1} qw(
218							EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
219							TIGR GO InterPro Pfam PROSITE SGD GermOnline
220							HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
221							H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
222							MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
223							UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
224							PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
225							Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
226							TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
227							PhotoList Gramene WormBase WormPep Genew ZFIN
228							PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
229							swissprot GenBank GenPept REFSEQ embl PDB UniProtKB);
230
231							my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
232
233							our %VALID_ALPHABET = (
234							'bp' => 'dna',
235							'aa' => 'protein',
236							'rc' => '' # rc = release candidate; file has no sequences
237							);
238
239							=head2 new
240
241							Title : new
242							Usage :
243							Function:
244							Returns :
245							Args : -format Sequence format to be mapped for handler methods
246							-builder Bio::Seq::SeqBuilder object (normally defined in
247							SequenceStreamI object implementation constructor)
248							Throws : On undefined '-format' sequence format parameter
249							Note : Still under heavy development
250
251							=cut
252
253							sub new {
254	42			42	1	123	my ($class, @args) = @_;
255	42					170	my $self = $class->SUPER::new(@args);
256	42					154	$self = {@args};
257	42					209	bless $self,$class;
258	42					188	my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
259	42	50				155	$self->throw("Must define sequence record format") if !$format;
260	42					170	$self->format($format);
261	42					128	$self->handler_methods();
262	42	50				183	$builder && $self->seqbuilder($builder);
263	42					138	$self->location_factory();
264	42					210	return $self;
265							}
266
267							=head1 L implementing methods
268
269							=head2 handler_methods
270
271							Title : handler_methods
272							Usage : $handler->handler_methods('GenBank')
273							%handlers = $handler->handler_methods();
274							Function: Retrieve the handler methods used for the current format() in
275							the handler. This assumes the handler methods are already
276							described in the HandlerI-implementing class.
277							Returns : a hash reference with the data type handled and the code ref
278							associated with it.
279							Args : [optional] String representing the sequence format. If set here
280							this will also set sequence_format()
281							Throws : On unimplemented sequence format in %HANDLERS
282
283							=cut
284
285							sub handler_methods {
286	75			75	1	103	my $self = shift;
287	75	100				187	if (!($self->{'handlers'})) {
288							$self->throw("No handlers defined for seqformat ",$self->format)
289	42	50				119	unless exists $HANDLERS{$self->format};
290	42					113	$self->{'handlers'} = $HANDLERS{$self->format};
291							}
292	75					143	return ($self->{'handlers'});
293							}
294
295							=head2 data_handler
296
297							Title : data_handler
298							Usage : $handler->data_handler($data)
299							Function: Centralized method which accepts all data chunks, then distributes
300							to the appropriate methods for processing based on the chunk name
301							from within the HandlerBaseI object.
302
303							One can also use
304							Returns : None
305							Args : an hash ref containing a data chunk.
306
307							=cut
308
309							sub data_handler {
310	1646			1646	1	1977	my ($self, $data) = @_;
311	1646		33			3629	my $nm = $data->{NAME} \|\| $self->throw("No name tag defined!");
312
313							# this should handle data on the fly w/o caching; any caching should be
314							# done in the driver!
315							my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
316	1646	50				4123	(exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
		100
317							undef;
318	1646	50				2620	if (!$method) {
319	0					0	$self->debug("No handler defined for $nm\n");
320	0					0	return;
321							};
322	1646					3156	$self->$method($data);
323							}
324
325							=head2 reset_parameters
326
327							Title : reset_parameters
328							Usage : $handler->reset_parameters()
329							Function: Resets the internal cache of data (normally object parameters for
330							a builder or factory)
331							Returns : None
332							Args : None
333
334							=cut
335
336							sub reset_parameters {
337	58			58	1	115	my $self = shift;
338	58					150	$self->{'_params'} = undef;
339							}
340
341							=head2 format
342
343							Title : format
344							Usage : $handler->format('GenBank')
345							Function: Get/Set the format for the report/record being parsed. This can be
346							used to set handlers in classes which are capable of processing
347							similar data chunks from multiple driver modules.
348							Returns : String with the sequence format
349							Args : [optional] String with the sequence format
350							Note : The format may be used to set the handlers (as in the
351							current GenericRichSeqHandler implementation)
352
353							=cut
354
355							sub format {
356	1301			1301	1	1351	my $self = shift;
357	1301	100				2312	return $self->{'_seqformat'} = lc shift if @_;
358	1259					2782	return $self->{'_seqformat'};
359							}
360
361							=head2 get_params
362
363							Title : get_params
364							Usage : $handler->get_params('-species')
365							Function: Convenience method used to retrieve the specified
366							parameters from the internal parameter cache
367							Returns : Hash ref containing parameters requested and data as
368							key-value pairs. Note that some parameter values may be
369							objects, arrays, etc.
370							Args : List (array) representing the parameters requested
371
372							=cut
373
374							sub get_params {
375	818			818	1	1299	my ($self, @ids) = @_;
376	818					668	my %data;
377	818					1389	for my $id (@ids) {
378	818	50				2085	if (!index($id, '-')==0) {
379	818					1800	$id = '-'.$id ;
380							}
381	818	100				3709	$data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
382							}
383	818					1870	return \%data;
384							}
385
386							=head2 set_params
387
388							Title : set_params
389							Usage : $handler->set_param({'-species')
390							Function: Convenience method used to set specific parameters
391							Returns : None
392							Args : Hash ref containing the data to be passed as key-value pairs
393
394							=cut
395
396							sub set_params {
397	0			0	1	0	shift->throw('Not implemented yet!');
398							}
399
400							=head1 Methods unique to this implementation
401
402							=head2 seqbuilder
403
404							Title : seqbuilder
405							Usage :
406							Function:
407							Returns :
408							Args :
409							Throws :
410							Note :
411
412							=cut
413
414							sub seqbuilder {
415	104			104	1	138	my $self = shift;
416	104	100				370	return $self->{'_seqbuilder'} = shift if @_;
417	62					137	return $self->{'_seqbuilder'};
418							}
419
420							=head2 build_sequence
421
422							Title : build_sequence
423							Usage :
424							Function:
425							Returns :
426							Args :
427							Throws :
428							Note :
429
430							=cut
431
432							sub build_sequence {
433	62			62	1	120	my $self = shift;
434	62					228	my $builder = $self->seqbuilder();
435	62					155	my $seq;
436	62	100				189	if (defined($self->{'_params'})) {
437	58					75	$builder->add_slot_value(%{ $self->{'_params'} });
	58					783
438	58					243	$seq = $builder->make_object();
439	58					229	$self->reset_parameters;
440							}
441	62	100				1073	return $seq if $seq;
442	4					22	return 0;
443							}
444
445							=head2 location_factory
446
447							Title : location_factory
448							Usage :
449							Function:
450							Returns :
451							Args :
452							Throws :
453							Note :
454
455							=cut
456
457							sub location_factory {
458	42			42	1	69	my ($self, $factory) = @_;
459	42	50				193	if ($factory) {
		50
460	0	0	0			0	$self->throw("Must have a Bio::Factory::LocationFactoryI when ".
461							"explicitly setting factory()") unless
462							(ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
463	0					0	$self->{'_locfactory'} = $factory;
464							} elsif (!defined($self->{'_locfactory'})) {
465	42					250	$self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new()
466							}
467	42					71	return $self->{'_locfactory'};
468							}
469
470							=head2 annotation_collection
471
472							Title : annotation_collection
473							Usage :
474							Function:
475							Returns :
476							Args :
477							Throws :
478							Note :
479
480							=cut
481
482							sub annotation_collection {
483	746			746	1	957	my ($self, $coll) = @_;
484	746	50				1997	if ($coll) {
		100
485	0	0	0			0	$self->throw("Must have Bio::AnnotationCollectionI ".
486							"when explicitly setting collection()")
487							unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
488	0					0	$self->{'_params'}->{'-annotation'} = $coll;
489							} elsif (!exists($self->{'_params'}->{'-annotation'})) {
490	57					538	$self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new()
491							}
492	746					2578	return $self->{'_params'}->{'-annotation'};
493							}
494
495							####################### SEQUENCE HANDLERS #######################
496
497							# any sequence data
498							sub _generic_seq {
499	53			53		87	my ($self, $data) = @_;
500	53					409	$self->{'_params'}->{'-seq'} = $data->{DATA};
501							}
502
503							####################### RAW DATA HANDLERS #######################
504
505							# GenBank LOCUS line
506							sub _genbank_locus {
507	31			31		51	my ($self, $data) = @_;
508	31					266	my (@tokens) = split m{\s+}, $data->{DATA};
509	31					57	my $display_id = shift @tokens;
510	31					120	$self->{'_params'}->{'-display_id'} = $display_id;
511	31					58	my $seqlength = shift @tokens;
512	31	50				92	if (exists $VALID_ALPHABET{$seqlength}) {
513							# moved one token too far. No locus name?
514							$self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
515	0					0	"] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
516	0					0	$self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
517	0					0	$self->{'_params'}->{'-display_id'} = 'unknown';
518							# add token back...
519	0					0	unshift @tokens, $seqlength;
520							} else {
521	31					91	$self->{'_params'}->{'-length'} = $seqlength;
522							}
523	31					108	my $alphabet = lc(shift @tokens);
524							$self->{'_params'}->{'-alphabet'} =
525	31	50				107	(exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
526							$self->warn("Unknown alphabet: $alphabet");
527	31	50	66			157	if (($self->{'_params'}->{'-alphabet'} eq 'dna') \|\| (@tokens > 2)) {
528	31					71	$self->{'_params'}->{'-molecule'} = shift(@tokens);
529	31					67	my $circ = shift(@tokens);
530	31	100				85	if ($circ eq 'circular') {
531	2					5	$self->{'_params'}->{'-is_circular'} = 1;
532	2					4	$self->{'_params'}->{'-division'} = shift(@tokens);
533							} else {
534							# 'linear' or 'circular' may actually be omitted altogether
535	29	100				98	$self->{'_params'}->{'-division'} =
536							(CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
537							}
538							} else {
539	0	0				0	$self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
540	0					0	$self->{'_params'}->{'-division'} = shift(@tokens);
541							}
542	31					170	my $date = join(' ', @tokens);
543							# maybe use Date::Time for dates?
544	31	100	66			346	if($date && $date =~ s{\s((\d{1,2})-(\w{3})-(\d{2,4})).}{$1}) {
545
546	30	50				76	if( length($date) < 11 ) {
547							# improperly formatted date
548							# But we'll be nice and fix it for them
549	0					0	my ($d,$m,$y) = ($2,$3,$4);
550	0	0				0	if( length($d) == 1 ) {
551	0					0	$d = "0$d";
552							}
553							# guess the century here
554	0	0				0	if( length($y) == 2 ) {
555	0	0				0	if( $y > 60 ) { # arbitrarily guess that '60' means 1960
556	0					0	$y = "19$y";
557							} else {
558	0					0	$y = "20$y";
559							}
560	0					0	$self->warn("Date was malformed, guessing the century for $date to be $y\n");
561							}
562	0					0	$self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
563							} else {
564	30					144	$self->{'_params'}->{'-dates'} = [$date];
565							}
566							}
567							}
568
569							# EMBL ID line
570							sub _embl_id {
571	10			10		22	my ($self, $data) = @_;
572	10					20	my $alphabet;
573	10					19	my ($name, $sv, $topology, $mol, $div);
574	10					32	my $line = $data->{DATA};
575							#$self->debug("$line\n");
576	10					35	my ($idtype) = $line =~ tr/;/;/;
577	10	100				44	if ( $idtype == 6) { # New style headers contain exactly six semicolons.
		100
578							# New style header (EMBL Release >= 87, after June 2006)
579	1					4	my $topology;
580							my $sv;
581
582							# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
583							# This regexp comes from the new2old.pl conversion script, from EBI
584	1	50				13	if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
585	1					11	($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
586							} else {
587	0					0	$self->throw("Unrecognized EMBL ID line:[$line]");
588							}
589	1	50				6	if (defined($sv)) {
590	1					7	$self->{'_params'}->{'-seq_version'} = $sv;
591	1					3	$self->{'_params'}->{'-version'} = $sv;
592							}
593
594	1	50				5	if ($topology eq "circular") {
595	0					0	$self->{'_params'}->{'-is_circular'} = 1;
596							}
597
598	1	50				4	if (defined $mol ) {
599	1	50				8	if ($mol =~ /DNA/) {
		50
		0
600	0					0	$alphabet='dna';
601							}
602							elsif ($mol =~ /RNA/) {
603	1					3	$alphabet='rna';
604							}
605							elsif ($mol =~ /AA/) {
606	0					0	$alphabet='protein';
607							}
608							}
609							} elsif ($idtype) { # has internal ';'
610							# Old style header (EMBL Release < 87, before June 2006)
611	8	50				97	if ($line =~ m{^(\S+)[^;];\s+(\S+)[^;];\s+(\S+)[^;]*;}) {
612	8					51	($name, $mol, $div) = ($1, $2, $3);
613							#$self->debug("[$name][$mol][$div]");
614							}
615
616	8	50				27	if($mol) {
617	8	50				30	if ( $mol =~ m{circular} ) {
618	0					0	$self->{'_params'}->{'-is_circular'} = 1;
619	0					0	$mol =~ s{circular }{};
620							}
621	8	50				26	if (defined $mol ) {
622	8	100				46	if ($mol =~ /DNA/) {
		50
		0
623	7					26	$alphabet='dna';
624							}
625							elsif ($mol =~ /RNA/) {
626	1					3	$alphabet='rna';
627							}
628							elsif ($mol =~ /AA/) {
629	0					0	$alphabet='protein';
630							}
631							}
632							}
633							} else {
634	1					5	$name = $data->{DATA};
635							}
636	10	50	33			79	unless( defined $name && length($name) ) {
637	0					0	$name = "unknown_id";
638							}
639	10					55	$self->{'_params'}->{'-display_id'} = $name;
640	10					27	$self->{'_params'}->{'-alphabet'} = $alphabet;
641	10	100				45	$self->{'_params'}->{'-division'} = $div if $div;
642	10	100				65	$self->{'_params'}->{'-molecule'} = $mol if $mol;
643							}
644
645							# UniProt/SwissProt ID line
646							sub _swiss_id {
647	17			17		29	my ($self, $data) = @_;
648	17					27	my ($name, $seq_div);
649	17	50				181	if($data->{DATA} =~ m{^
650							(\S+) \s+ # $1 entryname
651							([^\s;]+); \s+ # $2 DataClass
652							(?:PRT;)? \s+ # Molecule Type (optional)
653							[0-9]+[ ]AA \. # Sequencelength (capture?)
654							$
655							}ox ) {
656	17					78	($name, $seq_div) = ($1, $2);
657	17	50	100			166	$self->{'_params'}->{'-namespace'} =
		100	66
658							($seq_div eq 'Reviewed' \|\| $seq_div eq 'STANDARD') ? 'Swiss-Prot' :
659							($seq_div eq 'Unreviewed' \|\| $seq_div eq 'PRELIMINARY') ? 'TrEMBL' :
660							$seq_div;
661							# we shouldn't be setting the division, but for now...
662	17					76	my ($junk, $division) = split q(_), $name;
663	17					57	$self->{'_params'}->{'-division'} = $division;
664	17					45	$self->{'_params'}->{'-alphabet'} = 'protein';
665							# this is important to have the id for display in e.g. FTHelper, otherwise
666							# you won't know which entry caused an error
667	17					72	$self->{'_params'}->{'-display_id'} = $name;
668							} else {
669	0					0	$self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]");
670							}
671							}
672
673							# UniProt/SwissProt GN line
674							sub _swiss_genename {
675	17			17		34	my ($self, $data) = @_;
676							#$self->debug(Dumper($data));
677	17					45	my $genename = $data->{DATA};
678	17					26	my $gn;
679	17	50				51	if ($genename) {
680	17					24	my @stags;
681	17	100				86	if ($genename =~ /\w=\w/) {
682							# new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
683	10					45	for my $n (split(m{\s+and\s+},$genename)) {
684	12					30	my @genenames;
685	12					62	for my $section (split(m{\s;\s},$n)) {
686	15					42	my ($tag, $rest) = split("=",$section);
687	15		50			42	$rest \|\|= '';
688	15					33	for my $val (split(m{\s,\s},$rest)) {
689	19					63	push @genenames, [$tag => $val];
690							}
691							}
692	12					42	push @stags, ['gene_name' => \@genenames];
693							}
694							} else {
695							# old format
696	7					37	for my $section (split(/ AND /, $genename)) {
697	9					15	my @genenames;
698	9					49	$section =~ s/[\.]//g;
699	9					49	my @names = split(m{\s+OR\s+}, $section);
700	9					34	push @genenames, ['Name' => shift @names];
701	9					24	push @genenames, map {['Synonyms' => $_]} @names;
	10					31
702	9					44	push @stags, ['gene_name' => \@genenames]
703							}
704							} #use Data::Dumper; print Dumper $gn, $genename;# exit;
705	17					205	my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
706							-value => ['gene_names' => \@stags]);
707	17					60	$self->annotation_collection->add_Annotation('gene_name', $gn);
708							}
709							}
710
711							# GenBank VERSION line
712							# old EMBL SV line (now obsolete)
713							# UniProt/SwissProt?
714							sub _generic_version {
715	28			28		70	my ($self, $data) = @_;
716	28					194	my ($acc,$gi) = split(' ',$data->{DATA});
717	28	100				172	if($acc =~ m{^\w+\.(\d+)}xmso) {
718	27					81	$self->{'_params'}->{'-version'} = $1;
719	27					65	$self->{'_params'}->{'-seq_version'} = $1;
720							}
721	28	100	66			160	if($gi && (index($gi,"GI:") == 0)) {
722	24					106	$self->{'_params'}->{'-primary_id'} = substr($gi,3);
723							}
724							}
725
726							# EMBL DT lines
727							sub _embl_date {
728	5			5		11	my ($self, $data) = @_;
729	5					59	while ($data->{DATA} =~ m{(\S+)\s$(.*?)$}g) {
730	10					36	my ($date, $version) = ($1, $2);
731	10					21	$date =~ tr{,}{}d; # remove comma if new version
732	10	50				49	if ($version =~ m{$Rel\.\s(\d+),\sCreated$}xmso ) {
		50
733	0					0	my $release = Bio::Annotation::SimpleValue->new(
734							-tagname => 'creation_release',
735							-value => $1
736							);
737	0					0	$self->annotation_collection->add_Annotation($release);
738							} elsif ($version =~ m{$Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)$}xmso ) {
739	0					0	my $release = Bio::Annotation::SimpleValue->new(
740							-tagname => 'update_release',
741							-value => $1
742							);
743	0					0	$self->annotation_collection->add_Annotation($release);
744	0					0	my $update = Bio::Annotation::SimpleValue->new(
745							-tagname => 'update_version',
746							-value => $2
747							);
748	0					0	$self->annotation_collection->add_Annotation($update);
749							}
750	10					13	push @{ $self->{'_params'}->{'-dates'} }, $date;
	10					79
751							}
752							}
753
754							# UniProt/SwissProt DT lines
755							sub _swiss_date {
756	17			17		30	my ($self, $data) = @_;
757							# swissprot
758	17					75	my @dls = split m{\n}, $data->{DATA};
759	17					49	for my $dl (@dls) {
760	51					133	my ($date, $version) = split(' ', $dl, 2);
761	51					89	$date =~ tr{,}{}d; # remove comma if new version
762	51	100	100			547	if ($version =~ m{$Rel\. (\d+), Last sequence update$} \|\| # old
		100	100
763							$version =~ m{sequence version (\d+)\.}) { #new
764	17					187	my $update = Bio::Annotation::SimpleValue->new(
765							-tagname => 'seq_update',
766							-value => $1
767							);
768	17					70	$self->annotation_collection->add_Annotation($update);
769							} elsif ($version =~ m{$Rel\. (\d+), Last annotation update$} \|\| #old
770							$version =~ m{entry version (\d+)\.}) { #new
771	17					87	$self->{'_params'}->{'-version'} = $1;
772	17					56	$self->{'_params'}->{'-seq_version'} = $1;
773							}
774	51					55	push @{ $self->{'_params'}->{'-dates'} }, $date;
	51					229
775							}
776							}
777
778							# GenBank KEYWORDS line
779							# EMBL KW line
780							# UniProt/SwissProt KW line
781							sub _generic_keywords {
782	55			55		126	my ($self, $data) = @_;
783	55					286	$data->{DATA} =~ s{\.$}{};
784	55					518	my @kw = split m{\s\;\s}xo ,$data->{DATA};
785	55					213	$self->{'_params'}->{'-keywords'} = \@kw;
786							}
787
788							# GenBank DEFINITION line
789							# EMBL DE line
790							# UniProt/SwissProt DE line
791							sub _generic_description {
792	57			57		121	my ($self, $data) = @_;
793	57					233	$self->{'_params'}->{'-desc'} = $data->{DATA};
794							}
795
796							# GenBank ACCESSION line
797							# EMBL AC line
798							# UniProt/SwissProt AC line
799							sub _generic_accession {
800	56			56		104	my ($self, $data) = @_;
801	56					337	my @accs = split m{[\s;]+}, $data->{DATA};
802	56					186	$self->{'_params'}->{'-accession_number'} = shift @accs;
803	56	100				245	$self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
804							}
805
806							####################### SPECIES HANDLERS #######################
807
808							# uses Bio::Species
809							# GenBank SOURCE, ORGANISM lines
810							# EMBL O* lines
811							# UniProt/SwissProt O* lines
812							sub _generic_species {
813	56			56		94	my ($self, $data) = @_;
814
815	56					180	my $seqformat = $self->format;
816							# if data is coming in from GenBank parser...
817	56	100	66			506	if ($seqformat eq 'genbank' &&
818							$data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
819	30					200	($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
820							}
821
822							# SwissProt stuff...
823							# hybrid names in swissprot files are no longer valid per intergration into
824							# UniProt. Files containing these have been split into separate entries, so
825							# it is probably a good idea to update if one has these lingering around...
826
827	56					93	my $taxid;
828	56	100				143	if ($seqformat eq 'swiss') {
829	17	50				116	if ($data->{DATA} =~ m{^([^,]+)}ox) {
830	17					60	$data->{DATA} = $1;
831							}
832	17	100	66			153	if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
833	16					39	$taxid = $1;
834							}
835							}
836
837							my ($sl, $class, $sci_name) = ($data->{DATA},
838							$data->{CLASSIFICATION},
839	56		100			320	$data->{ORGANISM} \|\| '');
840	56					85	my ($organelle,$abbr_name, $common);
841	56					986	my @class = reverse split m{\s;\s}, $class;
842							# have to treat swiss different from everything else...
843	56	50				703	if ($sl =~ m{^(mitochondrion\|chloroplast\|plastid)? # GenBank format
844							\s(.?)
845							\s(?: $ (.?) $ )?\.?$
846							}xmso ){
847	56					364	($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
848							} else {
849	0					0	$abbr_name = $sl; # nothing caught; this is a backup!
850							}
851							# there is no 'abbreviated name' for EMBL
852	56	100				177	$sci_name = $abbr_name if $seqformat ne 'genbank';
853	56		100			225	$organelle \|\|= '';
854	56		100			186	$common \|\|= '';
855	56	50				127	$sci_name \|\| return;
856	56					128	unshift @class, $sci_name;
857							# no genus/species parsing here; moving to Bio::Taxon-based taxonomy
858	56					559	my $make = Bio::Species->new();
859	56					217	$make->scientific_name($sci_name);
860	56	50				454	$make->classification(@class) if @class > 0;
861	56	100				260	$common && $make->common_name( $common );
862	56	50				297	$abbr_name && $make->name('abbreviated', $abbr_name);
863	56	100				135	$organelle && $make->organelle($organelle);
864	56	100				219	$taxid && $make->ncbi_taxid($taxid);
865	56					643	$self->{'_params'}->{'-species'} = $make;
866							}
867
868							####################### ANNOTATION HANDLERS #######################
869
870							# GenBank DBSOURCE line
871							sub _genbank_dbsource {
872	1			1		2	my ($self, $data) = @_;
873	1					2	my $dbsource = $data->{DATA};
874	1					3	my $annotation = $self->annotation_collection;
875							# deal with swissprot dbsources
876							# we could possibly parcel these out to subhandlers...
877	1	50				3	if( $dbsource =~ s/(UniProt(?:KB)\|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
878	0					0	$annotation->add_Annotation
879							('dblink',
880							Bio::Annotation::DBLink->new
881							(-primary_id => $2,
882							-database => $1,
883							-tagname => 'dblink'));
884	0	0				0	if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
885	0					0	$annotation->add_Annotation
886							('swissprot_dates',
887							Bio::Annotation::SimpleValue->new
888							(-tagname => 'date_created',
889							-value => $1));
890							}
891	0					0	while( $dbsource =~ s/\s*(sequence\|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
892	0					0	$annotation->add_Annotation
893							('swissprot_dates',
894							Bio::Annotation::SimpleValue->new
895							(-tagname => 'date_updated',
896							-value => $1));
897							}
898	0					0	$dbsource =~ s/\n/ /g;
899	0	0				0	if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
		0
900							# will use $i to determine even or odd
901							# for swissprot the accessions are paired
902	0					0	my $i = 0;
903	0					0	for my $dbsrc ( split(/,\s+/,$1) ) {
904	0	0	0			0	if( $dbsrc =~ /(\S+)\.(\d+)/ \|\| $dbsrc =~ /(\S+)/ ) {
905	0					0	my ($id,$version) = ($1,$2);
906	0	0				0	$version ='' unless defined $version;
907	0					0	my $db;
908	0	0				0	if( $id =~ /^\d\S{3}/) {
909	0					0	$db = 'PDB';
910							} else {
911	0	0				0	$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
912							}
913	0					0	$annotation->add_Annotation
914							('dblink',
915							Bio::Annotation::DBLink->new
916							(-primary_id => $id,
917							-version => $version,
918							-database => $db,
919							-tagname => 'dblink'));
920							}
921							}
922							} elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
923							# download screwed up and ncbi didn't put acc in for gi numbers
924	0					0	my $i = 0;
925	0					0	for my $id ( split(/\,\s+/,$1) ) {
926	0					0	my ($acc,$db);
927	0	0				0	if( $id =~ /gi:\s+(\d+)/ ) {
		0
928	0					0	$acc= $1;
929	0	0				0	$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
930							} elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
931	0					0	$acc= $1;
932	0					0	$db = 'PDB';
933							} else {
934	0					0	$acc= $id;
935	0					0	$db = '';
936							}
937	0					0	$annotation->add_Annotation
938							('dblink',
939							Bio::Annotation::DBLink->new
940							(-primary_id => $acc,
941							-database => $db,
942							-tagname => 'dblink'));
943							}
944							} else {
945	0					0	$self->warn("Cannot match $dbsource\n");
946							}
947	0	0				0	if( $dbsource =~ s/xrefs\s+$non\-sequence\s+databases$:\s+
948							((?:\S+,\s+)+\S+)//x ) {
949	0					0	for my $id ( split(/\,\s+/,$1) ) {
950	0					0	my $db;
951							# this is because GenBank dropped the spaces!!!
952							# I'm sure we're not going to get this right
953							##if( $id =~ s/^://i ) {
954							## $db = $1;
955							##}
956	0					0	$db = substr($id,0,index($id,':'));
957	0	0				0	if (! exists $DBSOURCE{ $db }) {
958	0					0	$db = ''; # do we want 'GenBank' here?
959							}
960	0					0	$id = substr($id,index($id,':')+1);
961	0					0	$annotation->add_Annotation
962							('dblink',Bio::Annotation::DBLink->new
963							(-primary_id => $id,
964							-database => $db,
965							-tagname => 'dblink'));
966							}
967							}
968							} else {
969	1	50				5	if( $dbsource =~ /^(\S?):?\saccession\s+(\S+)\.(\d+)/ ) {
		0
970	1					3	my ($db,$id,$version) = ($1,$2,$3);
971	1		50			12	$annotation->add_Annotation
972							('dblink',
973							Bio::Annotation::DBLink->new
974							(-primary_id => $id,
975							-version => $version,
976							-database => $db \|\| 'GenBank',
977							-tagname => 'dblink'));
978							} elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
979	0					0	my ($id, $db, $version);
980	0	0				0	if ($2 eq ':') {
981	0					0	($db, $id) = ($1, $3);
982							} else {
983	0					0	($db, $id, $version) = ('GenBank', $1, $3);
984							}
985	0					0	$annotation->add_Annotation('dblink',
986							Bio::Annotation::DBLink->new(
987							-primary_id => $id,
988							-version => $version,
989							-database => $db,
990							-tagname => 'dblink')
991							);
992							} else {
993	0					0	$self->warn("Unrecognized DBSOURCE data: $dbsource\n");
994							}
995							}
996							}
997
998							# EMBL DR lines
999							# UniProt/SwissProt DR lines
1000							sub _generic_dbsource {
1001	20			20		61	my ($self, $data) = @_;
1002							#$self->debug(Dumper($data));
1003	20					131	while ($data->{DATA} =~ m{([^\n]+)}og) {
1004	351					710	my $dblink = $1;
1005	351					919	$dblink =~ s{\.$}{};
1006	351					338	my $link;
1007	351					744	my @linkdata = split '; ',$dblink;
1008	351	50				1319	if ( $dblink =~ m{([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?}) {
1009							#if ( $dblink =~ m{([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?}) {
1010	351					695	my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011	351					1247	$link = Bio::Annotation::DBLink->new(-database => $databse,
1012							-primary_id => $prim_id,
1013							-optional_id => $sec_id);
1014							} else {
1015	0					0	$self->warn("No match for $dblink");
1016							}
1017	351					635	$self->annotation_collection->add_Annotation('dblink', $link);
1018							}
1019							}
1020
1021
1022							# GenBank REFERENCE and related lines
1023							# EMBL R* lines
1024							# UniProt/SwissProt R* lines
1025							sub _generic_reference {
1026	301			301		281	my ($self, $data) = @_;
1027	301					601	my $seqformat = $self->format;
1028	301					293	my ($start, $end);
1029							# get these in EMBL/Swiss
1030	301	100				526	if ($data->{CROSSREF}) {
1031	128					805	while ($data->{CROSSREF} =~ m{(pubmed\|doi\|medline)(?:=\|;\s+)(\S+)}oig) {
1032	199					473	my ($db, $ref) = (uc $1, $2);
1033	199					489	$ref =~ s{[;.]+$}{};
1034	199					666	$data->{$db} = $ref;
1035							}
1036							}
1037							# run some cleanup for swissprot
1038	301	100				716	if ($seqformat eq 'swiss') {
1039	109					140	for my $val (values %{ $data }) {
	109					300
1040	959					1055	$val =~ s{;$}{};
1041	959					980	$val =~ s{(\w-)\s}{$1};
1042							}
1043							}
1044	301	100				525	if ( $data->{POSITION} ) {
1045	127	100				507	if ($seqformat eq 'embl') {
		100
1046	18					207	($start, $end) = split '-', $data->{POSITION},2;
1047							} elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048	23					56	($start, $end) = ($1, $2);
1049							}
1050							}
1051	301	100				883	if ($data->{DATA} =~ m{^\d+\s+$[a-z]+\s+(\d+)\s+to\s+(\d+)$}xmso) {
1052	59					158	($start, $end) = ($1, $2);
1053							}
1054							my $ref = Bio::Annotation::Reference->new(
1055							-comment => $data->{REMARK},
1056							-location => $data->{JOURNAL},
1057							-pubmed => $data->{PUBMED},
1058							-consortium => $data->{CONSRTM},
1059							-title => $data->{TITLE},
1060							-authors => $data->{AUTHORS},
1061							-medline => $data->{MEDLINE},
1062							-doi => $data->{DOI},
1063							-rp => $data->{POSITION}, # JIC...
1064	301					3419	-start => $start,
1065							-end => $end,
1066							);
1067	301	100				1418	if ($data->{DATA} =~ m{^\d+\s+$(.*)$}xmso) {
1068	59					177	$ref->gb_reference($1);
1069							}
1070	301					690	$self->annotation_collection->add_Annotation('reference', $ref);
1071							}
1072
1073							# GenBank COMMENT lines
1074							# EMBL CC lines
1075							# UniProt/SwissProt CC lines
1076							sub _generic_comment {
1077	43			43		76	my ($self, $data) = @_;
1078							$self->annotation_collection->add_Annotation('comment',
1079	43					122	Bio::Annotation::Comment->new( -text => $data->{DATA} ));
1080							}
1081
1082							####################### SEQFEATURE HANDLER #######################
1083
1084							# GenBank Feature Table
1085							sub _generic_seqfeatures {
1086	848			848		889	my ($self, $data) = @_;
1087	848	100				1771	return if $data->{FEATURE_KEY} eq 'FEATURES';
1088	818					1021	my $primary_tag = $data->{FEATURE_KEY};
1089
1090							# grab the NCBI taxon ID from the source SF
1091	818	100	66			1458	if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092	36	100	66			332	if ( $self->{'_params'}->{'-species'} &&
1093							$data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094	35					164	$self->{'_params'}->{'-species'}->ncbi_taxid($1);
1095							}
1096							}
1097	818					1526	my $source = $self->format;
1098
1099	818					851	my $seqid = ${ $self->get_params('accession_number') }{'accession_number'};
	818					1519
1100
1101	818					1097	my $loc;
1102	818					1048	eval {
1103	818					3152	$loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1104							};
1105	818	50				1810	if(! $loc) {
1106							$self->warn("exception while parsing location line [" .
1107							$data->{'LOCATION'} .
1108							"] in reading $source, ignoring feature " .
1109	0					0	$data->{'primary_tag'}.
1110							" (seqid=" . $seqid . "): " . $@);
1111	0					0	return;
1112							}
1113	818	50	33			1652	if($seqid && (! $loc->is_remote())) {
1114	0					0	$loc->seq_id($seqid); # propagates if it is a split location
1115							}
1116	818					2697	my $sf = Bio::SeqFeature::Generic->direct_new();
1117	818					1872	$sf->location($loc);
1118	818					2030	$sf->primary_tag($primary_tag);
1119	818					1697	$sf->seq_id($seqid);
1120	818					1598	$sf->source_tag($source);
1121	818					1207	delete $data->{'FEATURE_KEY'};
1122	818					905	delete $data->{'LOCATION'};
1123	818					973	delete $data->{'NAME'};
1124	818					850	delete $data->{'DATA'};
1125	818					2490	$sf->set_attributes(-tag => $data);
1126	818					1040	push @{ $self->{'_params'}->{'-features'} }, $sf;
	818					3440
1127							}
1128
1129							####################### ODDS AND ENDS #######################
1130
1131							# Those things that don't fit anywhere else. If a specific name
1132							# maps to the below table, that class and method are used, otherwise
1133							# it goes into a SimpleValue (I think this is a good argument for why
1134							# we need a generic mechanism for storing annotation)
1135
1136							sub _generic_simplevalue {
1137	16			16		29	my ($self, $data) = @_;
1138							$self->annotation_collection->add_Annotation(
1139							Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
1140							-value => $data->{DATA})
1141	16					61	);
1142							}
1143
1144				15	0		sub noop {}
1145
1146							sub _debug {
1147	0			0			my ($self, $data) = @_;
1148	0						$self->debug(Dumper($data));
1149							}
1150
1151
1152							1;