File Coverage

Bio/SeqIO/Handler/GenericRichSeqHandler.pm

Criterion	Covered	Total	%
statement	324	411	78.8
branch	120	204	58.8
condition	36	61	59.0
subroutine	46	48	95.8
pod	11	12	91.6
total	537	736	72.9

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
3							#
4							# Please direct questions and support issues to <bioperl-l@bioperl.org>
5							#
6							# Cared for by Chris Fields
7							#
8							# Copyright Chris Fields
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
17							data handler for GenBank/EMBL/UniProt (and other) sequence data
18
19							=head1 SYNOPSIS
20
21							# MyHandler is a GenericRichSeqHandler object.
22							# inside a parser (driver) constructor....
23
24							$self->seq_handler($handler \|\| MyHandler->new(-format => 'genbank'));
25
26							# in next_seq() in driver...
27
28							$hobj = $self->seqhandler();
29
30							# roll data up into hashref chunks, pass off into Handler for processing...
31
32							$hobj->data_handler($data);
33
34							# or retrieve Handler methods and pass data directly to Handler methods...
35
36							my $hmeth = $hobj->handler_methods;
37
38							if ($hmeth->{ $data->{NAME} }) {
39							my $mth = $hmeth->{ $data->{NAME} };
40							$hobj->$mth($data);
41							}
42
43							=head1 DESCRIPTION
44
45							This is an experimental implementation of a sequence-based HandlerBaseI parser
46							and may change over time. It is possible (nay, likely) that the way handler
47							methods are set up will change over development to allow more flexibility.
48							Release pumpkins, please do not add this to a release until the API has settled.
49							It is also likely that write_seq() will not work properly for some data.
50
51							Standard Developer caveats:
52
53							Do not use for production purposes.
54							Not responsible for destroying (your data\|computer\|world).
55							Do not stare directly at GenericRichSeqHandler.
56							If GenericRichSeqHandler glows, back slowly away and call for help.
57
58							Consider yourself warned!
59
60							This class acts as a demonstration on how to handle similar data chunks derived
61							from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
62							using similar (or the same) handler methods.
63
64							The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
65							t/swiss.t yet all use the same handler methods (the collected tests for handlers
66							can be found in t/Handler.t). Some tweaking of the methods themselves is
67							probably in order over the long run to ensure that data is consistently handled
68							for each parser. Round-trip tests are probably in order here...
69
70							Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
71							bypassing of data based on builder slots has been implemented (yet); this is
72							planned in the near future.
73
74							As a reminder: this is the current Annotation data chunk (via Data::Dumper):
75
76							$VAR1 = {
77							'NAME' => 'REFERENCE',
78							'DATA' => '1 (bases 1 to 10001)',
79							'AUTHORS' => 'International Human Genome Sequencing Consortium.',
80							'TITLE' => 'The DNA sequence of Homo sapiens',
81							'JOURNAL' => 'Unpublished (2003)'
82							};
83							...
84
85							This is the current SeqFeature data chunk (again via Data::Dumper):
86
87							$VAR1 = {
88							'mol_type' => 'genomic DNA',
89							'LOCATION' => '<1..>10001',
90							'NAME' => 'FEATURES',
91							'FEATURE_KEY' => 'source',
92							'note' => 'Accession AL451081 sequenced by The Sanger Centre',
93							'db_xref' => 'taxon:9606',
94							'clone' => 'RP11-302I18',
95							'organism' => 'Homo sapiens'
96							};
97
98							=head1 FEEDBACK
99
100							=head2 Mailing Lists
101
102							User feedback is an integral part of the evolution of this and other
103							Bioperl modules. Send your comments and suggestions preferably to one
104							of the Bioperl mailing lists. Your participation is much appreciated.
105
106							bioperl-l@bioperl.org - General discussion
107							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108
109							=head2 Support
110
111							Please direct usage questions or support issues to the mailing list:
112
113							I<bioperl-l@bioperl.org>
114
115							rather than to the module maintainer directly. Many experienced and
116							responsive experts will be able to look at the problem and quickly
117							address it. Please include a thorough description of the problem
118							with code and data examples if at all possible.
119
120							=head2 Reporting Bugs
121
122							Report bugs to the Bioperl bug tracking system to help us keep track
123							of the bugs and their resolution. Bug reports can be submitted via the
124							web:
125
126							https://github.com/bioperl/bioperl-live/issues
127
128							=head1 AUTHOR - Chris Fields
129
130							Email cjfields at bioperl dot org
131
132							=head1 APPENDIX
133
134							The rest of the documentation details each of the object methods. Internal
135							methods are usually preceded with a _
136
137							=cut
138
139							# Let the code begin...
140
141							package Bio::SeqIO::Handler::GenericRichSeqHandler;
142	1			1		3	use strict;
	1					1
	1					23
143	1			1		3	use warnings;
	1					0
	1					18
144
145	1			1		653	use Bio::SeqIO::FTHelper;
	1					2
	1					21
146	1			1		5	use Bio::Annotation::Collection;
	1					1
	1					16
147	1			1		716	use Bio::Annotation::DBLink;
	1					2
	1					19
148	1			1		656	use Bio::Annotation::Comment;
	1					2
	1					30
149	1			1		724	use Bio::Annotation::Reference;
	1					3
	1					23
150	1			1		5	use Bio::Annotation::Collection;
	1					1
	1					15
151	1			1		3	use Bio::Annotation::SimpleValue;
	1					0
	1					14
152	1			1		717	use Bio::Annotation::TagTree;
	1					1
	1					27
153	1			1		4	use Bio::SeqFeature::Generic;
	1					1
	1					14
154	1			1		732	use Bio::Species;
	1					1
	1					22
155	1			1		4	use Bio::Taxon;
	1					2
	1					12
156	1			1		3	use Bio::DB::Taxonomy;
	1					2
	1					13
157	1			1		3	use Bio::Factory::FTLocationFactory;
	1					1
	1					17
158	1			1		3	use Data::Dumper;
	1					1
	1					43
159
160	1			1		3	use base qw(Bio::Root::Root Bio::HandlerBaseI);
	1					1
	1					731
161
162							my %HANDLERS = (
163							'genbank' => {
164							'LOCUS' => \&_genbank_locus,
165							'DEFINITION' => \&_generic_description,
166							'ACCESSION' => \&_generic_accession,
167							'VERSION' => \&_generic_version,
168							'KEYWORDS' => \&_generic_keywords,
169							'DBSOURCE' => \&_genbank_dbsource,
170							'DBLINK' => \&_genbank_dbsource,
171							'SOURCE' => \&_generic_species,
172							'REFERENCE' => \&_generic_reference,
173							'COMMENT' => \&_generic_comment,
174							'FEATURES' => \&_generic_seqfeatures,
175							'BASE' => \&noop, # this is generated from scratch
176							'ORIGIN' => \&_generic_seq,
177							# handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
178							'_DEFAULT_' => \&_generic_simplevalue,
179							},
180							'embl' => {
181							'ID' => \&_embl_id,
182							'DT' => \&_embl_date,
183							'DR' => \&_generic_dbsource,
184							'SV' => \&_generic_version,
185							'RN' => \&_generic_reference,
186							'KW' => \&_generic_keywords,
187							'DE' => \&_generic_description,
188							'AC' => \&_generic_accession,
189							#'AH' => \&noop, # TPA data not dealt with yet...
190							#'AS' => \&noop,
191							'SQ' => \&_generic_seq,
192							'OS' => \&_generic_species,
193							'CC' => \&_generic_comment,
194							'FT' => \&_generic_seqfeatures,
195							# handles anything else (WGS, TPA, ANN...)
196							'_DEFAULT_' => \&_generic_simplevalue,
197							},
198							'swiss' => {
199							'ID' => \&_swiss_id,
200							'DT' => \&_swiss_date,
201							'GN' => \&_swiss_genename,
202							'DR' => \&_generic_dbsource,
203							'RN' => \&_generic_reference,
204							'KW' => \&_generic_keywords,
205							'DE' => \&_generic_description,
206							'AC' => \&_generic_accession,
207							'SQ' => \&_generic_seq,
208							'OS' => \&_generic_species,
209							'CC' => \&_generic_comment,
210							'FT' => \&_generic_seqfeatures,
211							# handles anything else, though I don't know what...
212							'_DEFAULT_' => \&_generic_simplevalue,
213							},
214							);
215
216							# can we do this generically? Seems like a lot of trouble...
217							my %DBSOURCE = map {$_ => 1} qw(
218							EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
219							TIGR GO InterPro Pfam PROSITE SGD GermOnline
220							HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
221							H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
222							MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
223							UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
224							PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
225							Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
226							TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
227							PhotoList Gramene WormBase WormPep Genew ZFIN
228							PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
229							swissprot GenBank GenPept REFSEQ embl PDB UniProtKB);
230
231							my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
232
233							our %VALID_ALPHABET = (
234							'bp' => 'dna',
235							'aa' => 'protein',
236							'rc' => '' # rc = release candidate; file has no sequences
237							);
238
239							=head2 new
240
241							Title : new
242							Usage :
243							Function:
244							Returns :
245							Args : -format Sequence format to be mapped for handler methods
246							-builder Bio::Seq::SeqBuilder object (normally defined in
247							SequenceStreamI object implementation constructor)
248							Throws : On undefined '-format' sequence format parameter
249							Note : Still under heavy development
250
251							=cut
252
253							sub new {
254	42			42	1	89	my ($class, @args) = @_;
255	42					108	my $self = $class->SUPER::new(@args);
256	42					107	$self = {@args};
257	42					110	bless $self,$class;
258	42					160	my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
259	42	50				112	$self->throw("Must define sequence record format") if !$format;
260	42					109	$self->format($format);
261	42					111	$self->handler_methods();
262	42	50				129	$builder && $self->seqbuilder($builder);
263	42					113	$self->location_factory();
264	42					155	return $self;
265							}
266
267							=head1 L<Bio::HandlerBaseI> implementing methods
268
269							=head2 handler_methods
270
271							Title : handler_methods
272							Usage : $handler->handler_methods('GenBank')
273							%handlers = $handler->handler_methods();
274							Function: Retrieve the handler methods used for the current format() in
275							the handler. This assumes the handler methods are already
276							described in the HandlerI-implementing class.
277							Returns : a hash reference with the data type handled and the code ref
278							associated with it.
279							Args : [optional] String representing the sequence format. If set here
280							this will also set sequence_format()
281							Throws : On unimplemented sequence format in %HANDLERS
282
283							=cut
284
285							sub handler_methods {
286	75			75	1	81	my $self = shift;
287	75	100				164	if (!($self->{'handlers'})) {
288							$self->throw("No handlers defined for seqformat ",$self->format)
289	42	50				87	unless exists $HANDLERS{$self->format};
290	42					84	$self->{'handlers'} = $HANDLERS{$self->format};
291							}
292	75					91	return ($self->{'handlers'});
293							}
294
295							=head2 data_handler
296
297							Title : data_handler
298							Usage : $handler->data_handler($data)
299							Function: Centralized method which accepts all data chunks, then distributes
300							to the appropriate methods for processing based on the chunk name
301							from within the HandlerBaseI object.
302
303							One can also use
304							Returns : None
305							Args : a hash ref containing a data chunk.
306
307							=cut
308
309							sub data_handler {
310	1646			1646	1	1582	my ($self, $data) = @_;
311	1646		33			2594	my $nm = $data->{NAME} \|\| $self->throw("No name tag defined!");
312
313							# this should handle data on the fly w/o caching; any caching should be
314							# done in the driver!
315							my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
316	1646	50				3250	(exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
		100
317							undef;
318	1646	50				2361	if (!$method) {
319	0					0	$self->debug("No handler defined for $nm\n");
320	0					0	return;
321							};
322	1646					2501	$self->$method($data);
323							}
324
325							=head2 reset_parameters
326
327							Title : reset_parameters
328							Usage : $handler->reset_parameters()
329							Function: Resets the internal cache of data (normally object parameters for
330							a builder or factory)
331							Returns : None
332							Args : None
333
334							=cut
335
336							sub reset_parameters {
337	58			58	1	82	my $self = shift;
338	58					89	$self->{'_params'} = undef;
339							}
340
341							=head2 format
342
343							Title : format
344							Usage : $handler->format('GenBank')
345							Function: Get/Set the format for the report/record being parsed. This can be
346							used to set handlers in classes which are capable of processing
347							similar data chunks from multiple driver modules.
348							Returns : String with the sequence format
349							Args : [optional] String with the sequence format
350							Note : The format may be used to set the handlers (as in the
351							current GenericRichSeqHandler implementation)
352
353							=cut
354
355							sub format {
356	1301			1301	1	1070	my $self = shift;
357	1301	100				2059	return $self->{'_seqformat'} = lc shift if @_;
358	1259					1858	return $self->{'_seqformat'};
359							}
360
361							=head2 get_params
362
363							Title : get_params
364							Usage : $handler->get_params('-species')
365							Function: Convenience method used to retrieve the specified
366							parameters from the internal parameter cache
367							Returns : Hash ref containing parameters requested and data as
368							key-value pairs. Note that some parameter values may be
369							objects, arrays, etc.
370							Args : List (array) representing the parameters requested
371
372							=cut
373
374							sub get_params {
375	818			818	1	891	my ($self, @ids) = @_;
376	818					583	my %data;
377	818					901	for my $id (@ids) {
378	818	50				1472	if (!index($id, '-')==0) {
379	818					1413	$id = '-'.$id ;
380							}
381	818	100				2820	$data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
382							}
383	818					1426	return \%data;
384							}
385
386							=head2 set_params
387
388							Title : set_params
389							Usage : $handler->set_params({'-species' => $species})
390							Function: Convenience method used to set specific parameters
391							Returns : None
392							Args : Hash ref containing the data to be passed as key-value pairs
393
394							=cut
395
396							sub set_params {
397	0			0	1	0	shift->throw('Not implemented yet!');
398							}
399
400							=head1 Methods unique to this implementation
401
402							=head2 seqbuilder
403
404							Title : seqbuilder
405							Usage :
406							Function:
407							Returns :
408							Args :
409							Throws :
410							Note :
411
412							=cut
413
414							sub seqbuilder {
415	104			104	1	118	my $self = shift;
416	104	100				224	return $self->{'_seqbuilder'} = shift if @_;
417	62					99	return $self->{'_seqbuilder'};
418							}
419
420							=head2 build_sequence
421
422							Title : build_sequence
423							Usage :
424							Function:
425							Returns :
426							Args :
427							Throws :
428							Note :
429
430							=cut
431
432							sub build_sequence {
433	62			62	1	87	my $self = shift;
434	62					126	my $builder = $self->seqbuilder();
435	62					58	my $seq;
436	62	100				119	if (defined($self->{'_params'})) {
437	58					69	$builder->add_slot_value(%{ $self->{'_params'} });
	58					448
438	58					172	$seq = $builder->make_object();
439	58					131	$self->reset_parameters;
440							}
441	62	100				639	return $seq if $seq;
442	4					14	return 0;
443							}
444
445							=head2 location_factory
446
447							Title : location_factory
448							Usage :
449							Function:
450							Returns :
451							Args :
452							Throws :
453							Note :
454
455							=cut
456
457							sub location_factory {
458	42			42	1	49	my ($self, $factory) = @_;
459	42	50				145	if ($factory) {
		50
460	0	0	0			0	$self->throw("Must have a Bio::Factory::LocationFactoryI when ".
461							"explicitly setting factory()") unless
462							(ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
463	0					0	$self->{'_locfactory'} = $factory;
464							} elsif (!defined($self->{'_locfactory'})) {
465	42					180	$self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new()
466							}
467	42					54	return $self->{'_locfactory'};
468							}
469
470							=head2 annotation_collection
471
472							Title : annotation_collection
473							Usage :
474							Function:
475							Returns :
476							Args :
477							Throws :
478							Note :
479
480							=cut
481
482							sub annotation_collection {
483	746			746	1	644	my ($self, $coll) = @_;
484	746	50				1611	if ($coll) {
		100
485	0	0	0			0	$self->throw("Must have Bio::AnnotationCollectionI ".
486							"when explicitly setting collection()")
487							unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
488	0					0	$self->{'_params'}->{'-annotation'} = $coll;
489							} elsif (!exists($self->{'_params'}->{'-annotation'})) {
490	57					276	$self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new()
491							}
492	746					2205	return $self->{'_params'}->{'-annotation'};
493							}
494
495							####################### SEQUENCE HANDLERS #######################
496
497							# any sequence data
498							sub _generic_seq {
499	53			53		72	my ($self, $data) = @_;
500	53					313	$self->{'_params'}->{'-seq'} = $data->{DATA};
501							}
502
503							####################### RAW DATA HANDLERS #######################
504
505							# GenBank LOCUS line
506							sub _genbank_locus {
507	31			31		33	my ($self, $data) = @_;
508	31					189	my (@tokens) = split m{\s+}, $data->{DATA};
509	31					45	my $display_id = shift @tokens;
510	31					86	$self->{'_params'}->{'-display_id'} = $display_id;
511	31					37	my $seqlength = shift @tokens;
512	31	50				62	if (exists $VALID_ALPHABET{$seqlength}) {
513							# moved one token too far. No locus name?
514							$self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
515	0					0	"] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
516	0					0	$self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
517	0					0	$self->{'_params'}->{'-display_id'} = 'unknown';
518							# add token back...
519	0					0	unshift @tokens, $seqlength;
520							} else {
521	31					52	$self->{'_params'}->{'-length'} = $seqlength;
522							}
523	31					39	my $alphabet = lc(shift @tokens);
524							$self->{'_params'}->{'-alphabet'} =
525	31	50				78	(exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
526							$self->warn("Unknown alphabet: $alphabet");
527	31	50	66			103	if (($self->{'_params'}->{'-alphabet'} eq 'dna') \|\| (@tokens > 2)) {
528	31					58	$self->{'_params'}->{'-molecule'} = shift(@tokens);
529	31					43	my $circ = shift(@tokens);
530	31	100				54	if ($circ eq 'circular') {
531	2					6	$self->{'_params'}->{'-is_circular'} = 1;
532	2					6	$self->{'_params'}->{'-division'} = shift(@tokens);
533							} else {
534							# 'linear' or 'circular' may actually be omitted altogether
535	29	100				65	$self->{'_params'}->{'-division'} =
536							(CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
537							}
538							} else {
539	0	0				0	$self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
540	0					0	$self->{'_params'}->{'-division'} = shift(@tokens);
541							}
542	31					64	my $date = join(' ', @tokens);
543							# maybe use Date::Time for dates?
544	31	100	66			246	if($date && $date =~ s{\s((\d{1,2})-(\w{3})-(\d{2,4})).}{$1}) {
545
546	30	50				55	if( length($date) < 11 ) {
547							# improperly formatted date
548							# But we'll be nice and fix it for them
549	0					0	my ($d,$m,$y) = ($2,$3,$4);
550	0	0				0	if( length($d) == 1 ) {
551	0					0	$d = "0$d";
552							}
553							# guess the century here
554	0	0				0	if( length($y) == 2 ) {
555	0	0				0	if( $y > 60 ) { # arbitrarily guess that '60' means 1960
556	0					0	$y = "19$y";
557							} else {
558	0					0	$y = "20$y";
559							}
560	0					0	$self->warn("Date was malformed, guessing the century for $date to be $y\n");
561							}
562	0					0	$self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
563							} else {
564	30					94	$self->{'_params'}->{'-dates'} = [$date];
565							}
566							}
567							}
568
569							# EMBL ID line
570							sub _embl_id {
571	10			10		11	my ($self, $data) = @_;
572	10					13	my $alphabet;
573	10					14	my ($name, $sv, $topology, $mol, $div);
574	10					17	my $line = $data->{DATA};
575							#$self->debug("$line\n");
576	10					18	my ($idtype) = $line =~ tr/;/;/;
577	10	100				27	if ( $idtype == 6) { # New style headers contain exactly six semicolons.
		100
578							# New style header (EMBL Release >= 87, after June 2006)
579	1					3	my $topology;
580							my $sv;
581
582							# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
583							# This regexp comes from the new2old.pl conversion script, from EBI
584	1	50				7	if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
585	1					6	($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
586							} else {
587	0					0	$self->throw("Unrecognized EMBL ID line:[$line]");
588							}
589	1	50				3	if (defined($sv)) {
590	1					3	$self->{'_params'}->{'-seq_version'} = $sv;
591	1					3	$self->{'_params'}->{'-version'} = $sv;
592							}
593
594	1	50				2	if ($topology eq "circular") {
595	0					0	$self->{'_params'}->{'-is_circular'} = 1;
596							}
597
598	1	50				3	if (defined $mol ) {
599	1	50				4	if ($mol =~ /DNA/) {
		50
		0
600	0					0	$alphabet='dna';
601							}
602							elsif ($mol =~ /RNA/) {
603	1					2	$alphabet='rna';
604							}
605							elsif ($mol =~ /AA/) {
606	0					0	$alphabet='protein';
607							}
608							}
609							} elsif ($idtype) { # has internal ';'
610							# Old style header (EMBL Release < 87, before June 2006)
611	8	50				55	if ($line =~ m{^(\S+)[^;];\s+(\S+)[^;];\s+(\S+)[^;]*;}) {
612	8					30	($name, $mol, $div) = ($1, $2, $3);
613							#$self->debug("[$name][$mol][$div]");
614							}
615
616	8	50				15	if($mol) {
617	8	50				20	if ( $mol =~ m{circular} ) {
618	0					0	$self->{'_params'}->{'-is_circular'} = 1;
619	0					0	$mol =~ s{circular }{};
620							}
621	8	50				19	if (defined $mol ) {
622	8	100				26	if ($mol =~ /DNA/) {
		50
		0
623	7					13	$alphabet='dna';
624							}
625							elsif ($mol =~ /RNA/) {
626	1					2	$alphabet='rna';
627							}
628							elsif ($mol =~ /AA/) {
629	0					0	$alphabet='protein';
630							}
631							}
632							}
633							} else {
634	1					2	$name = $data->{DATA};
635							}
636	10	50	33			52	unless( defined $name && length($name) ) {
637	0					0	$name = "unknown_id";
638							}
639	10					36	$self->{'_params'}->{'-display_id'} = $name;
640	10					16	$self->{'_params'}->{'-alphabet'} = $alphabet;
641	10	100				26	$self->{'_params'}->{'-division'} = $div if $div;
642	10	100				40	$self->{'_params'}->{'-molecule'} = $mol if $mol;
643							}
644
645							# UniProt/SwissProt ID line
646							sub _swiss_id {
647	17			17		19	my ($self, $data) = @_;
648	17					18	my ($name, $seq_div);
649	17	50				112	if($data->{DATA} =~ m{^
650							(\S+) \s+ # $1 entryname
651							([^\s;]+); \s+ # $2 DataClass
652							(?:PRT;)? \s+ # Molecule Type (optional)
653							[0-9]+[ ]AA \. # Sequencelength (capture?)
654							$
655							}ox ) {
656	17					47	($name, $seq_div) = ($1, $2);
657	17	50	100			127	$self->{'_params'}->{'-namespace'} =
		100	66
658							($seq_div eq 'Reviewed' \|\| $seq_div eq 'STANDARD') ? 'Swiss-Prot' :
659							($seq_div eq 'Unreviewed' \|\| $seq_div eq 'PRELIMINARY') ? 'TrEMBL' :
660							$seq_div;
661							# we shouldn't be setting the division, but for now...
662	17					52	my ($junk, $division) = split q(_), $name;
663	17					44	$self->{'_params'}->{'-division'} = $division;
664	17					28	$self->{'_params'}->{'-alphabet'} = 'protein';
665							# this is important to have the id for display in e.g. FTHelper, otherwise
666							# you won't know which entry caused an error
667	17					43	$self->{'_params'}->{'-display_id'} = $name;
668							} else {
669	0					0	$self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]");
670							}
671							}
672
673							# UniProt/SwissProt GN line
674							sub _swiss_genename {
675	17			17		22	my ($self, $data) = @_;
676							#$self->debug(Dumper($data));
677	17					38	my $genename = $data->{DATA};
678	17					17	my $gn;
679	17	50				35	if ($genename) {
680	17					19	my @stags;
681	17	100				56	if ($genename =~ /\w=\w/) {
682							# new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
683	10					35	for my $n (split(m{\s+and\s+},$genename)) {
684	12					10	my @genenames;
685	12					50	for my $section (split(m{\s;\s},$n)) {
686	15					43	my ($tag, $rest) = split("=",$section);
687	15		50			35	$rest \|\|= '';
688	15					49	for my $val (split(m{\s,\s},$rest)) {
689	19					47	push @genenames, [$tag => $val];
690							}
691							}
692	12					37	push @stags, ['gene_name' => \@genenames];
693							}
694							} else {
695							# old format
696	7					22	for my $section (split(/ AND /, $genename)) {
697	9					9	my @genenames;
698	9					30	$section =~ s/[\.]//g;
699	9					25	my @names = split(m{\s+OR\s+}, $section);
700	9					37	push @genenames, ['Name' => shift @names];
701	9					16	push @genenames, map {['Synonyms' => $_]} @names;
	10					14
702	9					45	push @stags, ['gene_name' => \@genenames]
703							}
704							} #use Data::Dumper; print Dumper $gn, $genename;# exit;
705	17					129	my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
706							-value => ['gene_names' => \@stags]);
707	17					41	$self->annotation_collection->add_Annotation('gene_name', $gn);
708							}
709							}
710
711							# GenBank VERSION line
712							# old EMBL SV line (now obsolete)
713							# UniProt/SwissProt?
714							sub _generic_version {
715	28			28		33	my ($self, $data) = @_;
716	28					83	my ($acc,$gi) = split(' ',$data->{DATA});
717	28	100				106	if($acc =~ m{^\w+\.(\d+)}xmso) {
718	27					64	$self->{'_params'}->{'-version'} = $1;
719	27					55	$self->{'_params'}->{'-seq_version'} = $1;
720							}
721	28	100	66			124	if($gi && (index($gi,"GI:") == 0)) {
722	24					80	$self->{'_params'}->{'-primary_id'} = substr($gi,3);
723							}
724							}
725
726							# EMBL DT lines
727							sub _embl_date {
728	5			5		8	my ($self, $data) = @_;
729	5					35	while ($data->{DATA} =~ m{(\S+)\s$(.*?)$}g) {
730	10					24	my ($date, $version) = ($1, $2);
731	10					11	$date =~ tr{,}{}d; # remove comma if new version
732	10	50				54	if ($version =~ m{$Rel\.\s(\d+),\sCreated$}xmso ) {
		50
733	0					0	my $release = Bio::Annotation::SimpleValue->new(
734							-tagname => 'creation_release',
735							-value => $1
736							);
737	0					0	$self->annotation_collection->add_Annotation($release);
738							} elsif ($version =~ m{$Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)$}xmso ) {
739	0					0	my $release = Bio::Annotation::SimpleValue->new(
740							-tagname => 'update_release',
741							-value => $1
742							);
743	0					0	$self->annotation_collection->add_Annotation($release);
744	0					0	my $update = Bio::Annotation::SimpleValue->new(
745							-tagname => 'update_version',
746							-value => $2
747							);
748	0					0	$self->annotation_collection->add_Annotation($update);
749							}
750	10					10	push @{ $self->{'_params'}->{'-dates'} }, $date;
	10					51
751							}
752							}
753
754							# UniProt/SwissProt DT lines
755							sub _swiss_date {
756	17			17		27	my ($self, $data) = @_;
757							# swissprot
758	17					49	my @dls = split m{\n}, $data->{DATA};
759	17					44	for my $dl (@dls) {
760	51					104	my ($date, $version) = split(' ', $dl, 2);
761	51					75	$date =~ tr{,}{}d; # remove comma if new version
762	51	100	100			369	if ($version =~ m{$Rel\. (\d+), Last sequence update$} \|\| # old
		100	100
763							$version =~ m{sequence version (\d+)\.}) { #new
764	17					96	my $update = Bio::Annotation::SimpleValue->new(
765							-tagname => 'seq_update',
766							-value => $1
767							);
768	17					44	$self->annotation_collection->add_Annotation($update);
769							} elsif ($version =~ m{$Rel\. (\d+), Last annotation update$} \|\| #old
770							$version =~ m{entry version (\d+)\.}) { #new
771	17					58	$self->{'_params'}->{'-version'} = $1;
772	17					28	$self->{'_params'}->{'-seq_version'} = $1;
773							}
774	51					46	push @{ $self->{'_params'}->{'-dates'} }, $date;
	51					146
775							}
776							}
777
778							# GenBank KEYWORDS line
779							# EMBL KW line
780							# UniProt/SwissProt KW line
781							sub _generic_keywords {
782	55			55		73	my ($self, $data) = @_;
783	55					184	$data->{DATA} =~ s{\.$}{};
784	55					366	my @kw = split m{\s\;\s}xo ,$data->{DATA};
785	55					131	$self->{'_params'}->{'-keywords'} = \@kw;
786							}
787
788							# GenBank DEFINITION line
789							# EMBL DE line
790							# UniProt/SwissProt DE line
791							sub _generic_description {
792	57			57		63	my ($self, $data) = @_;
793	57					140	$self->{'_params'}->{'-desc'} = $data->{DATA};
794							}
795
796							# GenBank ACCESSION line
797							# EMBL AC line
798							# UniProt/SwissProt AC line
799							sub _generic_accession {
800	56			56		70	my ($self, $data) = @_;
801	56					231	my @accs = split m{[\s;]+}, $data->{DATA};
802	56					129	$self->{'_params'}->{'-accession_number'} = shift @accs;
803	56	100				170	$self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
804							}
805
806							####################### SPECIES HANDLERS #######################
807
808							# uses Bio::Species
809							# GenBank SOURCE, ORGANISM lines
810							# EMBL O* lines
811							# UniProt/SwissProt O* lines
812							sub _generic_species {
813	56			56		63	my ($self, $data) = @_;
814
815	56					102	my $seqformat = $self->format;
816							# if data is coming in from GenBank parser...
817	56	100	66			364	if ($seqformat eq 'genbank' &&
818							$data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
819	30					110	($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
820							}
821
822							# SwissProt stuff...
823							# hybrid names in swissprot files are no longer valid per integration into
824							# UniProt. Files containing these have been split into separate entries, so
825							# it is probably a good idea to update if one has these lingering around...
826
827	56					67	my $taxid;
828	56	100				128	if ($seqformat eq 'swiss') {
829	17	50				79	if ($data->{DATA} =~ m{^([^,]+)}ox) {
830	17					54	$data->{DATA} = $1;
831							}
832	17	100	66			107	if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
833	16					31	$taxid = $1;
834							}
835							}
836
837							my ($sl, $class, $sci_name) = ($data->{DATA},
838							$data->{CLASSIFICATION},
839	56		100			200	$data->{ORGANISM} \|\| '');
840	56					66	my ($organelle,$abbr_name, $common);
841	56					550	my @class = reverse split m{\s;\s}, $class;
842							# have to treat swiss different from everything else...
843	56	50				527	if ($sl =~ m{^(mitochondrion\|chloroplast\|plastid)? # GenBank format
844							\s(.?)
845							\s(?: $ (.?) $ )?\.?$
846							}xmso ){
847	56					157	($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
848							} else {
849	0					0	$abbr_name = $sl; # nothing caught; this is a backup!
850							}
851							# there is no 'abbreviated name' for EMBL
852	56	100				141	$sci_name = $abbr_name if $seqformat ne 'genbank';
853	56		100			189	$organelle \|\|= '';
854	56		100			119	$common \|\|= '';
855	56	50				93	$sci_name \|\| return;
856	56					81	unshift @class, $sci_name;
857							# no genus/species parsing here; moving to Bio::Taxon-based taxonomy
858	56					267	my $make = Bio::Species->new();
859	56					149	$make->scientific_name($sci_name);
860	56	50				219	$make->classification(@class) if @class > 0;
861	56	100				139	$common && $make->common_name( $common );
862	56	50				185	$abbr_name && $make->name('abbreviated', $abbr_name);
863	56	100				108	$organelle && $make->organelle($organelle);
864	56	100				109	$taxid && $make->ncbi_taxid($taxid);
865	56					281	$self->{'_params'}->{'-species'} = $make;
866							}
867
868							####################### ANNOTATION HANDLERS #######################
869
870							# GenBank DBSOURCE line
871							sub _genbank_dbsource {
872	1			1		2	my ($self, $data) = @_;
873	1					2	my $dbsource = $data->{DATA};
874	1					2	my $annotation = $self->annotation_collection;
875							# deal with swissprot dbsources
876							# we could possibly parcel these out to subhandlers...
877	1	50				4	if( $dbsource =~ s/(UniProt(?:KB)\|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
878	0					0	$annotation->add_Annotation
879							('dblink',
880							Bio::Annotation::DBLink->new
881							(-primary_id => $2,
882							-database => $1,
883							-tagname => 'dblink'));
884	0	0				0	if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
885	0					0	$annotation->add_Annotation
886							('swissprot_dates',
887							Bio::Annotation::SimpleValue->new
888							(-tagname => 'date_created',
889							-value => $1));
890							}
891	0					0	while( $dbsource =~ s/\s*(sequence\|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
892	0					0	$annotation->add_Annotation
893							('swissprot_dates',
894							Bio::Annotation::SimpleValue->new
895							(-tagname => 'date_updated',
896							-value => $1));
897							}
898	0					0	$dbsource =~ s/\n/ /g;
899	0	0				0	if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
		0
900							# will use $i to determine even or odd
901							# for swissprot the accessions are paired
902	0					0	my $i = 0;
903	0					0	for my $dbsrc ( split(/,\s+/,$1) ) {
904	0	0	0			0	if( $dbsrc =~ /(\S+)\.(\d+)/ \|\| $dbsrc =~ /(\S+)/ ) {
905	0					0	my ($id,$version) = ($1,$2);
906	0	0				0	$version ='' unless defined $version;
907	0					0	my $db;
908	0	0				0	if( $id =~ /^\d\S{3}/) {
909	0					0	$db = 'PDB';
910							} else {
911	0	0				0	$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
912							}
913	0					0	$annotation->add_Annotation
914							('dblink',
915							Bio::Annotation::DBLink->new
916							(-primary_id => $id,
917							-version => $version,
918							-database => $db,
919							-tagname => 'dblink'));
920							}
921							}
922							} elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
923							# download screwed up and ncbi didn't put acc in for gi numbers
924	0					0	my $i = 0;
925	0					0	for my $id ( split(/\,\s+/,$1) ) {
926	0					0	my ($acc,$db);
927	0	0				0	if( $id =~ /gi:\s+(\d+)/ ) {
		0
928	0					0	$acc= $1;
929	0	0				0	$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
930							} elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
931	0					0	$acc= $1;
932	0					0	$db = 'PDB';
933							} else {
934	0					0	$acc= $id;
935	0					0	$db = '';
936							}
937	0					0	$annotation->add_Annotation
938							('dblink',
939							Bio::Annotation::DBLink->new
940							(-primary_id => $acc,
941							-database => $db,
942							-tagname => 'dblink'));
943							}
944							} else {
945	0					0	$self->warn("Cannot match $dbsource\n");
946							}
947	0	0				0	if( $dbsource =~ s/xrefs\s+$non\-sequence\s+databases$:\s+
948							((?:\S+,\s+)+\S+)//x ) {
949	0					0	for my $id ( split(/\,\s+/,$1) ) {
950	0					0	my $db;
951							# this is because GenBank dropped the spaces!!!
952							# I'm sure we're not going to get this right
953							##if( $id =~ s/^://i ) {
954							## $db = $1;
955							##}
956	0					0	$db = substr($id,0,index($id,':'));
957	0	0				0	if (! exists $DBSOURCE{ $db }) {
958	0					0	$db = ''; # do we want 'GenBank' here?
959							}
960	0					0	$id = substr($id,index($id,':')+1);
961	0					0	$annotation->add_Annotation
962							('dblink',Bio::Annotation::DBLink->new
963							(-primary_id => $id,
964							-database => $db,
965							-tagname => 'dblink'));
966							}
967							}
968							} else {
969	1	50				6	if( $dbsource =~ /^(\S?):?\saccession\s+(\S+)\.(\d+)/ ) {
		0
970	1					2	my ($db,$id,$version) = ($1,$2,$3);
971	1		50			11	$annotation->add_Annotation
972							('dblink',
973							Bio::Annotation::DBLink->new
974							(-primary_id => $id,
975							-version => $version,
976							-database => $db \|\| 'GenBank',
977							-tagname => 'dblink'));
978							} elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
979	0					0	my ($id, $db, $version);
980	0	0				0	if ($2 eq ':') {
981	0					0	($db, $id) = ($1, $3);
982							} else {
983	0					0	($db, $id, $version) = ('GenBank', $1, $3);
984							}
985	0					0	$annotation->add_Annotation('dblink',
986							Bio::Annotation::DBLink->new(
987							-primary_id => $id,
988							-version => $version,
989							-database => $db,
990							-tagname => 'dblink')
991							);
992							} else {
993	0					0	$self->warn("Unrecognized DBSOURCE data: $dbsource\n");
994							}
995							}
996							}
997
998							# EMBL DR lines
999							# UniProt/SwissProt DR lines
1000							sub _generic_dbsource {
1001	20			20		25	my ($self, $data) = @_;
1002							#$self->debug(Dumper($data));
1003	20					124	while ($data->{DATA} =~ m{([^\n]+)}og) {
1004	351					599	my $dblink = $1;
1005	351					800	$dblink =~ s{\.$}{};
1006	351					308	my $link;
1007	351					649	my @linkdata = split '; ',$dblink;
1008	351	50				1221	if ( $dblink =~ m{([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?}) {
1009							#if ( $dblink =~ m{([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?}) {
1010	351					631	my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011	351					990	$link = Bio::Annotation::DBLink->new(-database => $databse,
1012							-primary_id => $prim_id,
1013							-optional_id => $sec_id);
1014							} else {
1015	0					0	$self->warn("No match for $dblink");
1016							}
1017	351					591	$self->annotation_collection->add_Annotation('dblink', $link);
1018							}
1019							}
1020
1021
1022							# GenBank REFERENCE and related lines
1023							# EMBL R* lines
1024							# UniProt/SwissProt R* lines
1025							sub _generic_reference {
1026	301			301		252	my ($self, $data) = @_;
1027	301					400	my $seqformat = $self->format;
1028	301					262	my ($start, $end);
1029							# get these in EMBL/Swiss
1030	301	100				448	if ($data->{CROSSREF}) {
1031	128					615	while ($data->{CROSSREF} =~ m{(pubmed\|doi\|medline)(?:=\|;\s+)(\S+)}oig) {
1032	199					407	my ($db, $ref) = (uc $1, $2);
1033	199					407	$ref =~ s{[;.]+$}{};
1034	199					548	$data->{$db} = $ref;
1035							}
1036							}
1037							# run some cleanup for swissprot
1038	301	100				457	if ($seqformat eq 'swiss') {
1039	109					144	for my $val (values %{ $data }) {
	109					261
1040	959					983	$val =~ s{;$}{};
1041	959					956	$val =~ s{(\w-)\s}{$1};
1042							}
1043							}
1044	301	100				430	if ( $data->{POSITION} ) {
1045	127	100				421	if ($seqformat eq 'embl') {
		100
1046	18					53	($start, $end) = split '-', $data->{POSITION},2;
1047							} elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048	23					42	($start, $end) = ($1, $2);
1049							}
1050							}
1051	301	100				649	if ($data->{DATA} =~ m{^\d+\s+$[a-z]+\s+(\d+)\s+to\s+(\d+)$}xmso) {
1052	59					98	($start, $end) = ($1, $2);
1053							}
1054							my $ref = Bio::Annotation::Reference->new(
1055							-comment => $data->{REMARK},
1056							-location => $data->{JOURNAL},
1057							-pubmed => $data->{PUBMED},
1058							-consortium => $data->{CONSRTM},
1059							-title => $data->{TITLE},
1060							-authors => $data->{AUTHORS},
1061							-medline => $data->{MEDLINE},
1062							-doi => $data->{DOI},
1063							-rp => $data->{POSITION}, # JIC...
1064	301					2455	-start => $start,
1065							-end => $end,
1066							);
1067	301	100				1093	if ($data->{DATA} =~ m{^\d+\s+$(.*)$}xmso) {
1068	59					147	$ref->gb_reference($1);
1069							}
1070	301					503	$self->annotation_collection->add_Annotation('reference', $ref);
1071							}
1072
1073							# GenBank COMMENT lines
1074							# EMBL CC lines
1075							# UniProt/SwissProt CC lines
1076							sub _generic_comment {
1077	43			43		53	my ($self, $data) = @_;
1078							$self->annotation_collection->add_Annotation('comment',
1079	43					84	Bio::Annotation::Comment->new( -text => $data->{DATA} ));
1080							}
1081
1082							####################### SEQFEATURE HANDLER #######################
1083
1084							# GenBank Feature Table
1085							sub _generic_seqfeatures {
1086	848			848		690	my ($self, $data) = @_;
1087	848	100				1377	return if $data->{FEATURE_KEY} eq 'FEATURES';
1088	818					720	my $primary_tag = $data->{FEATURE_KEY};
1089
1090							# grab the NCBI taxon ID from the source SF
1091	818	100	66			1310	if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092	36	100	66			220	if ( $self->{'_params'}->{'-species'} &&
1093							$data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094	35					112	$self->{'_params'}->{'-species'}->ncbi_taxid($1);
1095							}
1096							}
1097	818					1161	my $source = $self->format;
1098
1099	818					654	my $seqid = ${ $self->get_params('accession_number') }{'accession_number'};
	818					1026
1100
1101	818					878	my $loc;
1102	818					838	eval {
1103	818					2061	$loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1104							};
1105	818	50				1399	if(! $loc) {
1106							$self->warn("exception while parsing location line [" .
1107							$data->{'LOCATION'} .
1108							"] in reading $source, ignoring feature " .
1109	0					0	$data->{'primary_tag'}.
1110							" (seqid=" . $seqid . "): " . $@);
1111	0					0	return;
1112							}
1113	818	50	33			1282	if($seqid && (! $loc->is_remote())) {
1114	0					0	$loc->seq_id($seqid); # propagates if it is a split location
1115							}
1116	818					1919	my $sf = Bio::SeqFeature::Generic->direct_new();
1117	818					1208	$sf->location($loc);
1118	818					1231	$sf->primary_tag($primary_tag);
1119	818					1222	$sf->seq_id($seqid);
1120	818					1106	$sf->source_tag($source);
1121	818					977	delete $data->{'FEATURE_KEY'};
1122	818					808	delete $data->{'LOCATION'};
1123	818					835	delete $data->{'NAME'};
1124	818					597	delete $data->{'DATA'};
1125	818					1788	$sf->set_attributes(-tag => $data);
1126	818					902	push @{ $self->{'_params'}->{'-features'} }, $sf;
	818					2462
1127							}
1128
1129							####################### ODDS AND ENDS #######################
1130
1131							# Those things that don't fit anywhere else. If a specific name
1132							# maps to the below table, that class and method are used, otherwise
1133							# it goes into a SimpleValue (I think this is a good argument for why
1134							# we need a generic mechanism for storing annotation)
1135
1136							sub _generic_simplevalue {
1137	16			16		16	my ($self, $data) = @_;
1138							$self->annotation_collection->add_Annotation(
1139							Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
1140							-value => $data->{DATA})
1141	16					41	);
1142							}
1143
1144				15	0		sub noop {}
1145
1146							sub _debug {
1147	0			0			my ($self, $data) = @_;
1148	0						$self->debug(Dumper($data));
1149							}
1150
1151
1152							1;