File Coverage

Bio/SeqIO/Handler/GenericRichSeqHandler.pm

Criterion	Covered	Total	%
statement	324	411	78.8
branch	120	204	58.8
condition	37	61	60.6
subroutine	46	48	95.8
pod	11	12	91.6
total	538	736	73.1

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Chris Fields
7							#
8							# Copyright Chris Fields
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based
17							data handler for GenBank/EMBL/UniProt (and other) sequence data
18
19							=head1 SYNOPSIS
20
21							# MyHandler is a GenericRichSeqHandler object.
22							# inside a parser (driver) constructor....
23
24							$self->seq_handler($handler \|\| MyHandler->new(-format => 'genbank'));
25
26							# in next_seq() in driver...
27
28							$hobj = $self->seqhandler();
29
30							# roll data up into hashref chunks, pass off into Handler for processing...
31
32							$hobj->data_handler($data);
33
34							# or retrieve Handler methods and pass data directly to Handler methods...
35
36							my $hmeth = $hobj->handler_methods;
37
38							if ($hmeth->{ $data->{NAME} }) {
39							my $mth = $hmeth->{ $data->{NAME} };
40							$hobj->$mth($data);
41							}
42
43							=head1 DESCRIPTION
44
45							This is an experimental implementation of a sequence-based HandlerBaseI parser
46							and may change over time. It is possible (nay, likely) that the way handler
47							methods are set up will change over development to allow more flexibility.
48							Release pumpkins, please do not add this to a release until the API has settled.
49							It is also likely that write_seq() will not work properly for some data.
50
51							Standard Developer caveats:
52
53							Do not use for production purposes.
54							Not responsible for destroying (your data\|computer\|world).
55							Do not stare directly at GenericRichSeqHandler.
56							If GenericRichSeqHandler glows, back slowly away and call for help.
57
58							Consider yourself warned!
59
60							This class acts as a demonstration on how to handle similar data chunks derived
61							from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver
62							using similar (or the same) handler methods.
63
64							The modules currently pass all previous tests in t/genbank.t, t/embl.t, and
65							t/swiss.t yet all use the same handler methods (the collected tests for handlers
66							can be found in t/Handler.t). Some tweaking of the methods themselves is
67							probably in order over the long run to ensure that data is consistently handled
68							for each parser. Round-trip tests are probably in order here...
69
70							Though a Bio::Seq::SeqBuilder is employed for building sequence objects no
71							bypassing of data based on builder slots has been implemented (yet); this is
72							planned in the near future.
73
74							As a reminder: this is the current Annotation data chunk (via Data::Dumper):
75
76							$VAR1 = {
77							'NAME' => 'REFERENCE',
78							'DATA' => '1 (bases 1 to 10001)'
79							'AUTHORS' => 'International Human Genome Sequencing Consortium.'
80							'TITLE' => 'The DNA sequence of Homo sapiens'
81							'JOURNAL' => 'Unpublished (2003)'
82							};
83							...
84
85							This is the current SeqFeature data chunk (again via Data::Dumper):
86
87							$VAR1 = {
88							'mol_type' => 'genomic DNA',
89							'LOCATION' => '<1..>10001',
90							'NAME' => 'FEATURES',
91							'FEATURE_KEY' => 'source',
92							'note' => 'Accession AL451081 sequenced by The Sanger Centre',
93							'db_xref' => 'taxon:9606',
94							'clone' => 'RP11-302I18',
95							'organism' => 'Homo sapiens'
96							};
97
98							=head1 FEEDBACK
99
100							=head2 Mailing Lists
101
102							User feedback is an integral part of the evolution of this and other
103							Bioperl modules. Send your comments and suggestions preferably to one
104							of the Bioperl mailing lists. Your participation is much appreciated.
105
106							bioperl-l@bioperl.org - General discussion
107							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108
109							=head2 Support
110
111							Please direct usage questions or support issues to the mailing list:
112
113							I
114
115							rather than to the module maintainer directly. Many experienced and
116							reponsive experts will be able look at the problem and quickly
117							address it. Please include a thorough description of the problem
118							with code and data examples if at all possible.
119
120							=head2 Reporting Bugs
121
122							Report bugs to the Bioperl bug tracking system to help us keep track
123							the bugs and their resolution. Bug reports can be submitted via the
124							web:
125
126							https://github.com/bioperl/bioperl-live/issues
127
128							=head1 AUTHOR - Chris Fields
129
130							Email cjfields at bioperl dot org
131
132							=head1 APPENDIX
133
134							The rest of the documentation details each of the object methods. Internal
135							methods are usually preceded with a _
136
137							=cut
138
139							# Let the code begin...
140
141							package Bio::SeqIO::Handler::GenericRichSeqHandler;
142	1			1		6	use strict;
	1					1
	1					23
143	1			1		4	use warnings;
	1					1
	1					20
144
145	1			1		592	use Bio::SeqIO::FTHelper;
	1					3
	1					24
146	1			1		6	use Bio::Annotation::Collection;
	1					1
	1					17
147	1			1		706	use Bio::Annotation::DBLink;
	1					2
	1					28
148	1			1		589	use Bio::Annotation::Comment;
	1					2
	1					23
149	1			1		597	use Bio::Annotation::Reference;
	1					2
	1					26
150	1			1		6	use Bio::Annotation::Collection;
	1					1
	1					15
151	1			1		5	use Bio::Annotation::SimpleValue;
	1					2
	1					16
152	1			1		662	use Bio::Annotation::TagTree;
	1					2
	1					28
153	1			1		7	use Bio::SeqFeature::Generic;
	1					1
	1					17
154	1			1		624	use Bio::Species;
	1					2
	1					25
155	1			1		6	use Bio::Taxon;
	1					2
	1					14
156	1			1		4	use Bio::DB::Taxonomy;
	1					2
	1					15
157	1			1		4	use Bio::Factory::FTLocationFactory;
	1					2
	1					20
158	1			1		4	use Data::Dumper;
	1					2
	1					47
159
160	1			1		4	use base qw(Bio::Root::Root Bio::HandlerBaseI);
	1					2
	1					662
161
162							my %HANDLERS = (
163							'genbank' => {
164							'LOCUS' => \&_genbank_locus,
165							'DEFINITION' => \&_generic_description,
166							'ACCESSION' => \&_generic_accession,
167							'VERSION' => \&_generic_version,
168							'KEYWORDS' => \&_generic_keywords,
169							'DBSOURCE' => \&_genbank_dbsource,
170							'DBLINK' => \&_genbank_dbsource,
171							'SOURCE' => \&_generic_species,
172							'REFERENCE' => \&_generic_reference,
173							'COMMENT' => \&_generic_comment,
174							'FEATURES' => \&_generic_seqfeatures,
175							'BASE' => \&noop, # this is generated from scratch
176							'ORIGIN' => \&_generic_seq,
177							# handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT)
178							'_DEFAULT_' => \&_generic_simplevalue,
179							},
180							'embl' => {
181							'ID' => \&_embl_id,
182							'DT' => \&_embl_date,
183							'DR' => \&_generic_dbsource,
184							'SV' => \&_generic_version,
185							'RN' => \&_generic_reference,
186							'KW' => \&_generic_keywords,
187							'DE' => \&_generic_description,
188							'AC' => \&_generic_accession,
189							#'AH' => \&noop, # TPA data not dealt with yet...
190							#'AS' => \&noop,
191							'SQ' => \&_generic_seq,
192							'OS' => \&_generic_species,
193							'CC' => \&_generic_comment,
194							'FT' => \&_generic_seqfeatures,
195							# handles anything else (WGS, TPA, ANN...)
196							'_DEFAULT_' => \&_generic_simplevalue,
197							},
198							'swiss' => {
199							'ID' => \&_swiss_id,
200							'DT' => \&_swiss_date,
201							'GN' => \&_swiss_genename,
202							'DR' => \&_generic_dbsource,
203							'RN' => \&_generic_reference,
204							'KW' => \&_generic_keywords,
205							'DE' => \&_generic_description,
206							'AC' => \&_generic_accession,
207							'SQ' => \&_generic_seq,
208							'OS' => \&_generic_species,
209							'CC' => \&_generic_comment,
210							'FT' => \&_generic_seqfeatures,
211							# handles anything else, though I don't know what...
212							'_DEFAULT_' => \&_generic_simplevalue,
213							},
214							);
215
216							# can we do this generically? Seems like a lot of trouble...
217							my %DBSOURCE = map {$_ => 1} qw(
218							EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
219							TIGR GO InterPro Pfam PROSITE SGD GermOnline
220							HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
221							H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
222							MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
223							UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
224							PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
225							Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
226							TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
227							PhotoList Gramene WormBase WormPep Genew ZFIN
228							PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
229							swissprot GenBank GenPept REFSEQ embl PDB UniProtKB);
230
231							my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES);
232
233							our %VALID_ALPHABET = (
234							'bp' => 'dna',
235							'aa' => 'protein',
236							'rc' => '' # rc = release candidate; file has no sequences
237							);
238
239							=head2 new
240
241							Title : new
242							Usage :
243							Function:
244							Returns :
245							Args : -format Sequence format to be mapped for handler methods
246							-builder Bio::Seq::SeqBuilder object (normally defined in
247							SequenceStreamI object implementation constructor)
248							Throws : On undefined '-format' sequence format parameter
249							Note : Still under heavy development
250
251							=cut
252
253							sub new {
254	42			42	1	185	my ($class, @args) = @_;
255	42					165	my $self = $class->SUPER::new(@args);
256	42					234	$self = {@args};
257	42					89	bless $self,$class;
258	42					170	my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args);
259	42	50				153	$self->throw("Must define sequence record format") if !$format;
260	42					218	$self->format($format);
261	42					171	$self->handler_methods();
262	42	50				220	$builder && $self->seqbuilder($builder);
263	42					170	$self->location_factory();
264	42					227	return $self;
265							}
266
267							=head1 L implementing methods
268
269							=head2 handler_methods
270
271							Title : handler_methods
272							Usage : $handler->handler_methods('GenBank')
273							%handlers = $handler->handler_methods();
274							Function: Retrieve the handler methods used for the current format() in
275							the handler. This assumes the handler methods are already
276							described in the HandlerI-implementing class.
277							Returns : a hash reference with the data type handled and the code ref
278							associated with it.
279							Args : [optional] String representing the sequence format. If set here
280							this will also set sequence_format()
281							Throws : On unimplemented sequence format in %HANDLERS
282
283							=cut
284
285							sub handler_methods {
286	75			75	1	132	my $self = shift;
287	75	100				189	if (!($self->{'handlers'})) {
288							$self->throw("No handlers defined for seqformat ",$self->format)
289	42	50				123	unless exists $HANDLERS{$self->format};
290	42					121	$self->{'handlers'} = $HANDLERS{$self->format};
291							}
292	75					156	return ($self->{'handlers'});
293							}
294
295							=head2 data_handler
296
297							Title : data_handler
298							Usage : $handler->data_handler($data)
299							Function: Centralized method which accepts all data chunks, then distributes
300							to the appropriate methods for processing based on the chunk name
301							from within the HandlerBaseI object.
302
303							One can also use
304							Returns : None
305							Args : an hash ref containing a data chunk.
306
307							=cut
308
309							sub data_handler {
310	1646			1646	1	3476	my ($self, $data) = @_;
311	1646		33			4039	my $nm = $data->{NAME} \|\| $self->throw("No name tag defined!");
312
313							# this should handle data on the fly w/o caching; any caching should be
314							# done in the driver!
315							my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) :
316	1646	50				6349	(exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) :
		100
317							undef;
318	1646	50				3093	if (!$method) {
319	0					0	$self->debug("No handler defined for $nm\n");
320	0					0	return;
321							};
322	1646					4154	$self->$method($data);
323							}
324
325							=head2 reset_parameters
326
327							Title : reset_parameters
328							Usage : $handler->reset_parameters()
329							Function: Resets the internal cache of data (normally object parameters for
330							a builder or factory)
331							Returns : None
332							Args : None
333
334							=cut
335
336							sub reset_parameters {
337	58			58	1	115	my $self = shift;
338	58					592	$self->{'_params'} = undef;
339							}
340
341							=head2 format
342
343							Title : format
344							Usage : $handler->format('GenBank')
345							Function: Get/Set the format for the report/record being parsed. This can be
346							used to set handlers in classes which are capable of processing
347							similar data chunks from multiple driver modules.
348							Returns : String with the sequence format
349							Args : [optional] String with the sequence format
350							Note : The format may be used to set the handlers (as in the
351							current GenericRichSeqHandler implementation)
352
353							=cut
354
355							sub format {
356	1301			1301	1	2114	my $self = shift;
357	1301	100				2871	return $self->{'_seqformat'} = lc shift if @_;
358	1259					3472	return $self->{'_seqformat'};
359							}
360
361							=head2 get_params
362
363							Title : get_params
364							Usage : $handler->get_params('-species')
365							Function: Convenience method used to retrieve the specified
366							parameters from the internal parameter cache
367							Returns : Hash ref containing parameters requested and data as
368							key-value pairs. Note that some parameter values may be
369							objects, arrays, etc.
370							Args : List (array) representing the parameters requested
371
372							=cut
373
374							sub get_params {
375	818			818	1	1647	my ($self, @ids) = @_;
376	818					1306	my %data;
377	818					1426	for my $id (@ids) {
378	818	50				2422	if (!index($id, '-')==0) {
379	818					2103	$id = '-'.$id ;
380							}
381	818	100				3653	$data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id});
382							}
383	818					2380	return \%data;
384							}
385
386							=head2 set_params
387
388							Title : set_params
389							Usage : $handler->set_param({'-species')
390							Function: Convenience method used to set specific parameters
391							Returns : None
392							Args : Hash ref containing the data to be passed as key-value pairs
393
394							=cut
395
396							sub set_params {
397	0			0	1	0	shift->throw('Not implemented yet!');
398							}
399
400							=head1 Methods unique to this implementation
401
402							=head2 seqbuilder
403
404							Title : seqbuilder
405							Usage :
406							Function:
407							Returns :
408							Args :
409							Throws :
410							Note :
411
412							=cut
413
414							sub seqbuilder {
415	104			104	1	208	my $self = shift;
416	104	100				361	return $self->{'_seqbuilder'} = shift if @_;
417	62					175	return $self->{'_seqbuilder'};
418							}
419
420							=head2 build_sequence
421
422							Title : build_sequence
423							Usage :
424							Function:
425							Returns :
426							Args :
427							Throws :
428							Note :
429
430							=cut
431
432							sub build_sequence {
433	62			62	1	146	my $self = shift;
434	62					251	my $builder = $self->seqbuilder();
435	62					109	my $seq;
436	62	100				194	if (defined($self->{'_params'})) {
437	58					111	$builder->add_slot_value(%{ $self->{'_params'} });
	58					839
438	58					340	$seq = $builder->make_object();
439	58					321	$self->reset_parameters;
440							}
441	62	100				1149	return $seq if $seq;
442	4					25	return 0;
443							}
444
445							=head2 location_factory
446
447							Title : location_factory
448							Usage :
449							Function:
450							Returns :
451							Args :
452							Throws :
453							Note :
454
455							=cut
456
457							sub location_factory {
458	42			42	1	108	my ($self, $factory) = @_;
459	42	50				213	if ($factory) {
		50
460	0	0	0			0	$self->throw("Must have a Bio::Factory::LocationFactoryI when ".
461							"explicitly setting factory()") unless
462							(ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI'));
463	0					0	$self->{'_locfactory'} = $factory;
464							} elsif (!defined($self->{'_locfactory'})) {
465	42					279	$self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new()
466							}
467	42					93	return $self->{'_locfactory'};
468							}
469
470							=head2 annotation_collection
471
472							Title : annotation_collection
473							Usage :
474							Function:
475							Returns :
476							Args :
477							Throws :
478							Note :
479
480							=cut
481
482							sub annotation_collection {
483	746			746	1	1516	my ($self, $coll) = @_;
484	746	50				2781	if ($coll) {
		100
485	0	0	0			0	$self->throw("Must have Bio::AnnotationCollectionI ".
486							"when explicitly setting collection()")
487							unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI'));
488	0					0	$self->{'_params'}->{'-annotation'} = $coll;
489							} elsif (!exists($self->{'_params'}->{'-annotation'})) {
490	57					536	$self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new()
491							}
492	746					3127	return $self->{'_params'}->{'-annotation'};
493							}
494
495							####################### SEQUENCE HANDLERS #######################
496
497							# any sequence data
498							sub _generic_seq {
499	53			53		155	my ($self, $data) = @_;
500	53					435	$self->{'_params'}->{'-seq'} = $data->{DATA};
501							}
502
503							####################### RAW DATA HANDLERS #######################
504
505							# GenBank LOCUS line
506							sub _genbank_locus {
507	31			31		71	my ($self, $data) = @_;
508	31					237	my (@tokens) = split m{\s+}, $data->{DATA};
509	31					71	my $display_id = shift @tokens;
510	31					118	$self->{'_params'}->{'-display_id'} = $display_id;
511	31					64	my $seqlength = shift @tokens;
512	31	50				94	if (exists $VALID_ALPHABET{$seqlength}) {
513							# moved one token too far. No locus name?
514							$self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}.
515	0					0	"] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'});
516	0					0	$self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'};
517	0					0	$self->{'_params'}->{'-display_id'} = 'unknown';
518							# add token back...
519	0					0	unshift @tokens, $seqlength;
520							} else {
521	31					79	$self->{'_params'}->{'-length'} = $seqlength;
522							}
523	31					67	my $alphabet = lc(shift @tokens);
524							$self->{'_params'}->{'-alphabet'} =
525	31	50				133	(exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
526							$self->warn("Unknown alphabet: $alphabet");
527	31	50	66			142	if (($self->{'_params'}->{'-alphabet'} eq 'dna') \|\| (@tokens > 2)) {
528	31					98	$self->{'_params'}->{'-molecule'} = shift(@tokens);
529	31					71	my $circ = shift(@tokens);
530	31	100				83	if ($circ eq 'circular') {
531	2					6	$self->{'_params'}->{'-is_circular'} = 1;
532	2					7	$self->{'_params'}->{'-division'} = shift(@tokens);
533							} else {
534							# 'linear' or 'circular' may actually be omitted altogether
535	29	100				96	$self->{'_params'}->{'-division'} =
536							(CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
537							}
538							} else {
539	0	0				0	$self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa');
540	0					0	$self->{'_params'}->{'-division'} = shift(@tokens);
541							}
542	31					103	my $date = join(' ', @tokens);
543							# maybe use Date::Time for dates?
544	31	100	66			287	if($date && $date =~ s{\s((\d{1,2})-(\w{3})-(\d{2,4})).}{$1}) {
545
546	30	50				75	if( length($date) < 11 ) {
547							# improperly formatted date
548							# But we'll be nice and fix it for them
549	0					0	my ($d,$m,$y) = ($2,$3,$4);
550	0	0				0	if( length($d) == 1 ) {
551	0					0	$d = "0$d";
552							}
553							# guess the century here
554	0	0				0	if( length($y) == 2 ) {
555	0	0				0	if( $y > 60 ) { # arbitrarily guess that '60' means 1960
556	0					0	$y = "19$y";
557							} else {
558	0					0	$y = "20$y";
559							}
560	0					0	$self->warn("Date was malformed, guessing the century for $date to be $y\n");
561							}
562	0					0	$self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)];
563							} else {
564	30					107	$self->{'_params'}->{'-dates'} = [$date];
565							}
566							}
567							}
568
569							# EMBL ID line
570							sub _embl_id {
571	10			10		33	my ($self, $data) = @_;
572	10					21	my $alphabet;
573	10					26	my ($name, $sv, $topology, $mol, $div);
574	10					28	my $line = $data->{DATA};
575							#$self->debug("$line\n");
576	10					34	my ($idtype) = $line =~ tr/;/;/;
577	10	100				61	if ( $idtype == 6) { # New style headers contain exactly six semicolons.
		100
578							# New style header (EMBL Release >= 87, after June 2006)
579	1					5	my $topology;
580							my $sv;
581
582							# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
583							# This regexp comes from the new2old.pl conversion script, from EBI
584	1	50				13	if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) {
585	1					9	($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
586							} else {
587	0					0	$self->throw("Unrecognized EMBL ID line:[$line]");
588							}
589	1	50				5	if (defined($sv)) {
590	1					6	$self->{'_params'}->{'-seq_version'} = $sv;
591	1					4	$self->{'_params'}->{'-version'} = $sv;
592							}
593
594	1	50				4	if ($topology eq "circular") {
595	0					0	$self->{'_params'}->{'-is_circular'} = 1;
596							}
597
598	1	50				4	if (defined $mol ) {
599	1	50				8	if ($mol =~ /DNA/) {
		50
		0
600	0					0	$alphabet='dna';
601							}
602							elsif ($mol =~ /RNA/) {
603	1					4	$alphabet='rna';
604							}
605							elsif ($mol =~ /AA/) {
606	0					0	$alphabet='protein';
607							}
608							}
609							} elsif ($idtype) { # has internal ';'
610							# Old style header (EMBL Release < 87, before June 2006)
611	8	50				99	if ($line =~ m{^(\S+)[^;];\s+(\S+)[^;];\s+(\S+)[^;]*;}) {
612	8					52	($name, $mol, $div) = ($1, $2, $3);
613							#$self->debug("[$name][$mol][$div]");
614							}
615
616	8	50				41	if($mol) {
617	8	50				33	if ( $mol =~ m{circular} ) {
618	0					0	$self->{'_params'}->{'-is_circular'} = 1;
619	0					0	$mol =~ s{circular }{};
620							}
621	8	50				37	if (defined $mol ) {
622	8	100				53	if ($mol =~ /DNA/) {
		50
		0
623	7					30	$alphabet='dna';
624							}
625							elsif ($mol =~ /RNA/) {
626	1					5	$alphabet='rna';
627							}
628							elsif ($mol =~ /AA/) {
629	0					0	$alphabet='protein';
630							}
631							}
632							}
633							} else {
634	1					2	$name = $data->{DATA};
635							}
636	10	50	33			85	unless( defined $name && length($name) ) {
637	0					0	$name = "unknown_id";
638							}
639	10					69	$self->{'_params'}->{'-display_id'} = $name;
640	10					35	$self->{'_params'}->{'-alphabet'} = $alphabet;
641	10	100				62	$self->{'_params'}->{'-division'} = $div if $div;
642	10	100				71	$self->{'_params'}->{'-molecule'} = $mol if $mol;
643							}
644
645							# UniProt/SwissProt ID line
646							sub _swiss_id {
647	17			17		55	my ($self, $data) = @_;
648	17					46	my ($name, $seq_div);
649	17	50				198	if($data->{DATA} =~ m{^
650							(\S+) \s+ # $1 entryname
651							([^\s;]+); \s+ # $2 DataClass
652							(?:PRT;)? \s+ # Molecule Type (optional)
653							[0-9]+[ ]AA \. # Sequencelength (capture?)
654							$
655							}ox ) {
656	17					93	($name, $seq_div) = ($1, $2);
657	17	50	100			197	$self->{'_params'}->{'-namespace'} =
		100	66
658							($seq_div eq 'Reviewed' \|\| $seq_div eq 'STANDARD') ? 'Swiss-Prot' :
659							($seq_div eq 'Unreviewed' \|\| $seq_div eq 'PRELIMINARY') ? 'TrEMBL' :
660							$seq_div;
661							# we shouldn't be setting the division, but for now...
662	17					113	my ($junk, $division) = split q(_), $name;
663	17					87	$self->{'_params'}->{'-division'} = $division;
664	17					53	$self->{'_params'}->{'-alphabet'} = 'protein';
665							# this is important to have the id for display in e.g. FTHelper, otherwise
666							# you won't know which entry caused an error
667	17					97	$self->{'_params'}->{'-display_id'} = $name;
668							} else {
669	0					0	$self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]");
670							}
671							}
672
673							# UniProt/SwissProt GN line
674							sub _swiss_genename {
675	17			17		51	my ($self, $data) = @_;
676							#$self->debug(Dumper($data));
677	17					68	my $genename = $data->{DATA};
678	17					31	my $gn;
679	17	50				73	if ($genename) {
680	17					45	my @stags;
681	17	100				98	if ($genename =~ /\w=\w/) {
682							# new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
683	10					57	for my $n (split(m{\s+and\s+},$genename)) {
684	12					26	my @genenames;
685	12					136	for my $section (split(m{\s;\s},$n)) {
686	15					60	my ($tag, $rest) = split("=",$section);
687	15		50			45	$rest \|\|= '';
688	15					48	for my $val (split(m{\s,\s},$rest)) {
689	19					73	push @genenames, [$tag => $val];
690							}
691							}
692	12					58	push @stags, ['gene_name' => \@genenames];
693							}
694							} else {
695							# old format
696	7					38	for my $section (split(/ AND /, $genename)) {
697	9					19	my @genenames;
698	9					51	$section =~ s/[\.]//g;
699	9					44	my @names = split(m{\s+OR\s+}, $section);
700	9					39	push @genenames, ['Name' => shift @names];
701	9					24	push @genenames, map {['Synonyms' => $_]} @names;
	10					23
702	9					45	push @stags, ['gene_name' => \@genenames]
703							}
704							} #use Data::Dumper; print Dumper $gn, $genename;# exit;
705	17					251	my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name',
706							-value => ['gene_names' => \@stags]);
707	17					78	$self->annotation_collection->add_Annotation('gene_name', $gn);
708							}
709							}
710
711							# GenBank VERSION line
712							# old EMBL SV line (now obsolete)
713							# UniProt/SwissProt?
714							sub _generic_version {
715	28			28		74	my ($self, $data) = @_;
716	28					131	my ($acc,$gi) = split(' ',$data->{DATA});
717	28	100				157	if($acc =~ m{^\w+\.(\d+)}xmso) {
718	27					95	$self->{'_params'}->{'-version'} = $1;
719	27					60	$self->{'_params'}->{'-seq_version'} = $1;
720							}
721	28	100	66			158	if($gi && (index($gi,"GI:") == 0)) {
722	24					90	$self->{'_params'}->{'-primary_id'} = substr($gi,3);
723							}
724							}
725
726							# EMBL DT lines
727							sub _embl_date {
728	5			5		21	my ($self, $data) = @_;
729	5					58	while ($data->{DATA} =~ m{(\S+)\s$(.*?)$}g) {
730	10					44	my ($date, $version) = ($1, $2);
731	10					25	$date =~ tr{,}{}d; # remove comma if new version
732	10	50				51	if ($version =~ m{$Rel\.\s(\d+),\sCreated$}xmso ) {
		50
733	0					0	my $release = Bio::Annotation::SimpleValue->new(
734							-tagname => 'creation_release',
735							-value => $1
736							);
737	0					0	$self->annotation_collection->add_Annotation($release);
738							} elsif ($version =~ m{$Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)$}xmso ) {
739	0					0	my $release = Bio::Annotation::SimpleValue->new(
740							-tagname => 'update_release',
741							-value => $1
742							);
743	0					0	$self->annotation_collection->add_Annotation($release);
744	0					0	my $update = Bio::Annotation::SimpleValue->new(
745							-tagname => 'update_version',
746							-value => $2
747							);
748	0					0	$self->annotation_collection->add_Annotation($update);
749							}
750	10					23	push @{ $self->{'_params'}->{'-dates'} }, $date;
	10					83
751							}
752							}
753
754							# UniProt/SwissProt DT lines
755							sub _swiss_date {
756	17			17		71	my ($self, $data) = @_;
757							# swissprot
758	17					101	my @dls = split m{\n}, $data->{DATA};
759	17					66	for my $dl (@dls) {
760	51					191	my ($date, $version) = split(' ', $dl, 2);
761	51					121	$date =~ tr{,}{}d; # remove comma if new version
762	51	100	100			718	if ($version =~ m{$Rel\. (\d+), Last sequence update$} \|\| # old
		100	100
763							$version =~ m{sequence version (\d+)\.}) { #new
764	17					214	my $update = Bio::Annotation::SimpleValue->new(
765							-tagname => 'seq_update',
766							-value => $1
767							);
768	17					91	$self->annotation_collection->add_Annotation($update);
769							} elsif ($version =~ m{$Rel\. (\d+), Last annotation update$} \|\| #old
770							$version =~ m{entry version (\d+)\.}) { #new
771	17					112	$self->{'_params'}->{'-version'} = $1;
772	17					74	$self->{'_params'}->{'-seq_version'} = $1;
773							}
774	51					92	push @{ $self->{'_params'}->{'-dates'} }, $date;
	51					233
775							}
776							}
777
778							# GenBank KEYWORDS line
779							# EMBL KW line
780							# UniProt/SwissProt KW line
781							sub _generic_keywords {
782	55			55		151	my ($self, $data) = @_;
783	55					368	$data->{DATA} =~ s{\.$}{};
784	55					562	my @kw = split m{\s\;\s}xo ,$data->{DATA};
785	55					246	$self->{'_params'}->{'-keywords'} = \@kw;
786							}
787
788							# GenBank DEFINITION line
789							# EMBL DE line
790							# UniProt/SwissProt DE line
791							sub _generic_description {
792	57			57		159	my ($self, $data) = @_;
793	57					216	$self->{'_params'}->{'-desc'} = $data->{DATA};
794							}
795
796							# GenBank ACCESSION line
797							# EMBL AC line
798							# UniProt/SwissProt AC line
799							sub _generic_accession {
800	56			56		163	my ($self, $data) = @_;
801	56					370	my @accs = split m{[\s;]+}, $data->{DATA};
802	56					214	$self->{'_params'}->{'-accession_number'} = shift @accs;
803	56	100				253	$self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs;
804							}
805
806							####################### SPECIES HANDLERS #######################
807
808							# uses Bio::Species
809							# GenBank SOURCE, ORGANISM lines
810							# EMBL O* lines
811							# UniProt/SwissProt O* lines
812							sub _generic_species {
813	56			56		149	my ($self, $data) = @_;
814
815	56					195	my $seqformat = $self->format;
816							# if data is coming in from GenBank parser...
817	56	100	66			456	if ($seqformat eq 'genbank' &&
818							$data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) {
819	30					135	($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2);
820							}
821
822							# SwissProt stuff...
823							# hybrid names in swissprot files are no longer valid per intergration into
824							# UniProt. Files containing these have been split into separate entries, so
825							# it is probably a good idea to update if one has these lingering around...
826
827	56					137	my $taxid;
828	56	100				172	if ($seqformat eq 'swiss') {
829	17	50				135	if ($data->{DATA} =~ m{^([^,]+)}ox) {
830	17					98	$data->{DATA} = $1;
831							}
832	17	100	66			170	if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) {
833	16					61	$taxid = $1;
834							}
835							}
836
837							my ($sl, $class, $sci_name) = ($data->{DATA},
838							$data->{CLASSIFICATION},
839	56		100			334	$data->{ORGANISM} \|\| '');
840	56					109	my ($organelle,$abbr_name, $common);
841	56					708	my @class = reverse split m{\s;\s}, $class;
842							# have to treat swiss different from everything else...
843	56	50				749	if ($sl =~ m{^(mitochondrion\|chloroplast\|plastid)? # GenBank format
844							\s(.?)
845							\s(?: $ (.?) $ )?\.?$
846							}xmso ){
847	56					272	($organelle, $abbr_name, $common) = ($1, $2, $3); # optional
848							} else {
849	0					0	$abbr_name = $sl; # nothing caught; this is a backup!
850							}
851							# there is no 'abbreviated name' for EMBL
852	56	100				208	$sci_name = $abbr_name if $seqformat ne 'genbank';
853	56		100			325	$organelle \|\|= '';
854	56		100			195	$common \|\|= '';
855	56	50				118	$sci_name \|\| return;
856	56					150	unshift @class, $sci_name;
857							# no genus/species parsing here; moving to Bio::Taxon-based taxonomy
858	56					532	my $make = Bio::Species->new();
859	56					294	$make->scientific_name($sci_name);
860	56	50				344	$make->classification(@class) if @class > 0;
861	56	100				302	$common && $make->common_name( $common );
862	56	50				334	$abbr_name && $make->name('abbreviated', $abbr_name);
863	56	100				247	$organelle && $make->organelle($organelle);
864	56	100				287	$taxid && $make->ncbi_taxid($taxid);
865	56					668	$self->{'_params'}->{'-species'} = $make;
866							}
867
868							####################### ANNOTATION HANDLERS #######################
869
870							# GenBank DBSOURCE line
871							sub _genbank_dbsource {
872	1			1		3	my ($self, $data) = @_;
873	1					3	my $dbsource = $data->{DATA};
874	1					3	my $annotation = $self->annotation_collection;
875							# deal with swissprot dbsources
876							# we could possibly parcel these out to subhandlers...
877	1	50				4	if( $dbsource =~ s/(UniProt(?:KB)\|swissprot):\s+locus\s+(\S+)\,.+\n// ) {
878	0					0	$annotation->add_Annotation
879							('dblink',
880							Bio::Annotation::DBLink->new
881							(-primary_id => $2,
882							-database => $1,
883							-tagname => 'dblink'));
884	0	0				0	if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) {
885	0					0	$annotation->add_Annotation
886							('swissprot_dates',
887							Bio::Annotation::SimpleValue->new
888							(-tagname => 'date_created',
889							-value => $1));
890							}
891	0					0	while( $dbsource =~ s/\s*(sequence\|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
892	0					0	$annotation->add_Annotation
893							('swissprot_dates',
894							Bio::Annotation::SimpleValue->new
895							(-tagname => 'date_updated',
896							-value => $1));
897							}
898	0					0	$dbsource =~ s/\n/ /g;
899	0	0				0	if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
		0
900							# will use $i to determine even or odd
901							# for swissprot the accessions are paired
902	0					0	my $i = 0;
903	0					0	for my $dbsrc ( split(/,\s+/,$1) ) {
904	0	0	0			0	if( $dbsrc =~ /(\S+)\.(\d+)/ \|\| $dbsrc =~ /(\S+)/ ) {
905	0					0	my ($id,$version) = ($1,$2);
906	0	0				0	$version ='' unless defined $version;
907	0					0	my $db;
908	0	0				0	if( $id =~ /^\d\S{3}/) {
909	0					0	$db = 'PDB';
910							} else {
911	0	0				0	$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
912							}
913	0					0	$annotation->add_Annotation
914							('dblink',
915							Bio::Annotation::DBLink->new
916							(-primary_id => $id,
917							-version => $version,
918							-database => $db,
919							-tagname => 'dblink'));
920							}
921							}
922							} elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
923							# download screwed up and ncbi didn't put acc in for gi numbers
924	0					0	my $i = 0;
925	0					0	for my $id ( split(/\,\s+/,$1) ) {
926	0					0	my ($acc,$db);
927	0	0				0	if( $id =~ /gi:\s+(\d+)/ ) {
		0
928	0					0	$acc= $1;
929	0	0				0	$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank';
930							} elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) {
931	0					0	$acc= $1;
932	0					0	$db = 'PDB';
933							} else {
934	0					0	$acc= $id;
935	0					0	$db = '';
936							}
937	0					0	$annotation->add_Annotation
938							('dblink',
939							Bio::Annotation::DBLink->new
940							(-primary_id => $acc,
941							-database => $db,
942							-tagname => 'dblink'));
943							}
944							} else {
945	0					0	$self->warn("Cannot match $dbsource\n");
946							}
947	0	0				0	if( $dbsource =~ s/xrefs\s+$non\-sequence\s+databases$:\s+
948							((?:\S+,\s+)+\S+)//x ) {
949	0					0	for my $id ( split(/\,\s+/,$1) ) {
950	0					0	my $db;
951							# this is because GenBank dropped the spaces!!!
952							# I'm sure we're not going to get this right
953							##if( $id =~ s/^://i ) {
954							## $db = $1;
955							##}
956	0					0	$db = substr($id,0,index($id,':'));
957	0	0				0	if (! exists $DBSOURCE{ $db }) {
958	0					0	$db = ''; # do we want 'GenBank' here?
959							}
960	0					0	$id = substr($id,index($id,':')+1);
961	0					0	$annotation->add_Annotation
962							('dblink',Bio::Annotation::DBLink->new
963							(-primary_id => $id,
964							-database => $db,
965							-tagname => 'dblink'));
966							}
967							}
968							} else {
969	1	50				7	if( $dbsource =~ /^(\S?):?\saccession\s+(\S+)\.(\d+)/ ) {
		0
970	1					4	my ($db,$id,$version) = ($1,$2,$3);
971	1		50			11	$annotation->add_Annotation
972							('dblink',
973							Bio::Annotation::DBLink->new
974							(-primary_id => $id,
975							-version => $version,
976							-database => $db \|\| 'GenBank',
977							-tagname => 'dblink'));
978							} elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) {
979	0					0	my ($id, $db, $version);
980	0	0				0	if ($2 eq ':') {
981	0					0	($db, $id) = ($1, $3);
982							} else {
983	0					0	($db, $id, $version) = ('GenBank', $1, $3);
984							}
985	0					0	$annotation->add_Annotation('dblink',
986							Bio::Annotation::DBLink->new(
987							-primary_id => $id,
988							-version => $version,
989							-database => $db,
990							-tagname => 'dblink')
991							);
992							} else {
993	0					0	$self->warn("Unrecognized DBSOURCE data: $dbsource\n");
994							}
995							}
996							}
997
998							# EMBL DR lines
999							# UniProt/SwissProt DR lines
1000							sub _generic_dbsource {
1001	20			20		76	my ($self, $data) = @_;
1002							#$self->debug(Dumper($data));
1003	20					222	while ($data->{DATA} =~ m{([^\n]+)}og) {
1004	351					1061	my $dblink = $1;
1005	351					1515	$dblink =~ s{\.$}{};
1006	351					585	my $link;
1007	351					1079	my @linkdata = split '; ',$dblink;
1008	351	50				1899	if ( $dblink =~ m{([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?}) {
1009							#if ( $dblink =~ m{([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?}) {
1010	351					1323	my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1011	351					1388	$link = Bio::Annotation::DBLink->new(-database => $databse,
1012							-primary_id => $prim_id,
1013							-optional_id => $sec_id);
1014							} else {
1015	0					0	$self->warn("No match for $dblink");
1016							}
1017	351					1010	$self->annotation_collection->add_Annotation('dblink', $link);
1018							}
1019							}
1020
1021
1022							# GenBank REFERENCE and related lines
1023							# EMBL R* lines
1024							# UniProt/SwissProt R* lines
1025							sub _generic_reference {
1026	301			301		594	my ($self, $data) = @_;
1027	301					707	my $seqformat = $self->format;
1028	301					526	my ($start, $end);
1029							# get these in EMBL/Swiss
1030	301	100				803	if ($data->{CROSSREF}) {
1031	128					1322	while ($data->{CROSSREF} =~ m{(pubmed\|doi\|medline)(?:=\|;\s+)(\S+)}oig) {
1032	199					838	my ($db, $ref) = (uc $1, $2);
1033	199					872	$ref =~ s{[;.]+$}{};
1034	199					1015	$data->{$db} = $ref;
1035							}
1036							}
1037							# run some cleanup for swissprot
1038	301	100				639	if ($seqformat eq 'swiss') {
1039	109					165	for my $val (values %{ $data }) {
	109					623
1040	959					2027	$val =~ s{;$}{};
1041	959					1776	$val =~ s{(\w-)\s}{$1};
1042							}
1043							}
1044	301	100				692	if ( $data->{POSITION} ) {
1045	127	100				628	if ($seqformat eq 'embl') {
		100
1046	18					110	($start, $end) = split '-', $data->{POSITION},2;
1047							} elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss
1048	23					94	($start, $end) = ($1, $2);
1049							}
1050							}
1051	301	100				925	if ($data->{DATA} =~ m{^\d+\s+$[a-z]+\s+(\d+)\s+to\s+(\d+)$}xmso) {
1052	59					161	($start, $end) = ($1, $2);
1053							}
1054							my $ref = Bio::Annotation::Reference->new(
1055							-comment => $data->{REMARK},
1056							-location => $data->{JOURNAL},
1057							-pubmed => $data->{PUBMED},
1058							-consortium => $data->{CONSRTM},
1059							-title => $data->{TITLE},
1060							-authors => $data->{AUTHORS},
1061							-medline => $data->{MEDLINE},
1062							-doi => $data->{DOI},
1063							-rp => $data->{POSITION}, # JIC...
1064	301					3272	-start => $start,
1065							-end => $end,
1066							);
1067	301	100				1780	if ($data->{DATA} =~ m{^\d+\s+$(.*)$}xmso) {
1068	59					185	$ref->gb_reference($1);
1069							}
1070	301					925	$self->annotation_collection->add_Annotation('reference', $ref);
1071							}
1072
1073							# GenBank COMMENT lines
1074							# EMBL CC lines
1075							# UniProt/SwissProt CC lines
1076							sub _generic_comment {
1077	43			43		137	my ($self, $data) = @_;
1078							$self->annotation_collection->add_Annotation('comment',
1079	43					185	Bio::Annotation::Comment->new( -text => $data->{DATA} ));
1080							}
1081
1082							####################### SEQFEATURE HANDLER #######################
1083
1084							# GenBank Feature Table
1085							sub _generic_seqfeatures {
1086	848			848		1542	my ($self, $data) = @_;
1087	848	100				1951	return if $data->{FEATURE_KEY} eq 'FEATURES';
1088	818					1479	my $primary_tag = $data->{FEATURE_KEY};
1089
1090							# grab the NCBI taxon ID from the source SF
1091	818	100	100			1879	if ($primary_tag eq 'source' && exists $data->{'db_xref'}) {
1092	36	100	66			290	if ( $self->{'_params'}->{'-species'} &&
1093							$data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) {
1094	35					188	$self->{'_params'}->{'-species'}->ncbi_taxid($1);
1095							}
1096							}
1097	818					1903	my $source = $self->format;
1098
1099	818					1214	my $seqid = ${ $self->get_params('accession_number') }{'accession_number'};
	818					1930
1100
1101	818					1651	my $loc;
1102	818					1519	eval {
1103	818					3376	$loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'});
1104							};
1105	818	50				1987	if(! $loc) {
1106							$self->warn("exception while parsing location line [" .
1107							$data->{'LOCATION'} .
1108							"] in reading $source, ignoring feature " .
1109	0					0	$data->{'primary_tag'}.
1110							" (seqid=" . $seqid . "): " . $@);
1111	0					0	return;
1112							}
1113	818	50	33			1747	if($seqid && (! $loc->is_remote())) {
1114	0					0	$loc->seq_id($seqid); # propagates if it is a split location
1115							}
1116	818					3299	my $sf = Bio::SeqFeature::Generic->direct_new();
1117	818					2458	$sf->location($loc);
1118	818					2680	$sf->primary_tag($primary_tag);
1119	818					2441	$sf->seq_id($seqid);
1120	818					2149	$sf->source_tag($source);
1121	818					1813	delete $data->{'FEATURE_KEY'};
1122	818					1444	delete $data->{'LOCATION'};
1123	818					1220	delete $data->{'NAME'};
1124	818					1113	delete $data->{'DATA'};
1125	818					2673	$sf->set_attributes(-tag => $data);
1126	818					1546	push @{ $self->{'_params'}->{'-features'} }, $sf;
	818					3767
1127							}
1128
1129							####################### ODDS AND ENDS #######################
1130
1131							# Those things that don't fit anywhere else. If a specific name
1132							# maps to the below table, that class and method are used, otherwise
1133							# it goes into a SimpleValue (I think this is a good argument for why
1134							# we need a generic mechanism for storing annotation)
1135
1136							sub _generic_simplevalue {
1137	16			16		36	my ($self, $data) = @_;
1138							$self->annotation_collection->add_Annotation(
1139							Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
1140							-value => $data->{DATA})
1141	16					49	);
1142							}
1143
1144				15	0		sub noop {}
1145
1146							sub _debug {
1147	0			0			my ($self, $data) = @_;
1148	0						$self->debug(Dumper($data));
1149							}
1150
1151
1152							1;