File Coverage

Bio/SeqIO/embl.pm

Criterion	Covered	Total	%
statement	538	651	82.6
branch	267	434	61.5
condition	90	128	70.3
subroutine	30	33	90.9
pod	2	2	100.0
total	927	1248	74.2

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::SeqIO::EMBL
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Ewan Birney
7							#
8							# Copyright Ewan Birney
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::SeqIO::embl - EMBL sequence input/output stream
17
18							=head1 SYNOPSIS
19
20							It is probably best not to use this object directly, but
21							rather go through the SeqIO handler system. Go:
22
23							$stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
24
25							while ( (my $seq = $stream->next_seq()) ) {
26							# do something with $seq
27							}
28
29							=head1 DESCRIPTION
30
31							This object can transform Bio::Seq objects to and from EMBL flat
32							file databases.
33
34							There is a lot of flexibility here about how to dump things which
35							should be documented more fully.
36
37							There should be a common object that this and Genbank share (probably
38							with Swissprot). Too much of the magic is identical.
39
40							=head2 Optional functions
41
42							=over 3
43
44							=item _show_dna()
45
46							(output only) shows the dna or not
47
48							=item _post_sort()
49
50							(output only) provides a sorting func which is applied to the FTHelpers
51							before printing
52
53							=item _id_generation_func()
54
55							This is function which is called as
56
57							print "ID ", $func($annseq), "\n";
58
59							To generate the ID line. If it is not there, it generates a sensible ID
60							line using a number of tools.
61
62							If you want to output annotations in EMBL format they need to be
63							stored in a Bio::Annotation::Collection object which is accessible
64							through the Bio::SeqI interface method L.
65
66							The following are the names of the keys which are polled from a
67							L object.
68
69							reference - Should contain Bio::Annotation::Reference objects
70							comment - Should contain Bio::Annotation::Comment objects
71							dblink - Should contain Bio::Annotation::DBLink objects
72
73							=back
74
75							=head1 FEEDBACK
76
77							=head2 Mailing Lists
78
79							User feedback is an integral part of the evolution of this and other
80							Bioperl modules. Send your comments and suggestions preferably to one
81							of the Bioperl mailing lists. Your participation is much appreciated.
82
83							bioperl-l@bioperl.org - General discussion
84							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
85
86							=head2 Support
87
88							Please direct usage questions or support issues to the mailing list:
89
90							I
91
92							rather than to the module maintainer directly. Many experienced and
93							reponsive experts will be able look at the problem and quickly
94							address it. Please include a thorough description of the problem
95							with code and data examples if at all possible.
96
97							=head2 Reporting Bugs
98
99							Report bugs to the Bioperl bug tracking system to help us keep track
100							the bugs and their resolution. Bug reports can be submitted via
101							the web:
102
103							https://github.com/bioperl/bioperl-live/issues
104
105							=head1 AUTHOR - Ewan Birney
106
107							Email birney@ebi.ac.uk
108
109							=head1 APPENDIX
110
111							The rest of the documentation details each of the object
112							methods. Internal methods are usually preceded with a _
113
114							=cut
115
116
117							# Let the code begin...
118
119
120							package Bio::SeqIO::embl;
121	7			7		651	use vars qw(%FTQUAL_NO_QUOTE);
	7					10
	7					405
122	7			7		32	use strict;
	7					7
	7					177
123	7			7		1443	use Bio::SeqIO::FTHelper;
	7					13
	7					160
124	7			7		29	use Bio::SeqFeature::Generic;
	7					10
	7					124
125	7			7		1290	use Bio::Species;
	7					10
	7					163
126	7			7		33	use Bio::Seq::SeqFactory;
	7					9
	7					142
127	7			7		26	use Bio::Annotation::Collection;
	7					9
	7					151
128	7			7		1140	use Bio::Annotation::Comment;
	7					8
	7					143
129	7			7		1530	use Bio::Annotation::Reference;
	7					9
	7					181
130	7			7		35	use Bio::Annotation::DBLink;
	7					9
	7					196
131
132	7			7		25	use base qw(Bio::SeqIO);
	7					11
	7					42142
133
134							# Note that a qualifier that exceeds one line (i.e. a long label) will
135							# automatically be quoted regardless:
136							%FTQUAL_NO_QUOTE=(
137							'anticodon'=>1,
138							'citation'=>1,
139							'codon'=>1,
140							'codon_start'=>1,
141							'cons_splice'=>1,
142							'direction'=>1,
143							'evidence'=>1,
144							'label'=>1,
145							'mod_base'=> 1,
146							'number'=> 1,
147							'rpt_type'=> 1,
148							'rpt_unit'=> 1,
149							'transl_except'=> 1,
150							'transl_table'=> 1,
151							'usedin'=> 1,
152							);
153
154							sub _initialize {
155	31			31		75	my($self,@args) = @_;
156
157	31					110	$self->SUPER::_initialize(@args);
158							# hash for functions for decoding keys.
159	31					92	$self->{'_func_ftunit_hash'} = {};
160							# sets this to one by default. People can change it
161	31					105	$self->_show_dna(1);
162	31	50				103	if ( ! defined $self->sequence_factory ) {
163	31					131	$self->sequence_factory(Bio::Seq::SeqFactory->new
164							(-verbose => $self->verbose(),
165							-type => 'Bio::Seq::RichSeq'));
166							}
167							}
168
169							=head2 next_seq
170
171							Title : next_seq
172							Usage : $seq = $stream->next_seq()
173							Function: returns the next sequence in the stream
174							Returns : Bio::Seq object
175							Args :
176
177							=cut
178
179							sub next_seq {
180	25			25	1	86	my ($self,@args) = @_;
181	25					30	my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
182							$date, $comment, @date_arr);
183
184	25					150	my ($annotation, %params, @features) =
185							Bio::Annotation::Collection->new();
186
187	25					78	$line = $self->_readline;
188							# This needs to be before the first eof() test
189
190	25	100				67	if ( !defined $line ) {
191	1					3	return; # no throws - end of file
192							}
193
194	24	100				124	if ( $line =~ /^\s+$/ ) {
195	1					4	while ( defined ($line = $self->_readline) ) {
196	1	50				6	$line =~/^\S/ && last;
197							}
198							# return without error if the whole next sequence was just a single
199							# blank line and then eof
200	1	50				3	return unless $line;
201							}
202
203							# no ID as 1st non-blank line, need short circuit and exit routine
204	24	50				105	$self->throw("EMBL stream with no ID. Not embl in my book")
205							unless $line =~ /^ID\s+\S+/;
206
207							# At this point we are sure that $line contains an ID header line
208	24					27	my $alphabet;
209	24	100				69	if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
210
211							# New style header (EMBL Release >= 87, after June 2006)
212	10					16	my $topology;
213							my $sv;
214
215							# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
216							# This regexp comes from the new2old.pl conversion script, from EBI
217	10	100				56	if ($line =~ m/^ID (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
218	7					43	($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
219							}
220	10	100				28	if (defined $sv) {
221	7					13	$params{'-seq_version'} = $sv;
222	7					13	$params{'-version'} = $sv;
223							}
224
225	10	50	66			48	if (defined $topology && $topology eq 'circular') {
226	0					0	$params{'-is_circular'} = 1;
227							}
228
229	10	100				25	if (defined $mol ) {
230	7	100				26	if ($mol =~ /DNA/) {
		50
		0
231	6					11	$alphabet = 'dna';
232							} elsif ($mol =~ /RNA/) {
233	1					3	$alphabet = 'rna';
234							} elsif ($mol =~ /AA/) {
235	0					0	$alphabet = 'protein';
236							}
237							}
238							} else {
239
240							# Old style header (EMBL Release < 87, before June 2006)
241	14	100				109	if ($line =~ /^ID\s+(\S+)[^;];\s+(\S+)[^;];\s+(\S+)[^;]*;/) {
242	12					55	($name, $mol, $div) = ($1, $2, $3);
243							}
244
245	14	100				33	if ($mol) {
246	12	50				32	if ( $mol =~ /circular/ ) {
247	0					0	$params{'-is_circular'} = 1;
248	0					0	$mol =~ s\|circular \|\|;
249							}
250	12	50				29	if (defined $mol ) {
251	12	100				38	if ($mol =~ /DNA/) {
		50
		0
252	10					22	$alphabet='dna';
253							} elsif ($mol =~ /RNA/) {
254	2					3	$alphabet='rna';
255							} elsif ($mol =~ /AA/) {
256	0					0	$alphabet='protein';
257							}
258							}
259							}
260							}
261
262	24	100	66			119	unless( defined $name && length($name) ) {
263	5					10	$name = "unknown_id";
264							}
265
266							# $self->warn("not parsing upper annotation in EMBL file yet!");
267	24					27	my $buffer = $line;
268	24					29	local $_;
269	24					28	BEFORE_FEATURE_TABLE :
270							my $ncbi_taxid;
271	24					63	until ( !defined $buffer ) {
272	499					404	$_ = $buffer;
273							# Exit at start of Feature table
274	499	100				1093	if ( /^(F[HT]\|SQ)/ ) {
275	24	100	100			150	$self->_pushback($_) if( $1 eq 'SQ' \|\| $1 eq 'FT');
276	24					35	last;
277							}
278							# Description line(s)
279	475	100				701	if (/^DE\s+(\S.*\S)/) {
280	22	100				91	$desc .= $desc ? " $1" : $1;
281							}
282
283							#accession number
284	475	100	100			1418	if ( /^AC\s+(.)?/ \|\| /^PA\s+(.)?/) {
285	24					109	my @accs = split(/[; ]+/, $1); # allow space in addition
286							$params{'-accession_number'} = shift @accs
287	24	100				82	unless defined $params{'-accession_number'};
288	24					32	push @{$params{'-secondary_accessions'}}, @accs;
	24					67
289							}
290
291							#version number
292	475	100				652	if ( /^SV\s+\S+\.(\d+);?/ ) {
293	6					11	my $sv = $1;
294							#$sv =~ s/\;//;
295	6					13	$params{'-seq_version'} = $sv;
296	6					9	$params{'-version'} = $sv;
297							}
298
299							#date (NOTE: takes last date line)
300	475	100				663	if ( /^DT\s+(.+)$/ ) {
301	26					55	my $line = $1;
302	26					66	my ($date, $version) = split(' ', $line, 2);
303	26					41	$date =~ tr/,//d; # remove comma if new version
304	26	100				54	if ($version) {
305	24	100				150	if ($version =~ /$Rel\. (\d+), Created$/ms ) {
		100
306	11					88	my $release = Bio::Annotation::SimpleValue->new(
307							-tagname => 'creation_release',
308							-value => $1
309							);
310	11					37	$annotation->add_Annotation($release);
311							} elsif ($version =~ /$Rel\. (\d+), Last updated, Version (\d+)$/ms ) {
312	11					47	my $release = Bio::Annotation::SimpleValue->new(
313							-tagname => 'update_release',
314							-value => $1
315							);
316	11					29	$annotation->add_Annotation($release);
317
318	11					53	my $update = Bio::Annotation::SimpleValue->new(
319							-tagname => 'update_version',
320							-value => $2
321							);
322	11					27	$annotation->add_Annotation($update);
323							}
324							}
325	26					33	push @{$params{'-dates'}}, $date;
	26					57
326							}
327
328							#keywords
329	475	100				1856	if ( /^KW (.)\S$/ ) {
		100
		100
		100
		100
		100
330	29					154	my @kw = split(/\s\;\s/,$1);
331	29					33	push @{$params{'-keywords'}}, @kw;
	29					110
332							}
333
334							# Organism name and phylogenetic information
335							elsif (/^O[SC]/) {
336							# pass the accession number so we can give an informative throw message if necessary
337	21					79	my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
338	21					50	$params{'-species'}= $species;
339							}
340
341							# NCBI TaxID Xref
342							elsif (/^OX/) {
343	3	50				19	if (/NCBI_TaxID=(\d+)/) {
344	3					12	$ncbi_taxid=$1;
345							}
346
347	3					12	my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
348	3					9	foreach my $dblink ( @links ) {
349	1					6	$annotation->add_Annotation('dblink',$dblink);
350							}
351							}
352
353							# References
354							elsif (/^R/) {
355	99					228	my @refs = $self->_read_EMBL_References(\$buffer);
356	99					136	foreach my $ref ( @refs ) {
357	99					235	$annotation->add_Annotation('reference',$ref);
358							}
359							}
360
361							# DB Xrefs
362							elsif (/^DR/) {
363	5					19	my @links = $self->_read_EMBL_DBLink(\$buffer);
364	5					7	foreach my $dblink ( @links ) {
365	25					36	$annotation->add_Annotation('dblink',$dblink);
366							}
367							}
368
369							# Comments
370							elsif (/^CC\s+(.*)/) {
371	15					39	$comment .= $1;
372	15					23	$comment .= " ";
373	15					30	while (defined ($_ = $self->_readline) ) {
374	257	100				569	if (/^CC\s+(.*)/) {
375	242					313	$comment .= $1;
376	242					356	$comment .= " ";
377							} else {
378	15					20	last;
379							}
380							}
381	15					103	my $commobj = Bio::Annotation::Comment->new();
382	15					46	$commobj->text($comment);
383	15					31	$annotation->add_Annotation('comment',$commobj);
384	15					15	$comment = "";
385							}
386
387							# Get next line.
388	475					897	$buffer = $self->_readline;
389							}
390
391	24					54	while ( defined ($_ = $self->_readline) ) {
392	46	100				126	/^FT\s{3}\w/ && last;
393	24	100				50	/^SQ / && last;
394	22	50				64	/^CO / && last;
395							}
396	24					41	$buffer = $_;
397
398	24	100	66			125	if (defined($buffer) && $buffer =~ /^FT /) {
399	22					59	until ( !defined ($buffer) ) {
400	256					558	my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
401
402							# process ftunit
403	256					538	my $feat =
404							$ftunit->_generic_seqfeature($self->location_factory(), $name);
405
406							# add taxon_id from source if available
407							# Notice, this will override what is found in the OX line.
408							# this is by design as this seems to be the official way
409							# of specifying a TaxID
410	256	100	100			705	if ($params{'-species'} && ($feat->primary_tag eq 'source')
			100
			66
411							&& $feat->has_tag('db_xref')
412							&& (! $params{'-species'}->ncbi_taxid())) {
413	12					31	foreach my $tagval ($feat->get_tag_values('db_xref')) {
414	12	50				45	if (index($tagval,"taxon:") == 0) {
415	12					39	$params{'-species'}->ncbi_taxid(substr($tagval,6));
416	12					23	last;
417							}
418							}
419							}
420
421							# add feature to list of features
422	256					312	push(@features, $feat);
423
424	256	100				1157	if ( $buffer !~ /^FT/ ) {
425	22					60	last;
426							}
427							}
428							}
429							# Set taxid found in OX line
430	24	100	100			129	if ($params{'-species'} && defined $ncbi_taxid
			100
431							&& (! $params{'-species'}->ncbi_taxid())) {
432	2					5	$params{'-species'}->ncbi_taxid($ncbi_taxid);
433							}
434
435							# skip comments
436	24		66			143	while ( defined ($buffer) && $buffer =~ /^XX/ ) {
437	20					48	$buffer = $self->_readline();
438							}
439
440	24	100				107	if ( $buffer =~ /^CO/ ) {
441							# bug#2982
442							# special : create contig as annotation
443	4					11	while ( defined ($buffer) ) {
444	4					14	$annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\$buffer);
445	4	50	66			27	if ( !$buffer \|\| $buffer !~ /^CO/ ) {
446	4					5	last;
447							}
448							}
449	4		100			13	$buffer \|\|= '';
450							}
451	24	100				98	if ($buffer !~ /^\/\//) { # if no SQ lines following CO (bug#2958)
452	20	100				67	if ( $buffer !~ /^SQ/ ) {
453	1					4	while ( defined ($_ = $self->_readline) ) {
454	0	0				0	/^SQ/ && last;
455							}
456							}
457	20					23	$seqc = "";
458	20					50	while ( defined ($_ = $self->_readline) ) {
459	2924	100				3835	m{^//} && last;
460	2906					2635	$_ = uc($_);
461	2906					21416	s/[^A-Za-z]//g;
462	2906					5507	$seqc .= $_;
463							}
464							}
465	24					88	my $seq = $self->sequence_factory->create
466							(-verbose => $self->verbose(),
467							-division => $div,
468							-seq => $seqc,
469							-desc => $desc,
470							-display_id => $name,
471							-annotation => $annotation,
472							-molecule => $mol,
473							-alphabet => $alphabet,
474							-features => \@features,
475							%params);
476	24					228	return $seq;
477							}
478
479
480
481							=head2 _write_ID_line
482
483							Title : _write_ID_line
484							Usage : $self->_write_ID_line($seq);
485							Function: Writes the EMBL Release 87 format ID line to the stream, unless
486							: there is a user-supplied ID line generation function in which
487							: case that is used instead.
488							: ( See Bio::SeqIO::embl::_id_generation_function(). )
489							Returns : nothing
490							Args : Bio::Seq object
491
492							=cut
493
494							sub _write_ID_line {
495
496	11			11		18	my ($self, $seq) = @_;
497
498	11					14	my $id_line;
499							# If there is a user-supplied ID generation function, use it.
500	11	50				35	if ( $self->_id_generation_func ) {
501	0					0	$id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
	0					0
502							}
503							# Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
504							else {
505
506							# The sequence name is supposed to be the primary accession number,
507	11					45	my $name = $seq->accession_number();
508	11	100	66			75	if ( not(defined $name) \|\| $name eq 'unknown') {
509							# but if it is not present, use the sequence ID or the empty string
510	5		100			16	$name = $seq->id() \|\| '';
511							}
512
513	11	50				46	$self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
514
515							# Use the sequence version, or default to 1.
516	11		50			44	my $version = $seq->version() \|\| 1;
517
518	11					47	my $len = $seq->length();
519
520							# Taxonomic division.
521	11					18	my $div;
522	11	100	100			157	if ( $seq->can('division') && defined($seq->division) &&
			66
523							$self->_is_valid_division($seq->division) ) {
524	2					7	$div = $seq->division();
525							} else {
526	9		50			50	$div \|\|= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
527							}
528
529	11					19	my $mol;
530							# If the molecule type is a valid EMBL type, use it.
531	11	50	100			120	if ( $seq->can('molecule')
		100	66
			100
532							&& defined($seq->molecule)
533							&& $self->_is_valid_molecule_type($seq->molecule)
534							) {
535	0					0	$mol = $seq->molecule();
536							}
537							# Otherwise, choose unassigned DNA or RNA based on the alphabet.
538							elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
539	6					16	my $alphabet =$seq->primary_seq->alphabet;
540	6	50				21	if ($alphabet eq 'dna') {
		0
		0
541	6					12	$mol ='unassigned DNA';
542							} elsif ($alphabet eq 'rna') {
543	0					0	$mol='unassigned RNA';
544							} elsif ($alphabet eq 'protein') {
545	0					0	$self->warn("Protein sequence found; EMBL is a nucleotide format.");
546	0					0	$mol='AA'; # AA is not a valid EMBL molecule type.
547							}
548							}
549
550	11					19	my $topology = 'linear';
551	11	50				38	if ($seq->is_circular) {
552	0					0	$topology = 'circular';
553							}
554
555	11		100			44	$mol \|\|= ''; # 'unassigned'; ?
556	11					68	$id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
557							}
558	11					77	$self->_print($id_line);
559							}
560
561							=head2 _is_valid_division
562
563							Title : _is_valid_division
564							Usage : $self->_is_valid_division($div)
565							Function: tests division code for validity
566							Returns : true if $div is a valid EMBL release 87 taxonomic division.
567							Args : taxonomic division code string
568
569							=cut
570
571							sub _is_valid_division {
572	2			2		6	my ($self, $division) = @_;
573
574	2					30	my %EMBL_divisions = (
575							"PHG" => 1, # Bacteriophage
576							"ENV" => 1, # Environmental Sample
577							"FUN" => 1, # Fungal
578							"HUM" => 1, # Human
579							"INV" => 1, # Invertebrate
580							"MAM" => 1, # Other Mammal
581							"VRT" => 1, # Other Vertebrate
582							"MUS" => 1, # Mus musculus
583							"PLN" => 1, # Plant
584							"PRO" => 1, # Prokaryote
585							"ROD" => 1, # Other Rodent
586							"SYN" => 1, # Synthetic
587							"UNC" => 1, # Unclassified
588							"VRL" => 1 # Viral
589							);
590
591	2					17	return exists($EMBL_divisions{$division});
592							}
593
594							=head2 _is_valid_molecule_type
595
596							Title : _is_valid_molecule_type
597							Usage : $self->_is_valid_molecule_type($mol)
598							Function: tests molecule type for validity
599							Returns : true if $mol is a valid EMBL release 87 molecule type.
600							Args : molecule type string
601
602							=cut
603
604							sub _is_valid_molecule_type {
605	2			2		6	my ($self, $moltype) = @_;
606
607	2					31	my %EMBL_molecule_types = (
608							"genomic DNA" => 1,
609							"genomic RNA" => 1,
610							"mRNA" => 1,
611							"tRNA" => 1,
612							"rRNA" => 1,
613							"snoRNA" => 1,
614							"snRNA" => 1,
615							"scRNA" => 1,
616							"pre-RNA" => 1,
617							"other RNA" => 1,
618							"other DNA" => 1,
619							"unassigned DNA" => 1,
620							"unassigned RNA" => 1
621							);
622
623	2					41	return exists($EMBL_molecule_types{$moltype});
624							}
625
626							=head2 write_seq
627
628							Title : write_seq
629							Usage : $stream->write_seq($seq)
630							Function: writes the $seq object (must be seq) to the stream
631							Returns : 1 for success and undef for error
632							Args : array of 1 to n Bio::SeqI objects
633
634
635							=cut
636
637							sub write_seq {
638	13			13	1	69	my ($self,@seqs) = @_;
639
640	13					30	foreach my $seq ( @seqs ) {
641	13	50				38	$self->throw("Attempting to write with no seq!") unless defined $seq;
642	13	100	100			116	unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
643	4	50				15	$self->warn("$seq is not a SeqI compliant sequence object!")
644							if $self->verbose >= 0;
645	4	100	66			23	unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
646	2					23	$self->throw("$seq is not a PrimarySeqI compliant sequence object!");
647							}
648							}
649	11		100			50	my $str = $seq->seq \|\| '';
650
651							# Write the ID line.
652	11					41	$self->_write_ID_line($seq);
653
654
655							# Write the accession line if present
656	11					13	my( $acc );
657							{
658	11	50	66			12	if ( my $func = $self->_ac_generation_func ) {
	11	100				38
		50
659	0					0	$acc = &{$func}($seq);
	0					0
660							} elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
661							defined($seq->accession_number) ) {
662	4					12	$acc = $seq->accession_number;
663	4					21	$acc = join("; ", $acc, $seq->get_secondary_accessions);
664							} elsif ( $seq->can('accession_number') ) {
665	7					23	$acc = $seq->accession_number;
666							}
667
668	11	50				57	if (defined $acc) {
669	11	50				52	$self->_print("AC $acc;\n",
670							"XX\n") \|\| return;
671							}
672							}
673
674							# Date lines
675	11					20	my $switch=0;
676	11	100				64	if ( $seq->can('get_dates') ) {
677	4					14	my @dates = $seq->get_dates();
678	4					9	my $ct = 1;
679	4					7	my $date_flag = 0;
680	4					21	my ($cr) = $seq->annotation->get_Annotations("creation_release");
681	4					12	my ($ur) = $seq->annotation->get_Annotations("update_release");
682	4					12	my ($uv) = $seq->annotation->get_Annotations("update_version");
683
684	4	0	33			15	unless ($cr && $ur && $ur) {
			33
685	4					6	$date_flag = 1;
686							}
687
688	4					10	foreach my $dt (@dates) {
689	0	0				0	if (!$date_flag) {
690	0	0				0	$self->_write_line_EMBL_regex("DT ","DT ",
691							$dt." (Rel. ".($cr->value()).", Created)",
692							'\s+\|$',80) if $ct == 1;
693	0	0				0	$self->_write_line_EMBL_regex("DT ","DT ",
694							$dt." (Rel. ".($ur->value()).", Last updated, Version ".($uv->value()).")",
695							'\s+\|$',80) if $ct == 2;
696							} else { # other formats?
697	0					0	$self->_write_line_EMBL_regex("DT ","DT ",
698							$dt,'\s+\|$',80);
699							}
700	0					0	$switch =1;
701	0					0	$ct++;
702							}
703	4	50				11	if ($switch == 1) {
704	0	0				0	$self->_print("XX\n") \|\| return;
705							}
706							}
707
708							# Description lines
709	11	50				44	$self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+\|$',80) \|\| return; #'
710	11	50				33	$self->_print( "XX\n") \|\| return;
711
712							# if there, write the kw line
713							{
714	11					17	my( $kw );
	11					914
715	11	50				47	if ( my $func = $self->_kw_generation_func ) {
		100
716	0					0	$kw = &{$func}($seq);
	0					0
717							} elsif ( $seq->can('keywords') ) {
718	4					19	$kw = $seq->keywords;
719							}
720	11	100				38	if (defined $kw) {
721	4	50				12	$self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+\|$', 80) \|\| return; #'
722	4	50				12	$self->_print( "XX\n") \|\| return;
723							}
724							}
725
726							# Organism lines
727
728	11	100	100			73	if ($seq->can('species') && (my $spec = $seq->species)) {
729	6					31	my @class = $spec->classification();
730	6					11	shift @class; # get rid of species name. Some embl files include
731							# the species name in the OC lines, but this seems
732							# more like an error than something we need to
733							# emulate
734	6					22	my $OS = $spec->scientific_name;
735	6	50				22	if ($spec->common_name) {
736	0					0	$OS .= ' ('.$spec->common_name.')';
737							}
738	6	50				29	$self->_print("OS $OS\n") \|\| return;
739	6					33	my $OC = join('; ', reverse(@class)) .'.';
740	6	50				22	$self->_write_line_EMBL_regex("OC ","OC ",$OC,'; \|$',80) \|\| return;
741	6	50				26	if ($spec->organelle) {
742	0	0				0	$self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; \|$',80) \|\| return;
743							}
744	6					20	my $ncbi_taxid = $spec->ncbi_taxid;
745	6	100				26	if ($ncbi_taxid) {
746	2	50				7	$self->_print("OX NCBI_TaxID=$ncbi_taxid\n") \|\| return;
747							}
748	6	50				18	$self->_print("XX\n") \|\| return;
749							}
750							# Reference lines
751	11					20	my $t = 1;
752	11	100	66			80	if ( $seq->can('annotation') && defined $seq->annotation ) {
753	9					26	foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
754	6	50				24	$self->_print( "RN [$t]\n") \|\| return;
755
756							# Having no RP line is legal, but we need both
757							# start and end for a valid location.
758	6	50				23	if ($ref->comment) {
759	0	0				0	$self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+\|$', 80) \|\| return; #'
760							}
761	6					23	my $start = $ref->start;
762	6					23	my $end = $ref->end;
763	6	50	33			33	if ($start and $end) {
		0	0
764	6	50				28	$self->_print( "RP $start-$end\n") \|\| return;
765							} elsif ($start or $end) {
766	0					0	$self->throw("Both start and end are needed for a valid RP line.".
767							" Got: start='$start' end='$end'");
768							}
769
770	6	50				21	if (my $med = $ref->medline) {
771	0	0				0	$self->_print( "RX MEDLINE; $med.\n") \|\| return;
772							}
773	6	50				17	if (my $pm = $ref->pubmed) {
774	0	0				0	$self->_print( "RX PUBMED; $pm.\n") \|\| return;
775							}
776	6					19	my $authors = $ref->authors;
777	6					100	$authors =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
778
779	6	50				31	$self->_write_line_EMBL_regex("RA ", "RA ",
780							$authors . ";",
781							'\s+\|$', 80) \|\| return; #'
782
783							# If there is no title to the reference, it appears
784							# as a single semi-colon. All titles must end in
785							# a semi-colon.
786	6		100			22	my $ref_title = $ref->title \|\| '';
787	6					52	$ref_title =~ s/[\s;]*$/;/;
788	6	50				19	$self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+\|$', 80) \|\| return; #'
789	6	50				22	$self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+\|$', 80) \|\| return; #'
790	6	50				14	$self->_print("XX\n") \|\| return;
791	6					19	$t++;
792							}
793
794							# DB Xref lines
795	9	100				30	if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
796	2					4	for my $dr (@db_xref) {
797	43					70	my $db_name = $dr->database;
798	43					66	my $prim = $dr->primary_id;
799
800	43		50			57	my $opt = $dr->optional_id \|\| '';
801	43	50				82	my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
802	43	50				55	$self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+\|$', 80) \|\| return; #'
803							}
804	2	50				6	$self->_print("XX\n") \|\| return;
805							}
806
807							# Comment lines
808	9					28	foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
809	4	50				16	$self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+\|$', 80) \|\| return; #'
810	4	50				11	$self->_print("XX\n") \|\| return;
811							}
812							}
813							# "\\s\+\\|\$"
814
815							## FEATURE TABLE
816
817	11	50				35	$self->_print("FH Key Location/Qualifiers\n") \|\| return;
818	11	50				28	$self->_print("FH\n") \|\| return;
819
820	11	100				88	my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
821	11	100				38	if ($feats[0]) {
822	6	50				19	if ( defined $self->_post_sort ) {
823							# we need to read things into an array.
824							# Process. Sort them. Print 'em
825
826	0					0	my $post_sort_func = $self->_post_sort();
827	0					0	my @fth;
828
829	0					0	foreach my $sf ( @feats ) {
830	0					0	push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
831							}
832
833	0					0	@fth = sort { &$post_sort_func($a,$b) } @fth;
	0					0
834
835	0					0	foreach my $fth ( @fth ) {
836	0	0				0	$self->_print_EMBL_FTHelper($fth) \|\| return;
837							}
838							} else {
839							# not post sorted. And so we can print as we get them.
840							# lower memory load...
841
842	6					12	foreach my $sf ( @feats ) {
843	38					95	my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
844	38					47	foreach my $fth ( @fth ) {
845	38	50				84	if ( $fth->key eq 'CONTIG') {
846	0					0	$self->_show_dna(0);
847							}
848	38	50				92	$self->_print_EMBL_FTHelper($fth) \|\| return;
849							}
850							}
851							}
852							}
853
854	11	50				35	if ( $self->_show_dna() == 0 ) {
855	0	0				0	$self->_print( "//\n") \|\| return;
856	0					0	return;
857							}
858	11	50				36	$self->_print( "XX\n") \|\| return;
859
860							# finished printing features.
861
862							# print contig if present : bug#2982
863	11	100	66			87	if ( $seq->can('annotation') && defined $seq->annotation) {
864	9					27	foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
865	0	0				0	if ($ctg->value) {
866	0	0				0	$self->_write_line_EMBL_regex("CO ","CO ", $ctg->value,
867							'[,]\|$', 80) \|\| return;
868							}
869							}
870							}
871							# print sequence lines only if sequence is present! bug#2982
872	11	100				36	if (length($str)) {
873	8					42	$str =~ tr/A-Z/a-z/;
874
875							# Count each nucleotide
876	8					25	my $alen = $str =~ tr/a/a/;
877	8					22	my $clen = $str =~ tr/c/c/;
878	8					27	my $glen = $str =~ tr/g/g/;
879	8					21	my $tlen = $str =~ tr/t/t/;
880
881	8					20	my $len = $seq->length();
882	8					17	my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
883	8	50				24	if ( $olen < 0 ) {
884	0					0	$self->warn("Weird. More atgc than bases. Problem!");
885							}
886
887	8	50				56	$self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") \|\| return;
888
889	8					16	my $nuc = 60; # Number of nucleotides per line
890	8					14	my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
891	8					34	my $out_pat = 'A11' x 6; # Pattern for packing a line
892	8					11	my $length = length($str);
893
894							# Calculate the number of nucleotides which fit on whole lines
895	8					37	my $whole = int($length / $nuc) * $nuc;
896
897							# Print the whole lines
898	8					11	my( $i );
899	8					26	for ($i = 0; $i < $whole; $i += $nuc) {
900	162					687	my $blocks = pack $out_pat,
901							unpack $whole_pat,
902							substr($str, $i, $nuc);
903	162	50				660	$self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) \|\| return;
904							}
905
906							# Print the last line
907	8	50				33	if (my $last = substr($str, $i)) {
908	8					14	my $last_len = length($last);
909	8					36	my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
910	8					72	my $blocks = pack $out_pat,
911							unpack($last_pat, $last);
912	8	50				55	$self->_print(sprintf(" $blocks%9d\n", $length)) \|\|
913							return; # Add the length to the end
914							}
915							}
916
917	11	50				32	$self->_print( "//\n") \|\| return;
918
919	11	50	33			34	$self->flush if $self->_flush_on_write && defined $self->_fh;
920							}
921	11					55	return 1;
922							}
923
924							=head2 _print_EMBL_FTHelper
925
926							Title : _print_EMBL_FTHelper
927							Usage :
928							Function: Internal function
929							Returns : 1 if writing suceeded, otherwise undef
930							Args :
931
932
933							=cut
934
935							sub _print_EMBL_FTHelper {
936	38			38		47	my ($self,$fth) = @_;
937
938	38	50	33			231	if ( ! ref $fth \|\| ! $fth->isa('Bio::SeqIO::FTHelper') ) {
939	0					0	$fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
940							}
941
942
943							#$self->_print( "FH Key Location/Qualifiers\n");
944							#$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
945							# let
946	38	50				72	if ( $fth->key eq 'CONTIG' ) {
947	0	0				0	$self->_print("XX\n") \|\| return;
948	0	0				0	$self->_write_line_EMBL_regex("CO ",
949							"CO ",$fth->loc,
950							'\,\|$',80) \|\| return; #'
951	0					0	return 1;
952							}
953	38	50				77	$self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
954							"FT ",$fth->loc,
955							'\,\|$',80) \|\| return; #'
956	38					50	foreach my $tag ( keys %{$fth->field} ) {
	38					88
957	108	50				196	if ( ! defined $fth->field->{$tag} ) {
958	0					0	next;
959							}
960	108					215	foreach my $value ( @{$fth->field->{$tag}} ) {
	108					147
961	139					175	$value =~ s/\"/\"\"/g;
962	139	50	66			407	if ($value eq "_no_value") {
		50
963	0	0				0	$self->_write_line_EMBL_regex("FT ",
964							"FT ",
965							"/$tag",'.\|$',80) \|\| return; #'
966							}
967							# there are almost 3x more quoted qualifier values and they
968							# are more common too so we take quoted ones first
969							#
970							# Long qualifiers, that will be line wrapped, are always quoted
971							elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
972	139	100				492	my $pat = $value =~ /\s+/ ? '\s+\|\-\|$' : '.\|\-\|$';
973	139	50				367	$self->_write_line_EMBL_regex("FT ",
974							"FT ",
975							"/$tag=\"$value\"",$pat,80) \|\| return;
976							} else {
977	0	0				0	$self->_write_line_EMBL_regex("FT ",
978							"FT ",
979							"/$tag=$value",'.\|$',80) \|\| return; #'
980							}
981							}
982							}
983
984	38					229	return 1;
985							}
986
987
988
989							=head2 _read_EMBL_Contig()
990
991							Title : _read_EMBL_Contig
992							Usage :
993							Function: convert CO lines into annotations
994							Returns :
995							Args :
996
997							=cut
998
999							sub _read_EMBL_Contig {
1000	4			4		5	my ($self, $buffer) = @_;
1001	4					5	my @ret;
1002	4	50				16	if ( $$buffer !~ /^CO/ ) {
1003	0					0	warn("Not parsing line '$$buffer' which maybe important");
1004							}
1005	4					11	$self->_pushback($$buffer);
1006	4					9	while ( defined ($_ = $self->_readline) ) {
1007	356	100				749	/^C/ \|\| last;
1008	353	50				756	/^CO\s+(.*)/ && do {
1009	353					861	push @ret, Bio::Annotation::SimpleValue->new( -tagname => 'contig',
1010							-value => $1);
1011							};
1012							}
1013	4					8	$$buffer = $_;
1014	4					54	return @ret;
1015
1016							}
1017							#'
1018							=head2 _read_EMBL_References
1019
1020							Title : _read_EMBL_References
1021							Usage :
1022							Function: Reads references from EMBL format. Internal function really
1023							Example :
1024							Returns :
1025							Args :
1026
1027
1028							=cut
1029
1030							sub _read_EMBL_References {
1031	99			99		107	my ($self,$buffer) = @_;
1032	99					65	my (@refs);
1033
1034							# assume things are starting with RN
1035
1036	99	50				233	if ( $$buffer !~ /^RN/ ) {
1037	0					0	warn("Not parsing line '$$buffer' which maybe important");
1038							}
1039	99					88	my $b1;
1040							my $b2;
1041	0					0	my $title;
1042	0					0	my $loc;
1043	0					0	my $au;
1044	0					0	my $med;
1045	0					0	my $pm;
1046	0					0	my $com;
1047
1048	99					169	while ( defined ($_ = $self->_readline) ) {
1049	727	100				1411	/^R/ \|\| last;
1050	628	100				965	/^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
	40					87
	40					68
1051	628	100				806	/^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
	55					96
1052	628	100				762	/^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
	10					18
1053	628	100				880	/^RA (.*)/ && do {
1054	193					357	$au = $self->_concatenate_lines($au,$1); next;
	193					320
1055							};
1056	435	100				656	/^RT (.*)/ && do {
1057	164					258	$title = $self->_concatenate_lines($title,$1); next;
	164					294
1058							};
1059	271	100				459	/^RL (.*)/ && do {
1060	156					223	$loc = $self->_concatenate_lines($loc,$1); next;
	156					231
1061							};
1062	115	100				274	/^RC (.*)/ && do {
1063	3					11	$com = $self->_concatenate_lines($com,$1); next;
	3					8
1064							};
1065							}
1066
1067	99					312	my $ref = Bio::Annotation::Reference->new();
1068	99					385	$au =~ s/;\s*$//g;
1069	99					206	$title =~ s/;\s*$//g;
1070
1071	99					216	$ref->start($b1);
1072	99					168	$ref->end($b2);
1073	99					159	$ref->authors($au);
1074	99					163	$ref->title($title);
1075	99					150	$ref->location($loc);
1076	99					144	$ref->medline($med);
1077	99					154	$ref->comment($com);
1078	99					169	$ref->pubmed($pm);
1079
1080	99					103	push(@refs,$ref);
1081	99					125	$$buffer = $_;
1082
1083	99					178	return @refs;
1084							}
1085
1086							=head2 _read_EMBL_Species
1087
1088							Title : _read_EMBL_Species
1089							Usage :
1090							Function: Reads the EMBL Organism species and classification
1091							lines.
1092							Example :
1093							Returns : A Bio::Species object
1094							Args : a reference to the current line buffer, accession number
1095
1096							=cut
1097
1098							sub _read_EMBL_Species {
1099	21			21		39	my( $self, $buffer, $acc ) = @_;
1100	21					27	my $org;
1101
1102	21					27	$_ = $$buffer;
1103	21					34	my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104	21		66			76	while (defined( $_ \|\|= $self->_readline )) {
1105	92	100				332	if (/^OS\s+(.+)/) {
		100
		50
1106	21	50				79	$sci_name .= ($sci_name) ? ' '.$1 : $1;
1107							} elsif (s/^OC\s+(.+)$//) {
1108	50					104	$class_lines .= $1;
1109							} elsif (/^OG\s+(.*)/) {
1110	0					0	$org = $1;
1111							} else {
1112	21					35	last;
1113							}
1114
1115	71					194	$_ = undef; # Empty $_ to trigger read of next line
1116							}
1117
1118							# $$buffer = $_;
1119	21					71	$self->_pushback($_);
1120
1121	21					32	$sci_name =~ s{\.$}{};
1122	21	50				103	$sci_name \|\| return;
1123
1124							# Convert data in classification lines into classification array.
1125							# only split on ';' or '.' so that classification that is 2 or more words
1126							# will still get matched, use map() to remove trailing/leading/intervening
1127							# spaces
1128	21					188	my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?
	224					308
	224					204
	224					182
	224					252
1129
1130							# do we have a genus?
1131	21					41	my $possible_genus = $class[-1];
1132	21	50				66	$possible_genus .= "\|$class[-2]" if $class[-2];
1133	21	50				626	if ($sci_name =~ /^($possible_genus)/) {
1134	21					49	$genus = $1;
1135	21					213	($species) = $sci_name =~ /^$genus\s+(.+)/;
1136							} else {
1137	0					0	$species = $sci_name;
1138							}
1139
1140							# Don't make a species object if it is "Unknown" or "None"
1141	21	50				63	if ($genus) {
1142	21	50				122	return if $genus =~ /^(Unknown\|None)$/i;
1143							}
1144
1145							# is this organism of rank species or is it lower?
1146							# (doesn't catch everything, but at least the guess isn't dangerous)
1147	21	50				98	if ($species =~ /subsp\.\|var\./) {
1148	0					0	($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.\|var\.).+)/;
1149							}
1150
1151							# sometimes things have common name in brackets, like
1152							# Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153							# name bit. Probably dangerous if real scientific species name ends in
1154							# bracketed bit.
1155	21	50				59	unless ($class[-1] eq 'Viruses') {
1156	21					81	($species, $common) = $species =~ /^(.+)\s+$(.+)$$/;
1157	21	100				82	$sci_name =~ s/\s+$.+$$// if $common;
1158							}
1159
1160							# Bio::Species array needs array in Species -> Kingdom direction
1161	21	50				52	unless ($class[-1] eq $sci_name) {
1162	21					38	push(@class, $sci_name);
1163							}
1164	21					33	@class = reverse @class;
1165
1166							# do minimal sanity checks before we hand off to Bio::Species which won't
1167							# be able to give informative throw messages if it has to throw because
1168							# of problems here
1169	21	50				44	$self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1170	21					28	my %names;
1171	21					86	foreach my $i (0..$#class) {
1172	245					187	my $name = $class[$i];
1173	245					336	$names{$name}++;
1174							# this code breaks examples like: Xenopus (Silurana) tropicalis
1175							# commenting out, see bug 3158
1176
1177							#if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178							# $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1179							#}
1180							}
1181	21					155	my $make = Bio::Species->new();
1182	21					61	$make->scientific_name($sci_name);
1183	21					64	$make->classification(@class);
1184	21	50				75	unless ($class[-1] eq 'Viruses') {
1185	21	50				100	$make->genus($genus) if $genus;
1186	21	100				60	$make->species($species) if $species;
1187	21	50				46	$make->sub_species($sub_species) if $sub_species;
1188	21	100				108	$make->common_name($common) if $common;
1189							}
1190	21	50				50	$make->organelle($org) if $org;
1191	21					113	return $make;
1192							}
1193
1194							=head2 _read_EMBL_DBLink
1195
1196							Title : _read_EMBL_DBLink
1197							Usage :
1198							Function: Reads the EMBL database cross reference ("DR") lines
1199							Example :
1200							Returns : A list of Bio::Annotation::DBLink objects
1201							Args :
1202
1203							=cut
1204
1205							sub _read_EMBL_DBLink {
1206	5			5		8	my( $self,$buffer ) = @_;
1207	5					5	my( @db_link );
1208
1209	5					5	$_ = $$buffer;
1210	5		66			24	while (defined( $_ \|\|= $self->_readline )) {
1211	30	100				125	if ( /^DR ([^\s;]+);\s([^\s;]+);?\s([^\s;]+)?\.$/) {
1212	25					68	my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213	25					76	my $link = Bio::Annotation::DBLink->new(-database => $databse,
1214							-primary_id => $prim_id,
1215							-optional_id => $sec_id);
1216
1217	25					39	push(@db_link, $link);
1218							} else {
1219	5					9	last;
1220							}
1221	25					72	$_ = undef; # Empty $_ to trigger read of next line
1222							}
1223
1224	5					13	$$buffer = $_;
1225	5					16	return @db_link;
1226							}
1227
1228							=head2 _read_EMBL_TaxID_DBLink
1229
1230							Title : _read_EMBL_TaxID_DBLink
1231							Usage :
1232							Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1233							Example :
1234							Returns : A list of Bio::Annotation::DBLink objects
1235							Args :
1236
1237							=cut
1238
1239							sub _read_EMBL_TaxID_DBLink {
1240	3			3		5	my( $self,$buffer ) = @_;
1241	3					4	my( @db_link );
1242
1243	3					4	$_ = $$buffer;
1244	3		66			14	while (defined( $_ \|\|= $self->_readline )) {
1245	4	100				17	if ( /^OX (\S+)=(\d+);$/ ) {
1246	1					4	my ($databse, $prim_id) = ($1,$2);
1247	1					15	my $link = Bio::Annotation::DBLink->new(-database => $databse,
1248							-primary_id => $prim_id,);
1249	1					3	push(@db_link, $link);
1250							} else {
1251	3					6	last;
1252							}
1253	1					7	$_ = undef; # Empty $_ to trigger read of next line
1254							}
1255
1256	3					5	$$buffer = $_;
1257	3					7	return @db_link;
1258							}
1259
1260							=head2 _filehandle
1261
1262							Title : _filehandle
1263							Usage : $obj->_filehandle($newval)
1264							Function:
1265							Example :
1266							Returns : value of _filehandle
1267							Args : newvalue (optional)
1268
1269
1270							=cut
1271
1272							sub _filehandle{
1273	0			0		0	my ($obj,$value) = @_;
1274	0	0				0	if ( defined $value) {
1275	0					0	$obj->{'_filehandle'} = $value;
1276							}
1277	0					0	return $obj->{'_filehandle'};
1278
1279							}
1280
1281							=head2 _read_FTHelper_EMBL
1282
1283							Title : _read_FTHelper_EMBL
1284							Usage : _read_FTHelper_EMBL($buffer)
1285							Function: reads the next FT key line
1286							Example :
1287							Returns : Bio::SeqIO::FTHelper object
1288							Args : filehandle and reference to a scalar
1289
1290
1291							=cut
1292
1293							sub _read_FTHelper_EMBL {
1294	256			256		220	my ($self,$buffer) = @_;
1295
1296	256					194	my ($key, # The key of the feature
1297							$loc, # The location line from the feature
1298							@qual, # An arrray of lines making up the qualifiers
1299							);
1300
1301	256	50				859	if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
		0
1302	256					420	$key = $1;
1303	256					313	$loc = $2;
1304							# Read all the lines up to the next feature
1305	256					539	while ( defined($_ = $self->_readline) ) {
1306	2038	100				9833	if (/^FT(\s+)(.+?)\s*$/) {
1307							# Lines inside features are preceeded by 19 spaces
1308							# A new feature is preceeded by 3 spaces
1309	2016	100				2899	if (length($1) > 4) {
1310							# Add to qualifiers if we're in the qualifiers
1311	1782	100				2172	if (@qual) {
		100
1312	1523					3437	push(@qual, $2);
1313							}
1314							# Start the qualifier list if it's the first qualifier
1315							elsif (substr($2, 0, 1) eq '/') {
1316	254					692	@qual = ($2);
1317							}
1318							# We're still in the location line, so append to location
1319							else {
1320	5					17	$loc .= $2;
1321							}
1322							} else {
1323							# We've reached the start of the next feature
1324	234					292	last;
1325							}
1326							} else {
1327							# We're at the end of the feature table
1328	22					40	last;
1329							}
1330							}
1331							} elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1332	0					0	$key = 'CONTIG';
1333	0					0	$loc = $1;
1334							# Read all the lines up to the next feature
1335	0					0	while ( defined($_ = $self->_readline) ) {
1336	0	0				0	if (/^CO\s+(\S+)\s*$/) {
1337	0					0	$loc .= $1;
1338							} else {
1339							# We've reached the start of the next feature
1340	0					0	last;
1341							}
1342							}
1343							} else {
1344							# No feature key
1345	0					0	return;
1346							}
1347
1348							# Put the first line of the next feature into the buffer
1349	256					268	$$buffer = $_;
1350
1351							# Make the new FTHelper object
1352	256					607	my $out = Bio::SeqIO::FTHelper->new();
1353	256					495	$out->verbose($self->verbose());
1354	256					458	$out->key($key);
1355	256					429	$out->loc($loc);
1356
1357							# Now parse and add any qualifiers. (@qual is kept
1358							# intact to provide informative error messages.)
1359	256					218	my $last_unquoted_qualifier;
1360							QUAL:
1361	256					500	for (my $i = 0; $i < @qual; $i++) {
1362	1091					895	my $data = $qual[$i];
1363	1091					779	my ( $qualifier, $value );
1364
1365	1091	50				3869	unless (( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s(.))?})) {
1366	0	0				0	if ( defined $last_unquoted_qualifier ) {
1367							# handle case of unquoted multiline - read up everything until the next qualifier
1368	0		0			0	do {
1369							# Protein sequence translations need to be joined without spaces,
1370							# other qualifiers need those.
1371	0	0				0	$value .= ' ' if $qualifier ne "translation";
1372	0					0	$value .= $data;
1373							} while defined($data = $qual[++$i]) && $data !~ m[^/];
1374	0					0	$i--;
1375	0					0	$out->field->{$last_unquoted_qualifier}->[-1] .= $value;
1376	0					0	$last_unquoted_qualifier = undef;
1377	0					0	next QUAL;
1378							} else {
1379	0					0	$self->throw("Can't see new qualifier in: $_\nfrom:\n"
1380							. join('', map "$_\n", @qual));
1381							}
1382							}
1383	1091	50				1423	$qualifier = '' if not defined $qualifier;
1384
1385	1091					760	$last_unquoted_qualifier = undef;
1386	1091	50				1081	if (defined $value) {
1387							# Do we have a quoted value?
1388	1091	100				1315	if (substr($value, 0, 1) eq '"') {
1389							# Keep adding to value until we find the trailing quote
1390							# and the quotes are balanced
1391							QUOTES:
1392	971		66			3450	while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1393	686					458	$i++;
1394	686					484	my $next = $qual[$i];
1395	686	50				817	if (!defined($next)) {
1396	0					0	$self->warn("Unbalanced quote in:\n".join("\n", @qual).
1397							"\nAdding quote to close...".
1398							"Check sequence quality!");
1399	0					0	$value .= '"';
1400	0					0	last QUOTES;
1401							}
1402
1403							# Protein sequence translations need to be joined without spaces,
1404							# other qualifiers need those.
1405	686	100				663	if ($qualifier eq "translation") {
1406	569					1436	$value .= $next;
1407							} else {
1408	117					522	$value .= " $next";
1409							}
1410							}
1411							# Trim leading and trailing quotes
1412	971					7023	$value =~ s/^"\|"$//g;
1413							# Undouble internal quotes
1414	971					1014	$value =~ s/""/"/g; #"
1415							} else {
1416	120					136	$last_unquoted_qualifier = $qualifier;
1417							}
1418							} else {
1419	0					0	$value = '_no_value';
1420							}
1421
1422							# Store the qualifier
1423	1091		100			1681	$out->field->{$qualifier} \|\|= [];
1424	1091					814	push(@{$out->field->{$qualifier}},$value);
	1091					1438
1425							}
1426
1427	256					564	return $out;
1428							}
1429
1430							=head2 _write_line_EMBL
1431
1432							Title : _write_line_EMBL
1433							Usage :
1434							Function: internal function
1435							Example :
1436							Returns : 1 if writing suceeded, else undef
1437							Args :
1438
1439
1440							=cut
1441
1442							sub _write_line_EMBL {
1443	0			0		0	my ($self,$pre1,$pre2,$line,$length) = @_;
1444
1445	0	0				0	$length \|\| $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1446	0					0	my $subl = $length - length $pre2;
1447	0					0	my $linel = length $line;
1448	0					0	my $i;
1449
1450	0					0	my $sub = substr($line,0,$length - length $pre1);
1451
1452	0	0				0	$self->_print( "$pre1$sub\n") \|\| return;
1453
1454	0					0	for ($i= ($length - length $pre1);$i < $linel;) {
1455	0					0	$sub = substr($line,$i,($subl));
1456	0	0				0	$self->_print( "$pre2$sub\n") \|\| return;
1457	0					0	$i += $subl;
1458							}
1459
1460	0					0	return 1;
1461							}
1462
1463							=head2 _write_line_EMBL_regex
1464
1465							Title : _write_line_EMBL_regex
1466							Usage :
1467							Function: internal function for writing lines of specified
1468							length, with different first and the next line
1469							left hand headers and split at specific points in the
1470							text
1471							Example :
1472							Returns : nothing
1473							Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1474
1475
1476							=cut
1477
1478							sub _write_line_EMBL_regex {
1479	263			263		363	my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1480
1481							#print STDOUT "Going to print with $line!\n";
1482
1483	263	50				390	$length \|\| $self->throw("Programming error - called write_line_EMBL_regex without length.");
1484
1485	263					294	my $subl = $length - (length $pre1) -1 ;
1486	263					205	my( @lines );
1487
1488	263					370	CHUNK: while($line) {
1489	401					558	foreach my $pat ($regex, '[,;\.\/-]\s\|'.$regex, '[,;\.\/-]\|'.$regex) {
1490	401	50				4130	if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1491	401					890	my $l = $1.$2;
1492	401	100				587	$l =~ s/#/ /g # remove word wrap protection char '#'
1493							if $pre1 eq "RA ";
1494	401					501	my $newl = $3;
1495	401					483	$line = substr($line,length($l));
1496							# be strict about not padding spaces according to
1497							# genbank format
1498	401					1056	$l =~ s/\s+$//;
1499	401	50				668	next CHUNK if ($l eq '');
1500	401					424	push(@lines, $l);
1501	401					1076	next CHUNK;
1502							}
1503							}
1504							# if we get here none of the patterns matched $subl or less chars
1505	0					0	$self->warn("trouble dissecting \"$line\"\n into chunks ".
1506							"of $subl chars or less - this tag won't print right");
1507							# insert a space char to prevent infinite loops
1508	0					0	$line = substr($line,0,$subl) . " " . substr($line,$subl);
1509							}
1510	263					293	my $s = shift @lines;
1511	263	100	50			952	($self->_print("$pre1$s\n") \|\| return) if $s;
1512	263					389	foreach my $s ( @lines ) {
1513	145	50				332	$self->_print("$pre2$s\n") \|\| return;
1514							}
1515
1516	263					692	return 1;
1517							}
1518
1519							=head2 _post_sort
1520
1521							Title : _post_sort
1522							Usage : $obj->_post_sort($newval)
1523							Function:
1524							Returns : value of _post_sort
1525							Args : newvalue (optional)
1526
1527
1528							=cut
1529
1530							sub _post_sort{
1531	6			6		12	my $obj = shift;
1532	6	50				25	if ( @_ ) {
1533	0					0	my $value = shift;
1534	0					0	$obj->{'_post_sort'} = $value;
1535							}
1536	6					17	return $obj->{'_post_sort'};
1537
1538							}
1539
1540							=head2 _show_dna
1541
1542							Title : _show_dna
1543							Usage : $obj->_show_dna($newval)
1544							Function:
1545							Returns : value of _show_dna
1546							Args : newvalue (optional)
1547
1548
1549							=cut
1550
1551							sub _show_dna{
1552	42			42		62	my $obj = shift;
1553	42	100				223	if ( @_ ) {
1554	31					45	my $value = shift;
1555	31					65	$obj->{'_show_dna'} = $value;
1556							}
1557	42					79	return $obj->{'_show_dna'};
1558
1559							}
1560
1561							=head2 _id_generation_func
1562
1563							Title : _id_generation_func
1564							Usage : $obj->_id_generation_func($newval)
1565							Function:
1566							Returns : value of _id_generation_func
1567							Args : newvalue (optional)
1568
1569
1570							=cut
1571
1572							sub _id_generation_func{
1573	11			11		13	my $obj = shift;
1574	11	50				31	if ( @_ ) {
1575	0					0	my $value = shift;
1576	0					0	$obj->{'_id_generation_func'} = $value;
1577							}
1578	11					28	return $obj->{'_id_generation_func'};
1579
1580							}
1581
1582							=head2 _ac_generation_func
1583
1584							Title : _ac_generation_func
1585							Usage : $obj->_ac_generation_func($newval)
1586							Function:
1587							Returns : value of _ac_generation_func
1588							Args : newvalue (optional)
1589
1590
1591							=cut
1592
1593							sub _ac_generation_func{
1594	11			11		18	my $obj = shift;
1595	11	50				31	if ( @_ ) {
1596	0					0	my $value = shift;
1597	0					0	$obj->{'_ac_generation_func'} = $value;
1598							}
1599	11					153	return $obj->{'_ac_generation_func'};
1600
1601							}
1602
1603							=head2 _sv_generation_func
1604
1605							Title : _sv_generation_func
1606							Usage : $obj->_sv_generation_func($newval)
1607							Function:
1608							Returns : value of _sv_generation_func
1609							Args : newvalue (optional)
1610
1611
1612							=cut
1613
1614							sub _sv_generation_func{
1615	0			0		0	my $obj = shift;
1616	0	0				0	if ( @_ ) {
1617	0					0	my $value = shift;
1618	0					0	$obj->{'_sv_generation_func'} = $value;
1619							}
1620	0					0	return $obj->{'_sv_generation_func'};
1621
1622							}
1623
1624							=head2 _kw_generation_func
1625
1626							Title : _kw_generation_func
1627							Usage : $obj->_kw_generation_func($newval)
1628							Function:
1629							Returns : value of _kw_generation_func
1630							Args : newvalue (optional)
1631
1632
1633							=cut
1634
1635							sub _kw_generation_func{
1636	11			11		18	my $obj = shift;
1637	11	50				34	if ( @_ ) {
1638	0					0	my $value = shift;
1639	0					0	$obj->{'_kw_generation_func'} = $value;
1640							}
1641	11					89	return $obj->{'_kw_generation_func'};
1642
1643							}
1644
1645							1;