| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Cared for by Chris Fields |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Copyright Chris Fields |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based |
|
17
|
|
|
|
|
|
|
data handler for GenBank/EMBL/UniProt (and other) sequence data |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# MyHandler is a GenericRichSeqHandler object. |
|
22
|
|
|
|
|
|
|
# inside a parser (driver) constructor.... |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
$self->seq_handler($handler || MyHandler->new(-format => 'genbank')); |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# in next_seq() in driver... |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
$hobj = $self->seqhandler(); |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# roll data up into hashref chunks, pass off into Handler for processing... |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
$hobj->data_handler($data); |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# or retrieve Handler methods and pass data directly to Handler methods... |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
my $hmeth = $hobj->handler_methods; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
if ($hmeth->{ $data->{NAME} }) { |
|
39
|
|
|
|
|
|
|
my $mth = $hmeth->{ $data->{NAME} }; |
|
40
|
|
|
|
|
|
|
$hobj->$mth($data); |
|
41
|
|
|
|
|
|
|
} |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
This is an experimental implementation of a sequence-based HandlerBaseI parser |
|
46
|
|
|
|
|
|
|
and may change over time. It is possible (nay, likely) that the way handler |
|
47
|
|
|
|
|
|
|
methods are set up will change over development to allow more flexibility. |
|
48
|
|
|
|
|
|
|
Release pumpkins, please do not add this to a release until the API has settled. |
|
49
|
|
|
|
|
|
|
It is also likely that write_seq() will not work properly for some data. |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Standard Developer caveats: |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Do not use for production purposes. |
|
54
|
|
|
|
|
|
|
Not responsible for destroying (your data|computer|world). |
|
55
|
|
|
|
|
|
|
Do not stare directly at GenericRichSeqHandler. |
|
56
|
|
|
|
|
|
|
If GenericRichSeqHandler glows, back slowly away and call for help. |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
Consider yourself warned! |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
This class acts as a demonstration on how to handle similar data chunks derived |
|
61
|
|
|
|
|
|
|
from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver |
|
62
|
|
|
|
|
|
|
using similar (or the same) handler methods. |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
The modules currently pass all previous tests in t/genbank.t, t/embl.t, and |
|
65
|
|
|
|
|
|
|
t/swiss.t yet all use the same handler methods (the collected tests for handlers |
|
66
|
|
|
|
|
|
|
can be found in t/Handler.t). Some tweaking of the methods themselves is |
|
67
|
|
|
|
|
|
|
probably in order over the long run to ensure that data is consistently handled |
|
68
|
|
|
|
|
|
|
for each parser. Round-trip tests are probably in order here... |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Though a Bio::Seq::SeqBuilder is employed for building sequence objects no |
|
71
|
|
|
|
|
|
|
bypassing of data based on builder slots has been implemented (yet); this is |
|
72
|
|
|
|
|
|
|
planned in the near future. |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
As a reminder: this is the current Annotation data chunk (via Data::Dumper): |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
$VAR1 = { |
|
77
|
|
|
|
|
|
|
'NAME' => 'REFERENCE', |
|
78
|
|
|
|
|
|
|
'DATA' => '1 (bases 1 to 10001)' |
|
79
|
|
|
|
|
|
|
'AUTHORS' => 'International Human Genome Sequencing Consortium.' |
|
80
|
|
|
|
|
|
|
'TITLE' => 'The DNA sequence of Homo sapiens' |
|
81
|
|
|
|
|
|
|
'JOURNAL' => 'Unpublished (2003)' |
|
82
|
|
|
|
|
|
|
}; |
|
83
|
|
|
|
|
|
|
... |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
This is the current SeqFeature data chunk (again via Data::Dumper): |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
$VAR1 = { |
|
88
|
|
|
|
|
|
|
'mol_type' => 'genomic DNA', |
|
89
|
|
|
|
|
|
|
'LOCATION' => '<1..>10001', |
|
90
|
|
|
|
|
|
|
'NAME' => 'FEATURES', |
|
91
|
|
|
|
|
|
|
'FEATURE_KEY' => 'source', |
|
92
|
|
|
|
|
|
|
'note' => 'Accession AL451081 sequenced by The Sanger Centre', |
|
93
|
|
|
|
|
|
|
'db_xref' => 'taxon:9606', |
|
94
|
|
|
|
|
|
|
'clone' => 'RP11-302I18', |
|
95
|
|
|
|
|
|
|
'organism' => 'Homo sapiens' |
|
96
|
|
|
|
|
|
|
}; |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=head1 FEEDBACK |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head2 Mailing Lists |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
|
103
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
|
104
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
|
107
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head2 Support |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
I |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
|
116
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
|
117
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
|
118
|
|
|
|
|
|
|
with code and data examples if at all possible. |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head2 Reporting Bugs |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
|
123
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
|
124
|
|
|
|
|
|
|
web: |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head1 AUTHOR - Chris Fields |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Email cjfields at bioperl dot org |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head1 APPENDIX |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. Internal |
|
135
|
|
|
|
|
|
|
methods are usually preceded with a _ |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=cut |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Let the code begin... |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
package Bio::SeqIO::Handler::GenericRichSeqHandler; |
|
142
|
1
|
|
|
1
|
|
3
|
use strict; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
21
|
|
|
143
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
16
|
|
|
144
|
|
|
|
|
|
|
|
|
145
|
1
|
|
|
1
|
|
687
|
use Bio::SeqIO::FTHelper; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
23
|
|
|
146
|
1
|
|
|
1
|
|
4
|
use Bio::Annotation::Collection; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
16
|
|
|
147
|
1
|
|
|
1
|
|
701
|
use Bio::Annotation::DBLink; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
20
|
|
|
148
|
1
|
|
|
1
|
|
627
|
use Bio::Annotation::Comment; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
20
|
|
|
149
|
1
|
|
|
1
|
|
675
|
use Bio::Annotation::Reference; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
25
|
|
|
150
|
1
|
|
|
1
|
|
4
|
use Bio::Annotation::Collection; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
15
|
|
|
151
|
1
|
|
|
1
|
|
3
|
use Bio::Annotation::SimpleValue; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
13
|
|
|
152
|
1
|
|
|
1
|
|
690
|
use Bio::Annotation::TagTree; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
24
|
|
|
153
|
1
|
|
|
1
|
|
4
|
use Bio::SeqFeature::Generic; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
15
|
|
|
154
|
1
|
|
|
1
|
|
688
|
use Bio::Species; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
21
|
|
|
155
|
1
|
|
|
1
|
|
4
|
use Bio::Taxon; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
12
|
|
|
156
|
1
|
|
|
1
|
|
3
|
use Bio::DB::Taxonomy; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
14
|
|
|
157
|
1
|
|
|
1
|
|
3
|
use Bio::Factory::FTLocationFactory; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
18
|
|
|
158
|
1
|
|
|
1
|
|
3
|
use Data::Dumper; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
44
|
|
|
159
|
|
|
|
|
|
|
|
|
160
|
1
|
|
|
1
|
|
3
|
use base qw(Bio::Root::Root Bio::HandlerBaseI); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
742
|
|
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
my %HANDLERS = ( |
|
163
|
|
|
|
|
|
|
'genbank' => { |
|
164
|
|
|
|
|
|
|
'LOCUS' => \&_genbank_locus, |
|
165
|
|
|
|
|
|
|
'DEFINITION' => \&_generic_description, |
|
166
|
|
|
|
|
|
|
'ACCESSION' => \&_generic_accession, |
|
167
|
|
|
|
|
|
|
'VERSION' => \&_generic_version, |
|
168
|
|
|
|
|
|
|
'KEYWORDS' => \&_generic_keywords, |
|
169
|
|
|
|
|
|
|
'DBSOURCE' => \&_genbank_dbsource, |
|
170
|
|
|
|
|
|
|
'DBLINK' => \&_genbank_dbsource, |
|
171
|
|
|
|
|
|
|
'SOURCE' => \&_generic_species, |
|
172
|
|
|
|
|
|
|
'REFERENCE' => \&_generic_reference, |
|
173
|
|
|
|
|
|
|
'COMMENT' => \&_generic_comment, |
|
174
|
|
|
|
|
|
|
'FEATURES' => \&_generic_seqfeatures, |
|
175
|
|
|
|
|
|
|
'BASE' => \&noop, # this is generated from scratch |
|
176
|
|
|
|
|
|
|
'ORIGIN' => \&_generic_seq, |
|
177
|
|
|
|
|
|
|
# handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT) |
|
178
|
|
|
|
|
|
|
'_DEFAULT_' => \&_generic_simplevalue, |
|
179
|
|
|
|
|
|
|
}, |
|
180
|
|
|
|
|
|
|
'embl' => { |
|
181
|
|
|
|
|
|
|
'ID' => \&_embl_id, |
|
182
|
|
|
|
|
|
|
'DT' => \&_embl_date, |
|
183
|
|
|
|
|
|
|
'DR' => \&_generic_dbsource, |
|
184
|
|
|
|
|
|
|
'SV' => \&_generic_version, |
|
185
|
|
|
|
|
|
|
'RN' => \&_generic_reference, |
|
186
|
|
|
|
|
|
|
'KW' => \&_generic_keywords, |
|
187
|
|
|
|
|
|
|
'DE' => \&_generic_description, |
|
188
|
|
|
|
|
|
|
'AC' => \&_generic_accession, |
|
189
|
|
|
|
|
|
|
#'AH' => \&noop, # TPA data not dealt with yet... |
|
190
|
|
|
|
|
|
|
#'AS' => \&noop, |
|
191
|
|
|
|
|
|
|
'SQ' => \&_generic_seq, |
|
192
|
|
|
|
|
|
|
'OS' => \&_generic_species, |
|
193
|
|
|
|
|
|
|
'CC' => \&_generic_comment, |
|
194
|
|
|
|
|
|
|
'FT' => \&_generic_seqfeatures, |
|
195
|
|
|
|
|
|
|
# handles anything else (WGS, TPA, ANN...) |
|
196
|
|
|
|
|
|
|
'_DEFAULT_' => \&_generic_simplevalue, |
|
197
|
|
|
|
|
|
|
}, |
|
198
|
|
|
|
|
|
|
'swiss' => { |
|
199
|
|
|
|
|
|
|
'ID' => \&_swiss_id, |
|
200
|
|
|
|
|
|
|
'DT' => \&_swiss_date, |
|
201
|
|
|
|
|
|
|
'GN' => \&_swiss_genename, |
|
202
|
|
|
|
|
|
|
'DR' => \&_generic_dbsource, |
|
203
|
|
|
|
|
|
|
'RN' => \&_generic_reference, |
|
204
|
|
|
|
|
|
|
'KW' => \&_generic_keywords, |
|
205
|
|
|
|
|
|
|
'DE' => \&_generic_description, |
|
206
|
|
|
|
|
|
|
'AC' => \&_generic_accession, |
|
207
|
|
|
|
|
|
|
'SQ' => \&_generic_seq, |
|
208
|
|
|
|
|
|
|
'OS' => \&_generic_species, |
|
209
|
|
|
|
|
|
|
'CC' => \&_generic_comment, |
|
210
|
|
|
|
|
|
|
'FT' => \&_generic_seqfeatures, |
|
211
|
|
|
|
|
|
|
# handles anything else, though I don't know what... |
|
212
|
|
|
|
|
|
|
'_DEFAULT_' => \&_generic_simplevalue, |
|
213
|
|
|
|
|
|
|
}, |
|
214
|
|
|
|
|
|
|
); |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# can we do this generically? Seems like a lot of trouble... |
|
217
|
|
|
|
|
|
|
my %DBSOURCE = map {$_ => 1} qw( |
|
218
|
|
|
|
|
|
|
EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs |
|
219
|
|
|
|
|
|
|
TIGR GO InterPro Pfam PROSITE SGD GermOnline |
|
220
|
|
|
|
|
|
|
HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG |
|
221
|
|
|
|
|
|
|
H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR |
|
222
|
|
|
|
|
|
|
MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome |
|
223
|
|
|
|
|
|
|
UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE |
|
224
|
|
|
|
|
|
|
PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE |
|
225
|
|
|
|
|
|
|
Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase |
|
226
|
|
|
|
|
|
|
TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP |
|
227
|
|
|
|
|
|
|
PhotoList Gramene WormBase WormPep Genew ZFIN |
|
228
|
|
|
|
|
|
|
PeroxiBase MaizeDB TAIR DrugBank REBASE HPA |
|
229
|
|
|
|
|
|
|
swissprot GenBank GenPept REFSEQ embl PDB UniProtKB); |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES); |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
our %VALID_ALPHABET = ( |
|
234
|
|
|
|
|
|
|
'bp' => 'dna', |
|
235
|
|
|
|
|
|
|
'aa' => 'protein', |
|
236
|
|
|
|
|
|
|
'rc' => '' # rc = release candidate; file has no sequences |
|
237
|
|
|
|
|
|
|
); |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=head2 new |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
Title : new |
|
242
|
|
|
|
|
|
|
Usage : |
|
243
|
|
|
|
|
|
|
Function: |
|
244
|
|
|
|
|
|
|
Returns : |
|
245
|
|
|
|
|
|
|
Args : -format Sequence format to be mapped for handler methods |
|
246
|
|
|
|
|
|
|
-builder Bio::Seq::SeqBuilder object (normally defined in |
|
247
|
|
|
|
|
|
|
SequenceStreamI object implementation constructor) |
|
248
|
|
|
|
|
|
|
Throws : On undefined '-format' sequence format parameter |
|
249
|
|
|
|
|
|
|
Note : Still under heavy development |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=cut |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
sub new { |
|
254
|
42
|
|
|
42
|
1
|
123
|
my ($class, @args) = @_; |
|
255
|
42
|
|
|
|
|
170
|
my $self = $class->SUPER::new(@args); |
|
256
|
42
|
|
|
|
|
154
|
$self = {@args}; |
|
257
|
42
|
|
|
|
|
209
|
bless $self,$class; |
|
258
|
42
|
|
|
|
|
188
|
my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args); |
|
259
|
42
|
50
|
|
|
|
155
|
$self->throw("Must define sequence record format") if !$format; |
|
260
|
42
|
|
|
|
|
170
|
$self->format($format); |
|
261
|
42
|
|
|
|
|
128
|
$self->handler_methods(); |
|
262
|
42
|
50
|
|
|
|
183
|
$builder && $self->seqbuilder($builder); |
|
263
|
42
|
|
|
|
|
138
|
$self->location_factory(); |
|
264
|
42
|
|
|
|
|
210
|
return $self; |
|
265
|
|
|
|
|
|
|
} |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=head1 L implementing methods |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head2 handler_methods |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Title : handler_methods |
|
272
|
|
|
|
|
|
|
Usage : $handler->handler_methods('GenBank') |
|
273
|
|
|
|
|
|
|
%handlers = $handler->handler_methods(); |
|
274
|
|
|
|
|
|
|
Function: Retrieve the handler methods used for the current format() in |
|
275
|
|
|
|
|
|
|
the handler. This assumes the handler methods are already |
|
276
|
|
|
|
|
|
|
described in the HandlerI-implementing class. |
|
277
|
|
|
|
|
|
|
Returns : a hash reference with the data type handled and the code ref |
|
278
|
|
|
|
|
|
|
associated with it. |
|
279
|
|
|
|
|
|
|
Args : [optional] String representing the sequence format. If set here |
|
280
|
|
|
|
|
|
|
this will also set sequence_format() |
|
281
|
|
|
|
|
|
|
Throws : On unimplemented sequence format in %HANDLERS |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
sub handler_methods { |
|
286
|
75
|
|
|
75
|
1
|
103
|
my $self = shift; |
|
287
|
75
|
100
|
|
|
|
187
|
if (!($self->{'handlers'})) { |
|
288
|
|
|
|
|
|
|
$self->throw("No handlers defined for seqformat ",$self->format) |
|
289
|
42
|
50
|
|
|
|
119
|
unless exists $HANDLERS{$self->format}; |
|
290
|
42
|
|
|
|
|
113
|
$self->{'handlers'} = $HANDLERS{$self->format}; |
|
291
|
|
|
|
|
|
|
} |
|
292
|
75
|
|
|
|
|
143
|
return ($self->{'handlers'}); |
|
293
|
|
|
|
|
|
|
} |
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=head2 data_handler |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
Title : data_handler |
|
298
|
|
|
|
|
|
|
Usage : $handler->data_handler($data) |
|
299
|
|
|
|
|
|
|
Function: Centralized method which accepts all data chunks, then distributes |
|
300
|
|
|
|
|
|
|
to the appropriate methods for processing based on the chunk name |
|
301
|
|
|
|
|
|
|
from within the HandlerBaseI object. |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
One can also use |
|
304
|
|
|
|
|
|
|
Returns : None |
|
305
|
|
|
|
|
|
|
Args : an hash ref containing a data chunk. |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=cut |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
sub data_handler { |
|
310
|
1646
|
|
|
1646
|
1
|
1977
|
my ($self, $data) = @_; |
|
311
|
1646
|
|
33
|
|
|
3629
|
my $nm = $data->{NAME} || $self->throw("No name tag defined!"); |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# this should handle data on the fly w/o caching; any caching should be |
|
314
|
|
|
|
|
|
|
# done in the driver! |
|
315
|
|
|
|
|
|
|
my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) : |
|
316
|
1646
|
50
|
|
|
|
4123
|
(exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) : |
|
|
|
100
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
undef; |
|
318
|
1646
|
50
|
|
|
|
2620
|
if (!$method) { |
|
319
|
0
|
|
|
|
|
0
|
$self->debug("No handler defined for $nm\n"); |
|
320
|
0
|
|
|
|
|
0
|
return; |
|
321
|
|
|
|
|
|
|
}; |
|
322
|
1646
|
|
|
|
|
3156
|
$self->$method($data); |
|
323
|
|
|
|
|
|
|
} |
|
324
|
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=head2 reset_parameters |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
Title : reset_parameters |
|
328
|
|
|
|
|
|
|
Usage : $handler->reset_parameters() |
|
329
|
|
|
|
|
|
|
Function: Resets the internal cache of data (normally object parameters for |
|
330
|
|
|
|
|
|
|
a builder or factory) |
|
331
|
|
|
|
|
|
|
Returns : None |
|
332
|
|
|
|
|
|
|
Args : None |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=cut |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
sub reset_parameters { |
|
337
|
58
|
|
|
58
|
1
|
115
|
my $self = shift; |
|
338
|
58
|
|
|
|
|
150
|
$self->{'_params'} = undef; |
|
339
|
|
|
|
|
|
|
} |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head2 format |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
Title : format |
|
344
|
|
|
|
|
|
|
Usage : $handler->format('GenBank') |
|
345
|
|
|
|
|
|
|
Function: Get/Set the format for the report/record being parsed. This can be |
|
346
|
|
|
|
|
|
|
used to set handlers in classes which are capable of processing |
|
347
|
|
|
|
|
|
|
similar data chunks from multiple driver modules. |
|
348
|
|
|
|
|
|
|
Returns : String with the sequence format |
|
349
|
|
|
|
|
|
|
Args : [optional] String with the sequence format |
|
350
|
|
|
|
|
|
|
Note : The format may be used to set the handlers (as in the |
|
351
|
|
|
|
|
|
|
current GenericRichSeqHandler implementation) |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=cut |
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
sub format { |
|
356
|
1301
|
|
|
1301
|
1
|
1351
|
my $self = shift; |
|
357
|
1301
|
100
|
|
|
|
2312
|
return $self->{'_seqformat'} = lc shift if @_; |
|
358
|
1259
|
|
|
|
|
2782
|
return $self->{'_seqformat'}; |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=head2 get_params |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Title : get_params |
|
364
|
|
|
|
|
|
|
Usage : $handler->get_params('-species') |
|
365
|
|
|
|
|
|
|
Function: Convenience method used to retrieve the specified |
|
366
|
|
|
|
|
|
|
parameters from the internal parameter cache |
|
367
|
|
|
|
|
|
|
Returns : Hash ref containing parameters requested and data as |
|
368
|
|
|
|
|
|
|
key-value pairs. Note that some parameter values may be |
|
369
|
|
|
|
|
|
|
objects, arrays, etc. |
|
370
|
|
|
|
|
|
|
Args : List (array) representing the parameters requested |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=cut |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
sub get_params { |
|
375
|
818
|
|
|
818
|
1
|
1299
|
my ($self, @ids) = @_; |
|
376
|
818
|
|
|
|
|
668
|
my %data; |
|
377
|
818
|
|
|
|
|
1389
|
for my $id (@ids) { |
|
378
|
818
|
50
|
|
|
|
2085
|
if (!index($id, '-')==0) { |
|
379
|
818
|
|
|
|
|
1800
|
$id = '-'.$id ; |
|
380
|
|
|
|
|
|
|
} |
|
381
|
818
|
100
|
|
|
|
3709
|
$data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id}); |
|
382
|
|
|
|
|
|
|
} |
|
383
|
818
|
|
|
|
|
1870
|
return \%data; |
|
384
|
|
|
|
|
|
|
} |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head2 set_params |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Title : set_params |
|
389
|
|
|
|
|
|
|
Usage : $handler->set_param({'-species') |
|
390
|
|
|
|
|
|
|
Function: Convenience method used to set specific parameters |
|
391
|
|
|
|
|
|
|
Returns : None |
|
392
|
|
|
|
|
|
|
Args : Hash ref containing the data to be passed as key-value pairs |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=cut |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
sub set_params { |
|
397
|
0
|
|
|
0
|
1
|
0
|
shift->throw('Not implemented yet!'); |
|
398
|
|
|
|
|
|
|
} |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=head1 Methods unique to this implementation |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=head2 seqbuilder |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
Title : seqbuilder |
|
405
|
|
|
|
|
|
|
Usage : |
|
406
|
|
|
|
|
|
|
Function: |
|
407
|
|
|
|
|
|
|
Returns : |
|
408
|
|
|
|
|
|
|
Args : |
|
409
|
|
|
|
|
|
|
Throws : |
|
410
|
|
|
|
|
|
|
Note : |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=cut |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
sub seqbuilder { |
|
415
|
104
|
|
|
104
|
1
|
138
|
my $self = shift; |
|
416
|
104
|
100
|
|
|
|
370
|
return $self->{'_seqbuilder'} = shift if @_; |
|
417
|
62
|
|
|
|
|
137
|
return $self->{'_seqbuilder'}; |
|
418
|
|
|
|
|
|
|
} |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head2 build_sequence |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
Title : build_sequence |
|
423
|
|
|
|
|
|
|
Usage : |
|
424
|
|
|
|
|
|
|
Function: |
|
425
|
|
|
|
|
|
|
Returns : |
|
426
|
|
|
|
|
|
|
Args : |
|
427
|
|
|
|
|
|
|
Throws : |
|
428
|
|
|
|
|
|
|
Note : |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=cut |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
sub build_sequence { |
|
433
|
62
|
|
|
62
|
1
|
120
|
my $self = shift; |
|
434
|
62
|
|
|
|
|
228
|
my $builder = $self->seqbuilder(); |
|
435
|
62
|
|
|
|
|
155
|
my $seq; |
|
436
|
62
|
100
|
|
|
|
189
|
if (defined($self->{'_params'})) { |
|
437
|
58
|
|
|
|
|
75
|
$builder->add_slot_value(%{ $self->{'_params'} }); |
|
|
58
|
|
|
|
|
783
|
|
|
438
|
58
|
|
|
|
|
243
|
$seq = $builder->make_object(); |
|
439
|
58
|
|
|
|
|
229
|
$self->reset_parameters; |
|
440
|
|
|
|
|
|
|
} |
|
441
|
62
|
100
|
|
|
|
1073
|
return $seq if $seq; |
|
442
|
4
|
|
|
|
|
22
|
return 0; |
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
=head2 location_factory |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
Title : location_factory |
|
448
|
|
|
|
|
|
|
Usage : |
|
449
|
|
|
|
|
|
|
Function: |
|
450
|
|
|
|
|
|
|
Returns : |
|
451
|
|
|
|
|
|
|
Args : |
|
452
|
|
|
|
|
|
|
Throws : |
|
453
|
|
|
|
|
|
|
Note : |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=cut |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
sub location_factory { |
|
458
|
42
|
|
|
42
|
1
|
69
|
my ($self, $factory) = @_; |
|
459
|
42
|
50
|
|
|
|
193
|
if ($factory) { |
|
|
|
50
|
|
|
|
|
|
|
460
|
0
|
0
|
0
|
|
|
0
|
$self->throw("Must have a Bio::Factory::LocationFactoryI when ". |
|
461
|
|
|
|
|
|
|
"explicitly setting factory()") unless |
|
462
|
|
|
|
|
|
|
(ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI')); |
|
463
|
0
|
|
|
|
|
0
|
$self->{'_locfactory'} = $factory; |
|
464
|
|
|
|
|
|
|
} elsif (!defined($self->{'_locfactory'})) { |
|
465
|
42
|
|
|
|
|
250
|
$self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new() |
|
466
|
|
|
|
|
|
|
} |
|
467
|
42
|
|
|
|
|
71
|
return $self->{'_locfactory'}; |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
=head2 annotation_collection |
|
471
|
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
Title : annotation_collection |
|
473
|
|
|
|
|
|
|
Usage : |
|
474
|
|
|
|
|
|
|
Function: |
|
475
|
|
|
|
|
|
|
Returns : |
|
476
|
|
|
|
|
|
|
Args : |
|
477
|
|
|
|
|
|
|
Throws : |
|
478
|
|
|
|
|
|
|
Note : |
|
479
|
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=cut |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
sub annotation_collection { |
|
483
|
746
|
|
|
746
|
1
|
957
|
my ($self, $coll) = @_; |
|
484
|
746
|
50
|
|
|
|
1997
|
if ($coll) { |
|
|
|
100
|
|
|
|
|
|
|
485
|
0
|
0
|
0
|
|
|
0
|
$self->throw("Must have Bio::AnnotationCollectionI ". |
|
486
|
|
|
|
|
|
|
"when explicitly setting collection()") |
|
487
|
|
|
|
|
|
|
unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI')); |
|
488
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-annotation'} = $coll; |
|
489
|
|
|
|
|
|
|
} elsif (!exists($self->{'_params'}->{'-annotation'})) { |
|
490
|
57
|
|
|
|
|
538
|
$self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new() |
|
491
|
|
|
|
|
|
|
} |
|
492
|
746
|
|
|
|
|
2578
|
return $self->{'_params'}->{'-annotation'}; |
|
493
|
|
|
|
|
|
|
} |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
####################### SEQUENCE HANDLERS ####################### |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# any sequence data |
|
498
|
|
|
|
|
|
|
sub _generic_seq { |
|
499
|
53
|
|
|
53
|
|
87
|
my ($self, $data) = @_; |
|
500
|
53
|
|
|
|
|
409
|
$self->{'_params'}->{'-seq'} = $data->{DATA}; |
|
501
|
|
|
|
|
|
|
} |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
####################### RAW DATA HANDLERS ####################### |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# GenBank LOCUS line |
|
506
|
|
|
|
|
|
|
sub _genbank_locus { |
|
507
|
31
|
|
|
31
|
|
51
|
my ($self, $data) = @_; |
|
508
|
31
|
|
|
|
|
266
|
my (@tokens) = split m{\s+}, $data->{DATA}; |
|
509
|
31
|
|
|
|
|
57
|
my $display_id = shift @tokens; |
|
510
|
31
|
|
|
|
|
120
|
$self->{'_params'}->{'-display_id'} = $display_id; |
|
511
|
31
|
|
|
|
|
58
|
my $seqlength = shift @tokens; |
|
512
|
31
|
50
|
|
|
|
92
|
if (exists $VALID_ALPHABET{$seqlength}) { |
|
513
|
|
|
|
|
|
|
# moved one token too far. No locus name? |
|
514
|
|
|
|
|
|
|
$self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}. |
|
515
|
0
|
|
|
|
|
0
|
"] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'}); |
|
516
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'}; |
|
517
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-display_id'} = 'unknown'; |
|
518
|
|
|
|
|
|
|
# add token back... |
|
519
|
0
|
|
|
|
|
0
|
unshift @tokens, $seqlength; |
|
520
|
|
|
|
|
|
|
} else { |
|
521
|
31
|
|
|
|
|
91
|
$self->{'_params'}->{'-length'} = $seqlength; |
|
522
|
|
|
|
|
|
|
} |
|
523
|
31
|
|
|
|
|
108
|
my $alphabet = lc(shift @tokens); |
|
524
|
|
|
|
|
|
|
$self->{'_params'}->{'-alphabet'} = |
|
525
|
31
|
50
|
|
|
|
107
|
(exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} : |
|
526
|
|
|
|
|
|
|
$self->warn("Unknown alphabet: $alphabet"); |
|
527
|
31
|
50
|
66
|
|
|
157
|
if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) { |
|
528
|
31
|
|
|
|
|
71
|
$self->{'_params'}->{'-molecule'} = shift(@tokens); |
|
529
|
31
|
|
|
|
|
67
|
my $circ = shift(@tokens); |
|
530
|
31
|
100
|
|
|
|
85
|
if ($circ eq 'circular') { |
|
531
|
2
|
|
|
|
|
5
|
$self->{'_params'}->{'-is_circular'} = 1; |
|
532
|
2
|
|
|
|
|
4
|
$self->{'_params'}->{'-division'} = shift(@tokens); |
|
533
|
|
|
|
|
|
|
} else { |
|
534
|
|
|
|
|
|
|
# 'linear' or 'circular' may actually be omitted altogether |
|
535
|
29
|
100
|
|
|
|
98
|
$self->{'_params'}->{'-division'} = |
|
536
|
|
|
|
|
|
|
(CORE::length($circ) == 3 ) ? $circ : shift(@tokens); |
|
537
|
|
|
|
|
|
|
} |
|
538
|
|
|
|
|
|
|
} else { |
|
539
|
0
|
0
|
|
|
|
0
|
$self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa'); |
|
540
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-division'} = shift(@tokens); |
|
541
|
|
|
|
|
|
|
} |
|
542
|
31
|
|
|
|
|
170
|
my $date = join(' ', @tokens); |
|
543
|
|
|
|
|
|
|
# maybe use Date::Time for dates? |
|
544
|
31
|
100
|
66
|
|
|
346
|
if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) { |
|
545
|
|
|
|
|
|
|
|
|
546
|
30
|
50
|
|
|
|
76
|
if( length($date) < 11 ) { |
|
547
|
|
|
|
|
|
|
# improperly formatted date |
|
548
|
|
|
|
|
|
|
# But we'll be nice and fix it for them |
|
549
|
0
|
|
|
|
|
0
|
my ($d,$m,$y) = ($2,$3,$4); |
|
550
|
0
|
0
|
|
|
|
0
|
if( length($d) == 1 ) { |
|
551
|
0
|
|
|
|
|
0
|
$d = "0$d"; |
|
552
|
|
|
|
|
|
|
} |
|
553
|
|
|
|
|
|
|
# guess the century here |
|
554
|
0
|
0
|
|
|
|
0
|
if( length($y) == 2 ) { |
|
555
|
0
|
0
|
|
|
|
0
|
if( $y > 60 ) { # arbitrarily guess that '60' means 1960 |
|
556
|
0
|
|
|
|
|
0
|
$y = "19$y"; |
|
557
|
|
|
|
|
|
|
} else { |
|
558
|
0
|
|
|
|
|
0
|
$y = "20$y"; |
|
559
|
|
|
|
|
|
|
} |
|
560
|
0
|
|
|
|
|
0
|
$self->warn("Date was malformed, guessing the century for $date to be $y\n"); |
|
561
|
|
|
|
|
|
|
} |
|
562
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)]; |
|
563
|
|
|
|
|
|
|
} else { |
|
564
|
30
|
|
|
|
|
144
|
$self->{'_params'}->{'-dates'} = [$date]; |
|
565
|
|
|
|
|
|
|
} |
|
566
|
|
|
|
|
|
|
} |
|
567
|
|
|
|
|
|
|
} |
|
568
|
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
# EMBL ID line |
|
570
|
|
|
|
|
|
|
sub _embl_id { |
|
571
|
10
|
|
|
10
|
|
22
|
my ($self, $data) = @_; |
|
572
|
10
|
|
|
|
|
20
|
my $alphabet; |
|
573
|
10
|
|
|
|
|
19
|
my ($name, $sv, $topology, $mol, $div); |
|
574
|
10
|
|
|
|
|
32
|
my $line = $data->{DATA}; |
|
575
|
|
|
|
|
|
|
#$self->debug("$line\n"); |
|
576
|
10
|
|
|
|
|
35
|
my ($idtype) = $line =~ tr/;/;/; |
|
577
|
10
|
100
|
|
|
|
44
|
if ( $idtype == 6) { # New style headers contain exactly six semicolons. |
|
|
|
100
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# New style header (EMBL Release >= 87, after June 2006) |
|
579
|
1
|
|
|
|
|
4
|
my $topology; |
|
580
|
|
|
|
|
|
|
my $sv; |
|
581
|
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP. |
|
583
|
|
|
|
|
|
|
# This regexp comes from the new2old.pl conversion script, from EBI |
|
584
|
1
|
50
|
|
|
|
13
|
if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) { |
|
585
|
1
|
|
|
|
|
11
|
($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6); |
|
586
|
|
|
|
|
|
|
} else { |
|
587
|
0
|
|
|
|
|
0
|
$self->throw("Unrecognized EMBL ID line:[$line]"); |
|
588
|
|
|
|
|
|
|
} |
|
589
|
1
|
50
|
|
|
|
6
|
if (defined($sv)) { |
|
590
|
1
|
|
|
|
|
7
|
$self->{'_params'}->{'-seq_version'} = $sv; |
|
591
|
1
|
|
|
|
|
3
|
$self->{'_params'}->{'-version'} = $sv; |
|
592
|
|
|
|
|
|
|
} |
|
593
|
|
|
|
|
|
|
|
|
594
|
1
|
50
|
|
|
|
5
|
if ($topology eq "circular") { |
|
595
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-is_circular'} = 1; |
|
596
|
|
|
|
|
|
|
} |
|
597
|
|
|
|
|
|
|
|
|
598
|
1
|
50
|
|
|
|
4
|
if (defined $mol ) { |
|
599
|
1
|
50
|
|
|
|
8
|
if ($mol =~ /DNA/) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
600
|
0
|
|
|
|
|
0
|
$alphabet='dna'; |
|
601
|
|
|
|
|
|
|
} |
|
602
|
|
|
|
|
|
|
elsif ($mol =~ /RNA/) { |
|
603
|
1
|
|
|
|
|
3
|
$alphabet='rna'; |
|
604
|
|
|
|
|
|
|
} |
|
605
|
|
|
|
|
|
|
elsif ($mol =~ /AA/) { |
|
606
|
0
|
|
|
|
|
0
|
$alphabet='protein'; |
|
607
|
|
|
|
|
|
|
} |
|
608
|
|
|
|
|
|
|
} |
|
609
|
|
|
|
|
|
|
} elsif ($idtype) { # has internal ';' |
|
610
|
|
|
|
|
|
|
# Old style header (EMBL Release < 87, before June 2006) |
|
611
|
8
|
50
|
|
|
|
97
|
if ($line =~ m{^(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;}) { |
|
612
|
8
|
|
|
|
|
51
|
($name, $mol, $div) = ($1, $2, $3); |
|
613
|
|
|
|
|
|
|
#$self->debug("[$name][$mol][$div]"); |
|
614
|
|
|
|
|
|
|
} |
|
615
|
|
|
|
|
|
|
|
|
616
|
8
|
50
|
|
|
|
27
|
if($mol) { |
|
617
|
8
|
50
|
|
|
|
30
|
if ( $mol =~ m{circular} ) { |
|
618
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-is_circular'} = 1; |
|
619
|
0
|
|
|
|
|
0
|
$mol =~ s{circular }{}; |
|
620
|
|
|
|
|
|
|
} |
|
621
|
8
|
50
|
|
|
|
26
|
if (defined $mol ) { |
|
622
|
8
|
100
|
|
|
|
46
|
if ($mol =~ /DNA/) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
623
|
7
|
|
|
|
|
26
|
$alphabet='dna'; |
|
624
|
|
|
|
|
|
|
} |
|
625
|
|
|
|
|
|
|
elsif ($mol =~ /RNA/) { |
|
626
|
1
|
|
|
|
|
3
|
$alphabet='rna'; |
|
627
|
|
|
|
|
|
|
} |
|
628
|
|
|
|
|
|
|
elsif ($mol =~ /AA/) { |
|
629
|
0
|
|
|
|
|
0
|
$alphabet='protein'; |
|
630
|
|
|
|
|
|
|
} |
|
631
|
|
|
|
|
|
|
} |
|
632
|
|
|
|
|
|
|
} |
|
633
|
|
|
|
|
|
|
} else { |
|
634
|
1
|
|
|
|
|
5
|
$name = $data->{DATA}; |
|
635
|
|
|
|
|
|
|
} |
|
636
|
10
|
50
|
33
|
|
|
79
|
unless( defined $name && length($name) ) { |
|
637
|
0
|
|
|
|
|
0
|
$name = "unknown_id"; |
|
638
|
|
|
|
|
|
|
} |
|
639
|
10
|
|
|
|
|
55
|
$self->{'_params'}->{'-display_id'} = $name; |
|
640
|
10
|
|
|
|
|
27
|
$self->{'_params'}->{'-alphabet'} = $alphabet; |
|
641
|
10
|
100
|
|
|
|
45
|
$self->{'_params'}->{'-division'} = $div if $div; |
|
642
|
10
|
100
|
|
|
|
65
|
$self->{'_params'}->{'-molecule'} = $mol if $mol; |
|
643
|
|
|
|
|
|
|
} |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
# UniProt/SwissProt ID line |
|
646
|
|
|
|
|
|
|
sub _swiss_id { |
|
647
|
17
|
|
|
17
|
|
29
|
my ($self, $data) = @_; |
|
648
|
17
|
|
|
|
|
27
|
my ($name, $seq_div); |
|
649
|
17
|
50
|
|
|
|
181
|
if($data->{DATA} =~ m{^ |
|
650
|
|
|
|
|
|
|
(\S+) \s+ # $1 entryname |
|
651
|
|
|
|
|
|
|
([^\s;]+); \s+ # $2 DataClass |
|
652
|
|
|
|
|
|
|
(?:PRT;)? \s+ # Molecule Type (optional) |
|
653
|
|
|
|
|
|
|
[0-9]+[ ]AA \. # Sequencelength (capture?) |
|
654
|
|
|
|
|
|
|
$ |
|
655
|
|
|
|
|
|
|
}ox ) { |
|
656
|
17
|
|
|
|
|
78
|
($name, $seq_div) = ($1, $2); |
|
657
|
17
|
50
|
100
|
|
|
166
|
$self->{'_params'}->{'-namespace'} = |
|
|
|
100
|
66
|
|
|
|
|
|
658
|
|
|
|
|
|
|
($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ? 'Swiss-Prot' : |
|
659
|
|
|
|
|
|
|
($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ? 'TrEMBL' : |
|
660
|
|
|
|
|
|
|
$seq_div; |
|
661
|
|
|
|
|
|
|
# we shouldn't be setting the division, but for now... |
|
662
|
17
|
|
|
|
|
76
|
my ($junk, $division) = split q(_), $name; |
|
663
|
17
|
|
|
|
|
57
|
$self->{'_params'}->{'-division'} = $division; |
|
664
|
17
|
|
|
|
|
45
|
$self->{'_params'}->{'-alphabet'} = 'protein'; |
|
665
|
|
|
|
|
|
|
# this is important to have the id for display in e.g. FTHelper, otherwise |
|
666
|
|
|
|
|
|
|
# you won't know which entry caused an error |
|
667
|
17
|
|
|
|
|
72
|
$self->{'_params'}->{'-display_id'} = $name; |
|
668
|
|
|
|
|
|
|
} else { |
|
669
|
0
|
|
|
|
|
0
|
$self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]"); |
|
670
|
|
|
|
|
|
|
} |
|
671
|
|
|
|
|
|
|
} |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# UniProt/SwissProt GN line |
|
674
|
|
|
|
|
|
|
sub _swiss_genename { |
|
675
|
17
|
|
|
17
|
|
34
|
my ($self, $data) = @_; |
|
676
|
|
|
|
|
|
|
#$self->debug(Dumper($data)); |
|
677
|
17
|
|
|
|
|
45
|
my $genename = $data->{DATA}; |
|
678
|
17
|
|
|
|
|
26
|
my $gn; |
|
679
|
17
|
50
|
|
|
|
51
|
if ($genename) { |
|
680
|
17
|
|
|
|
|
24
|
my @stags; |
|
681
|
17
|
100
|
|
|
|
86
|
if ($genename =~ /\w=\w/) { |
|
682
|
|
|
|
|
|
|
# new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP) |
|
683
|
10
|
|
|
|
|
45
|
for my $n (split(m{\s+and\s+},$genename)) { |
|
684
|
12
|
|
|
|
|
30
|
my @genenames; |
|
685
|
12
|
|
|
|
|
62
|
for my $section (split(m{\s*;\s*},$n)) { |
|
686
|
15
|
|
|
|
|
42
|
my ($tag, $rest) = split("=",$section); |
|
687
|
15
|
|
50
|
|
|
42
|
$rest ||= ''; |
|
688
|
15
|
|
|
|
|
33
|
for my $val (split(m{\s*,\s*},$rest)) { |
|
689
|
19
|
|
|
|
|
63
|
push @genenames, [$tag => $val]; |
|
690
|
|
|
|
|
|
|
} |
|
691
|
|
|
|
|
|
|
} |
|
692
|
12
|
|
|
|
|
42
|
push @stags, ['gene_name' => \@genenames]; |
|
693
|
|
|
|
|
|
|
} |
|
694
|
|
|
|
|
|
|
} else { |
|
695
|
|
|
|
|
|
|
# old format |
|
696
|
7
|
|
|
|
|
37
|
for my $section (split(/ AND /, $genename)) { |
|
697
|
9
|
|
|
|
|
15
|
my @genenames; |
|
698
|
9
|
|
|
|
|
49
|
$section =~ s/[\(\)\.]//g; |
|
699
|
9
|
|
|
|
|
49
|
my @names = split(m{\s+OR\s+}, $section); |
|
700
|
9
|
|
|
|
|
34
|
push @genenames, ['Name' => shift @names]; |
|
701
|
9
|
|
|
|
|
24
|
push @genenames, map {['Synonyms' => $_]} @names; |
|
|
10
|
|
|
|
|
31
|
|
|
702
|
9
|
|
|
|
|
44
|
push @stags, ['gene_name' => \@genenames] |
|
703
|
|
|
|
|
|
|
} |
|
704
|
|
|
|
|
|
|
} #use Data::Dumper; print Dumper $gn, $genename;# exit; |
|
705
|
17
|
|
|
|
|
205
|
my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name', |
|
706
|
|
|
|
|
|
|
-value => ['gene_names' => \@stags]); |
|
707
|
17
|
|
|
|
|
60
|
$self->annotation_collection->add_Annotation('gene_name', $gn); |
|
708
|
|
|
|
|
|
|
} |
|
709
|
|
|
|
|
|
|
} |
|
710
|
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
# GenBank VERSION line |
|
712
|
|
|
|
|
|
|
# old EMBL SV line (now obsolete) |
|
713
|
|
|
|
|
|
|
# UniProt/SwissProt? |
|
714
|
|
|
|
|
|
|
sub _generic_version { |
|
715
|
28
|
|
|
28
|
|
70
|
my ($self, $data) = @_; |
|
716
|
28
|
|
|
|
|
194
|
my ($acc,$gi) = split(' ',$data->{DATA}); |
|
717
|
28
|
100
|
|
|
|
172
|
if($acc =~ m{^\w+\.(\d+)}xmso) { |
|
718
|
27
|
|
|
|
|
81
|
$self->{'_params'}->{'-version'} = $1; |
|
719
|
27
|
|
|
|
|
65
|
$self->{'_params'}->{'-seq_version'} = $1; |
|
720
|
|
|
|
|
|
|
} |
|
721
|
28
|
100
|
66
|
|
|
160
|
if($gi && (index($gi,"GI:") == 0)) { |
|
722
|
24
|
|
|
|
|
106
|
$self->{'_params'}->{'-primary_id'} = substr($gi,3); |
|
723
|
|
|
|
|
|
|
} |
|
724
|
|
|
|
|
|
|
} |
|
725
|
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
# EMBL DT lines |
|
727
|
|
|
|
|
|
|
sub _embl_date { |
|
728
|
5
|
|
|
5
|
|
11
|
my ($self, $data) = @_; |
|
729
|
5
|
|
|
|
|
59
|
while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) { |
|
730
|
10
|
|
|
|
|
36
|
my ($date, $version) = ($1, $2); |
|
731
|
10
|
|
|
|
|
21
|
$date =~ tr{,}{}d; # remove comma if new version |
|
732
|
10
|
50
|
|
|
|
49
|
if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) { |
|
|
|
50
|
|
|
|
|
|
|
733
|
0
|
|
|
|
|
0
|
my $release = Bio::Annotation::SimpleValue->new( |
|
734
|
|
|
|
|
|
|
-tagname => 'creation_release', |
|
735
|
|
|
|
|
|
|
-value => $1 |
|
736
|
|
|
|
|
|
|
); |
|
737
|
0
|
|
|
|
|
0
|
$self->annotation_collection->add_Annotation($release); |
|
738
|
|
|
|
|
|
|
} elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) { |
|
739
|
0
|
|
|
|
|
0
|
my $release = Bio::Annotation::SimpleValue->new( |
|
740
|
|
|
|
|
|
|
-tagname => 'update_release', |
|
741
|
|
|
|
|
|
|
-value => $1 |
|
742
|
|
|
|
|
|
|
); |
|
743
|
0
|
|
|
|
|
0
|
$self->annotation_collection->add_Annotation($release); |
|
744
|
0
|
|
|
|
|
0
|
my $update = Bio::Annotation::SimpleValue->new( |
|
745
|
|
|
|
|
|
|
-tagname => 'update_version', |
|
746
|
|
|
|
|
|
|
-value => $2 |
|
747
|
|
|
|
|
|
|
); |
|
748
|
0
|
|
|
|
|
0
|
$self->annotation_collection->add_Annotation($update); |
|
749
|
|
|
|
|
|
|
} |
|
750
|
10
|
|
|
|
|
13
|
push @{ $self->{'_params'}->{'-dates'} }, $date; |
|
|
10
|
|
|
|
|
79
|
|
|
751
|
|
|
|
|
|
|
} |
|
752
|
|
|
|
|
|
|
} |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
# UniProt/SwissProt DT lines |
|
755
|
|
|
|
|
|
|
sub _swiss_date { |
|
756
|
17
|
|
|
17
|
|
30
|
my ($self, $data) = @_; |
|
757
|
|
|
|
|
|
|
# swissprot |
|
758
|
17
|
|
|
|
|
75
|
my @dls = split m{\n}, $data->{DATA}; |
|
759
|
17
|
|
|
|
|
49
|
for my $dl (@dls) { |
|
760
|
51
|
|
|
|
|
133
|
my ($date, $version) = split(' ', $dl, 2); |
|
761
|
51
|
|
|
|
|
89
|
$date =~ tr{,}{}d; # remove comma if new version |
|
762
|
51
|
100
|
100
|
|
|
547
|
if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old |
|
|
|
100
|
100
|
|
|
|
|
|
763
|
|
|
|
|
|
|
$version =~ m{sequence version (\d+)\.}) { #new |
|
764
|
17
|
|
|
|
|
187
|
my $update = Bio::Annotation::SimpleValue->new( |
|
765
|
|
|
|
|
|
|
-tagname => 'seq_update', |
|
766
|
|
|
|
|
|
|
-value => $1 |
|
767
|
|
|
|
|
|
|
); |
|
768
|
17
|
|
|
|
|
70
|
$self->annotation_collection->add_Annotation($update); |
|
769
|
|
|
|
|
|
|
} elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old |
|
770
|
|
|
|
|
|
|
$version =~ m{entry version (\d+)\.}) { #new |
|
771
|
17
|
|
|
|
|
87
|
$self->{'_params'}->{'-version'} = $1; |
|
772
|
17
|
|
|
|
|
56
|
$self->{'_params'}->{'-seq_version'} = $1; |
|
773
|
|
|
|
|
|
|
} |
|
774
|
51
|
|
|
|
|
55
|
push @{ $self->{'_params'}->{'-dates'} }, $date; |
|
|
51
|
|
|
|
|
229
|
|
|
775
|
|
|
|
|
|
|
} |
|
776
|
|
|
|
|
|
|
} |
|
777
|
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
# GenBank KEYWORDS line |
|
779
|
|
|
|
|
|
|
# EMBL KW line |
|
780
|
|
|
|
|
|
|
# UniProt/SwissProt KW line |
|
781
|
|
|
|
|
|
|
sub _generic_keywords { |
|
782
|
55
|
|
|
55
|
|
126
|
my ($self, $data) = @_; |
|
783
|
55
|
|
|
|
|
286
|
$data->{DATA} =~ s{\.$}{}; |
|
784
|
55
|
|
|
|
|
518
|
my @kw = split m{\s*\;\s*}xo ,$data->{DATA}; |
|
785
|
55
|
|
|
|
|
213
|
$self->{'_params'}->{'-keywords'} = \@kw; |
|
786
|
|
|
|
|
|
|
} |
|
787
|
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# GenBank DEFINITION line |
|
789
|
|
|
|
|
|
|
# EMBL DE line |
|
790
|
|
|
|
|
|
|
# UniProt/SwissProt DE line |
|
791
|
|
|
|
|
|
|
sub _generic_description { |
|
792
|
57
|
|
|
57
|
|
121
|
my ($self, $data) = @_; |
|
793
|
57
|
|
|
|
|
233
|
$self->{'_params'}->{'-desc'} = $data->{DATA}; |
|
794
|
|
|
|
|
|
|
} |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
# GenBank ACCESSION line |
|
797
|
|
|
|
|
|
|
# EMBL AC line |
|
798
|
|
|
|
|
|
|
# UniProt/SwissProt AC line |
|
799
|
|
|
|
|
|
|
sub _generic_accession { |
|
800
|
56
|
|
|
56
|
|
104
|
my ($self, $data) = @_; |
|
801
|
56
|
|
|
|
|
337
|
my @accs = split m{[\s;]+}, $data->{DATA}; |
|
802
|
56
|
|
|
|
|
186
|
$self->{'_params'}->{'-accession_number'} = shift @accs; |
|
803
|
56
|
100
|
|
|
|
245
|
$self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs; |
|
804
|
|
|
|
|
|
|
} |
|
805
|
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
####################### SPECIES HANDLERS ####################### |
|
807
|
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
# uses Bio::Species |
|
809
|
|
|
|
|
|
|
# GenBank SOURCE, ORGANISM lines |
|
810
|
|
|
|
|
|
|
# EMBL O* lines |
|
811
|
|
|
|
|
|
|
# UniProt/SwissProt O* lines |
|
812
|
|
|
|
|
|
|
sub _generic_species { |
|
813
|
56
|
|
|
56
|
|
94
|
my ($self, $data) = @_; |
|
814
|
|
|
|
|
|
|
|
|
815
|
56
|
|
|
|
|
180
|
my $seqformat = $self->format; |
|
816
|
|
|
|
|
|
|
# if data is coming in from GenBank parser... |
|
817
|
56
|
100
|
66
|
|
|
506
|
if ($seqformat eq 'genbank' && |
|
818
|
|
|
|
|
|
|
$data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) { |
|
819
|
30
|
|
|
|
|
200
|
($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2); |
|
820
|
|
|
|
|
|
|
} |
|
821
|
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
# SwissProt stuff... |
|
823
|
|
|
|
|
|
|
# hybrid names in swissprot files are no longer valid per intergration into |
|
824
|
|
|
|
|
|
|
# UniProt. Files containing these have been split into separate entries, so |
|
825
|
|
|
|
|
|
|
# it is probably a good idea to update if one has these lingering around... |
|
826
|
|
|
|
|
|
|
|
|
827
|
56
|
|
|
|
|
93
|
my $taxid; |
|
828
|
56
|
100
|
|
|
|
143
|
if ($seqformat eq 'swiss') { |
|
829
|
17
|
50
|
|
|
|
116
|
if ($data->{DATA} =~ m{^([^,]+)}ox) { |
|
830
|
17
|
|
|
|
|
60
|
$data->{DATA} = $1; |
|
831
|
|
|
|
|
|
|
} |
|
832
|
17
|
100
|
66
|
|
|
153
|
if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) { |
|
833
|
16
|
|
|
|
|
39
|
$taxid = $1; |
|
834
|
|
|
|
|
|
|
} |
|
835
|
|
|
|
|
|
|
} |
|
836
|
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
my ($sl, $class, $sci_name) = ($data->{DATA}, |
|
838
|
|
|
|
|
|
|
$data->{CLASSIFICATION}, |
|
839
|
56
|
|
100
|
|
|
320
|
$data->{ORGANISM} || ''); |
|
840
|
56
|
|
|
|
|
85
|
my ($organelle,$abbr_name, $common); |
|
841
|
56
|
|
|
|
|
986
|
my @class = reverse split m{\s*;\s*}, $class; |
|
842
|
|
|
|
|
|
|
# have to treat swiss different from everything else... |
|
843
|
56
|
50
|
|
|
|
703
|
if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? # GenBank format |
|
844
|
|
|
|
|
|
|
\s*(.*?) |
|
845
|
|
|
|
|
|
|
\s*(?: \( (.*?) \) )?\.?$ |
|
846
|
|
|
|
|
|
|
}xmso ){ |
|
847
|
56
|
|
|
|
|
364
|
($organelle, $abbr_name, $common) = ($1, $2, $3); # optional |
|
848
|
|
|
|
|
|
|
} else { |
|
849
|
0
|
|
|
|
|
0
|
$abbr_name = $sl; # nothing caught; this is a backup! |
|
850
|
|
|
|
|
|
|
} |
|
851
|
|
|
|
|
|
|
# there is no 'abbreviated name' for EMBL |
|
852
|
56
|
100
|
|
|
|
177
|
$sci_name = $abbr_name if $seqformat ne 'genbank'; |
|
853
|
56
|
|
100
|
|
|
225
|
$organelle ||= ''; |
|
854
|
56
|
|
100
|
|
|
186
|
$common ||= ''; |
|
855
|
56
|
50
|
|
|
|
127
|
$sci_name || return; |
|
856
|
56
|
|
|
|
|
128
|
unshift @class, $sci_name; |
|
857
|
|
|
|
|
|
|
# no genus/species parsing here; moving to Bio::Taxon-based taxonomy |
|
858
|
56
|
|
|
|
|
559
|
my $make = Bio::Species->new(); |
|
859
|
56
|
|
|
|
|
217
|
$make->scientific_name($sci_name); |
|
860
|
56
|
50
|
|
|
|
454
|
$make->classification(@class) if @class > 0; |
|
861
|
56
|
100
|
|
|
|
260
|
$common && $make->common_name( $common ); |
|
862
|
56
|
50
|
|
|
|
297
|
$abbr_name && $make->name('abbreviated', $abbr_name); |
|
863
|
56
|
100
|
|
|
|
135
|
$organelle && $make->organelle($organelle); |
|
864
|
56
|
100
|
|
|
|
219
|
$taxid && $make->ncbi_taxid($taxid); |
|
865
|
56
|
|
|
|
|
643
|
$self->{'_params'}->{'-species'} = $make; |
|
866
|
|
|
|
|
|
|
} |
|
867
|
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
####################### ANNOTATION HANDLERS ####################### |
|
869
|
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
# GenBank DBSOURCE line |
|
871
|
|
|
|
|
|
|
sub _genbank_dbsource { |
|
872
|
1
|
|
|
1
|
|
2
|
my ($self, $data) = @_; |
|
873
|
1
|
|
|
|
|
2
|
my $dbsource = $data->{DATA}; |
|
874
|
1
|
|
|
|
|
3
|
my $annotation = $self->annotation_collection; |
|
875
|
|
|
|
|
|
|
# deal with swissprot dbsources |
|
876
|
|
|
|
|
|
|
# we could possibly parcel these out to subhandlers... |
|
877
|
1
|
50
|
|
|
|
3
|
if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) { |
|
878
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
|
879
|
|
|
|
|
|
|
('dblink', |
|
880
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new |
|
881
|
|
|
|
|
|
|
(-primary_id => $2, |
|
882
|
|
|
|
|
|
|
-database => $1, |
|
883
|
|
|
|
|
|
|
-tagname => 'dblink')); |
|
884
|
0
|
0
|
|
|
|
0
|
if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) { |
|
885
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
|
886
|
|
|
|
|
|
|
('swissprot_dates', |
|
887
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new |
|
888
|
|
|
|
|
|
|
(-tagname => 'date_created', |
|
889
|
|
|
|
|
|
|
-value => $1)); |
|
890
|
|
|
|
|
|
|
} |
|
891
|
0
|
|
|
|
|
0
|
while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) { |
|
892
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
|
893
|
|
|
|
|
|
|
('swissprot_dates', |
|
894
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new |
|
895
|
|
|
|
|
|
|
(-tagname => 'date_updated', |
|
896
|
|
|
|
|
|
|
-value => $1)); |
|
897
|
|
|
|
|
|
|
} |
|
898
|
0
|
|
|
|
|
0
|
$dbsource =~ s/\n/ /g; |
|
899
|
0
|
0
|
|
|
|
0
|
if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) { |
|
|
|
0
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
# will use $i to determine even or odd |
|
901
|
|
|
|
|
|
|
# for swissprot the accessions are paired |
|
902
|
0
|
|
|
|
|
0
|
my $i = 0; |
|
903
|
0
|
|
|
|
|
0
|
for my $dbsrc ( split(/,\s+/,$1) ) { |
|
904
|
0
|
0
|
0
|
|
|
0
|
if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) { |
|
905
|
0
|
|
|
|
|
0
|
my ($id,$version) = ($1,$2); |
|
906
|
0
|
0
|
|
|
|
0
|
$version ='' unless defined $version; |
|
907
|
0
|
|
|
|
|
0
|
my $db; |
|
908
|
0
|
0
|
|
|
|
0
|
if( $id =~ /^\d\S{3}/) { |
|
909
|
0
|
|
|
|
|
0
|
$db = 'PDB'; |
|
910
|
|
|
|
|
|
|
} else { |
|
911
|
0
|
0
|
|
|
|
0
|
$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank'; |
|
912
|
|
|
|
|
|
|
} |
|
913
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
|
914
|
|
|
|
|
|
|
('dblink', |
|
915
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new |
|
916
|
|
|
|
|
|
|
(-primary_id => $id, |
|
917
|
|
|
|
|
|
|
-version => $version, |
|
918
|
|
|
|
|
|
|
-database => $db, |
|
919
|
|
|
|
|
|
|
-tagname => 'dblink')); |
|
920
|
|
|
|
|
|
|
} |
|
921
|
|
|
|
|
|
|
} |
|
922
|
|
|
|
|
|
|
} elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) { |
|
923
|
|
|
|
|
|
|
# download screwed up and ncbi didn't put acc in for gi numbers |
|
924
|
0
|
|
|
|
|
0
|
my $i = 0; |
|
925
|
0
|
|
|
|
|
0
|
for my $id ( split(/\,\s+/,$1) ) { |
|
926
|
0
|
|
|
|
|
0
|
my ($acc,$db); |
|
927
|
0
|
0
|
|
|
|
0
|
if( $id =~ /gi:\s+(\d+)/ ) { |
|
|
|
0
|
|
|
|
|
|
|
928
|
0
|
|
|
|
|
0
|
$acc= $1; |
|
929
|
0
|
0
|
|
|
|
0
|
$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank'; |
|
930
|
|
|
|
|
|
|
} elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) { |
|
931
|
0
|
|
|
|
|
0
|
$acc= $1; |
|
932
|
0
|
|
|
|
|
0
|
$db = 'PDB'; |
|
933
|
|
|
|
|
|
|
} else { |
|
934
|
0
|
|
|
|
|
0
|
$acc= $id; |
|
935
|
0
|
|
|
|
|
0
|
$db = ''; |
|
936
|
|
|
|
|
|
|
} |
|
937
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
|
938
|
|
|
|
|
|
|
('dblink', |
|
939
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new |
|
940
|
|
|
|
|
|
|
(-primary_id => $acc, |
|
941
|
|
|
|
|
|
|
-database => $db, |
|
942
|
|
|
|
|
|
|
-tagname => 'dblink')); |
|
943
|
|
|
|
|
|
|
} |
|
944
|
|
|
|
|
|
|
} else { |
|
945
|
0
|
|
|
|
|
0
|
$self->warn("Cannot match $dbsource\n"); |
|
946
|
|
|
|
|
|
|
} |
|
947
|
0
|
0
|
|
|
|
0
|
if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+ |
|
948
|
|
|
|
|
|
|
((?:\S+,\s+)+\S+)//x ) { |
|
949
|
0
|
|
|
|
|
0
|
for my $id ( split(/\,\s+/,$1) ) { |
|
950
|
0
|
|
|
|
|
0
|
my $db; |
|
951
|
|
|
|
|
|
|
# this is because GenBank dropped the spaces!!! |
|
952
|
|
|
|
|
|
|
# I'm sure we're not going to get this right |
|
953
|
|
|
|
|
|
|
##if( $id =~ s/^://i ) { |
|
954
|
|
|
|
|
|
|
## $db = $1; |
|
955
|
|
|
|
|
|
|
##} |
|
956
|
0
|
|
|
|
|
0
|
$db = substr($id,0,index($id,':')); |
|
957
|
0
|
0
|
|
|
|
0
|
if (! exists $DBSOURCE{ $db }) { |
|
958
|
0
|
|
|
|
|
0
|
$db = ''; # do we want 'GenBank' here? |
|
959
|
|
|
|
|
|
|
} |
|
960
|
0
|
|
|
|
|
0
|
$id = substr($id,index($id,':')+1); |
|
961
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
|
962
|
|
|
|
|
|
|
('dblink',Bio::Annotation::DBLink->new |
|
963
|
|
|
|
|
|
|
(-primary_id => $id, |
|
964
|
|
|
|
|
|
|
-database => $db, |
|
965
|
|
|
|
|
|
|
-tagname => 'dblink')); |
|
966
|
|
|
|
|
|
|
} |
|
967
|
|
|
|
|
|
|
} |
|
968
|
|
|
|
|
|
|
} else { |
|
969
|
1
|
50
|
|
|
|
5
|
if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) { |
|
|
|
0
|
|
|
|
|
|
|
970
|
1
|
|
|
|
|
3
|
my ($db,$id,$version) = ($1,$2,$3); |
|
971
|
1
|
|
50
|
|
|
12
|
$annotation->add_Annotation |
|
972
|
|
|
|
|
|
|
('dblink', |
|
973
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new |
|
974
|
|
|
|
|
|
|
(-primary_id => $id, |
|
975
|
|
|
|
|
|
|
-version => $version, |
|
976
|
|
|
|
|
|
|
-database => $db || 'GenBank', |
|
977
|
|
|
|
|
|
|
-tagname => 'dblink')); |
|
978
|
|
|
|
|
|
|
} elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) { |
|
979
|
0
|
|
|
|
|
0
|
my ($id, $db, $version); |
|
980
|
0
|
0
|
|
|
|
0
|
if ($2 eq ':') { |
|
981
|
0
|
|
|
|
|
0
|
($db, $id) = ($1, $3); |
|
982
|
|
|
|
|
|
|
} else { |
|
983
|
0
|
|
|
|
|
0
|
($db, $id, $version) = ('GenBank', $1, $3); |
|
984
|
|
|
|
|
|
|
} |
|
985
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation('dblink', |
|
986
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
987
|
|
|
|
|
|
|
-primary_id => $id, |
|
988
|
|
|
|
|
|
|
-version => $version, |
|
989
|
|
|
|
|
|
|
-database => $db, |
|
990
|
|
|
|
|
|
|
-tagname => 'dblink') |
|
991
|
|
|
|
|
|
|
); |
|
992
|
|
|
|
|
|
|
} else { |
|
993
|
0
|
|
|
|
|
0
|
$self->warn("Unrecognized DBSOURCE data: $dbsource\n"); |
|
994
|
|
|
|
|
|
|
} |
|
995
|
|
|
|
|
|
|
} |
|
996
|
|
|
|
|
|
|
} |
|
997
|
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
# EMBL DR lines |
|
999
|
|
|
|
|
|
|
# UniProt/SwissProt DR lines |
|
1000
|
|
|
|
|
|
|
sub _generic_dbsource { |
|
1001
|
20
|
|
|
20
|
|
61
|
my ($self, $data) = @_; |
|
1002
|
|
|
|
|
|
|
#$self->debug(Dumper($data)); |
|
1003
|
20
|
|
|
|
|
131
|
while ($data->{DATA} =~ m{([^\n]+)}og) { |
|
1004
|
351
|
|
|
|
|
710
|
my $dblink = $1; |
|
1005
|
351
|
|
|
|
|
919
|
$dblink =~ s{\.$}{}; |
|
1006
|
351
|
|
|
|
|
338
|
my $link; |
|
1007
|
351
|
|
|
|
|
744
|
my @linkdata = split '; ',$dblink; |
|
1008
|
351
|
50
|
|
|
|
1319
|
if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) { |
|
1009
|
|
|
|
|
|
|
#if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) { |
|
1010
|
351
|
|
|
|
|
695
|
my ($databse, $prim_id, $sec_id) = ($1,$2,$3); |
|
1011
|
351
|
|
|
|
|
1247
|
$link = Bio::Annotation::DBLink->new(-database => $databse, |
|
1012
|
|
|
|
|
|
|
-primary_id => $prim_id, |
|
1013
|
|
|
|
|
|
|
-optional_id => $sec_id); |
|
1014
|
|
|
|
|
|
|
} else { |
|
1015
|
0
|
|
|
|
|
0
|
$self->warn("No match for $dblink"); |
|
1016
|
|
|
|
|
|
|
} |
|
1017
|
351
|
|
|
|
|
635
|
$self->annotation_collection->add_Annotation('dblink', $link); |
|
1018
|
|
|
|
|
|
|
} |
|
1019
|
|
|
|
|
|
|
} |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
# GenBank REFERENCE and related lines |
|
1023
|
|
|
|
|
|
|
# EMBL R* lines |
|
1024
|
|
|
|
|
|
|
# UniProt/SwissProt R* lines |
|
1025
|
|
|
|
|
|
|
sub _generic_reference { |
|
1026
|
301
|
|
|
301
|
|
281
|
my ($self, $data) = @_; |
|
1027
|
301
|
|
|
|
|
601
|
my $seqformat = $self->format; |
|
1028
|
301
|
|
|
|
|
293
|
my ($start, $end); |
|
1029
|
|
|
|
|
|
|
# get these in EMBL/Swiss |
|
1030
|
301
|
100
|
|
|
|
526
|
if ($data->{CROSSREF}) { |
|
1031
|
128
|
|
|
|
|
805
|
while ($data->{CROSSREF} =~ m{(pubmed|doi|medline)(?:=|;\s+)(\S+)}oig) { |
|
1032
|
199
|
|
|
|
|
473
|
my ($db, $ref) = (uc $1, $2); |
|
1033
|
199
|
|
|
|
|
489
|
$ref =~ s{[;.]+$}{}; |
|
1034
|
199
|
|
|
|
|
666
|
$data->{$db} = $ref; |
|
1035
|
|
|
|
|
|
|
} |
|
1036
|
|
|
|
|
|
|
} |
|
1037
|
|
|
|
|
|
|
# run some cleanup for swissprot |
|
1038
|
301
|
100
|
|
|
|
716
|
if ($seqformat eq 'swiss') { |
|
1039
|
109
|
|
|
|
|
140
|
for my $val (values %{ $data }) { |
|
|
109
|
|
|
|
|
300
|
|
|
1040
|
959
|
|
|
|
|
1055
|
$val =~ s{;$}{}; |
|
1041
|
959
|
|
|
|
|
980
|
$val =~ s{(\w-)\s}{$1}; |
|
1042
|
|
|
|
|
|
|
} |
|
1043
|
|
|
|
|
|
|
} |
|
1044
|
301
|
100
|
|
|
|
525
|
if ( $data->{POSITION} ) { |
|
1045
|
127
|
100
|
|
|
|
507
|
if ($seqformat eq 'embl') { |
|
|
|
100
|
|
|
|
|
|
|
1046
|
18
|
|
|
|
|
207
|
($start, $end) = split '-', $data->{POSITION},2; |
|
1047
|
|
|
|
|
|
|
} elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss |
|
1048
|
23
|
|
|
|
|
56
|
($start, $end) = ($1, $2); |
|
1049
|
|
|
|
|
|
|
} |
|
1050
|
|
|
|
|
|
|
} |
|
1051
|
301
|
100
|
|
|
|
883
|
if ($data->{DATA} =~ m{^\d+\s+\([a-z]+\s+(\d+)\s+to\s+(\d+)\)}xmso) { |
|
1052
|
59
|
|
|
|
|
158
|
($start, $end) = ($1, $2); |
|
1053
|
|
|
|
|
|
|
} |
|
1054
|
|
|
|
|
|
|
my $ref = Bio::Annotation::Reference->new( |
|
1055
|
|
|
|
|
|
|
-comment => $data->{REMARK}, |
|
1056
|
|
|
|
|
|
|
-location => $data->{JOURNAL}, |
|
1057
|
|
|
|
|
|
|
-pubmed => $data->{PUBMED}, |
|
1058
|
|
|
|
|
|
|
-consortium => $data->{CONSRTM}, |
|
1059
|
|
|
|
|
|
|
-title => $data->{TITLE}, |
|
1060
|
|
|
|
|
|
|
-authors => $data->{AUTHORS}, |
|
1061
|
|
|
|
|
|
|
-medline => $data->{MEDLINE}, |
|
1062
|
|
|
|
|
|
|
-doi => $data->{DOI}, |
|
1063
|
|
|
|
|
|
|
-rp => $data->{POSITION}, # JIC... |
|
1064
|
301
|
|
|
|
|
3419
|
-start => $start, |
|
1065
|
|
|
|
|
|
|
-end => $end, |
|
1066
|
|
|
|
|
|
|
); |
|
1067
|
301
|
100
|
|
|
|
1418
|
if ($data->{DATA} =~ m{^\d+\s+\((.*)\)}xmso) { |
|
1068
|
59
|
|
|
|
|
177
|
$ref->gb_reference($1); |
|
1069
|
|
|
|
|
|
|
} |
|
1070
|
301
|
|
|
|
|
690
|
$self->annotation_collection->add_Annotation('reference', $ref); |
|
1071
|
|
|
|
|
|
|
} |
|
1072
|
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
# GenBank COMMENT lines |
|
1074
|
|
|
|
|
|
|
# EMBL CC lines |
|
1075
|
|
|
|
|
|
|
# UniProt/SwissProt CC lines |
|
1076
|
|
|
|
|
|
|
sub _generic_comment { |
|
1077
|
43
|
|
|
43
|
|
76
|
my ($self, $data) = @_; |
|
1078
|
|
|
|
|
|
|
$self->annotation_collection->add_Annotation('comment', |
|
1079
|
43
|
|
|
|
|
122
|
Bio::Annotation::Comment->new( -text => $data->{DATA} )); |
|
1080
|
|
|
|
|
|
|
} |
|
1081
|
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
####################### SEQFEATURE HANDLER ####################### |
|
1083
|
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
# GenBank Feature Table |
|
1085
|
|
|
|
|
|
|
sub _generic_seqfeatures { |
|
1086
|
848
|
|
|
848
|
|
889
|
my ($self, $data) = @_; |
|
1087
|
848
|
100
|
|
|
|
1771
|
return if $data->{FEATURE_KEY} eq 'FEATURES'; |
|
1088
|
818
|
|
|
|
|
1021
|
my $primary_tag = $data->{FEATURE_KEY}; |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
# grab the NCBI taxon ID from the source SF |
|
1091
|
818
|
100
|
66
|
|
|
1458
|
if ($primary_tag eq 'source' && exists $data->{'db_xref'}) { |
|
1092
|
36
|
100
|
66
|
|
|
332
|
if ( $self->{'_params'}->{'-species'} && |
|
1093
|
|
|
|
|
|
|
$data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) { |
|
1094
|
35
|
|
|
|
|
164
|
$self->{'_params'}->{'-species'}->ncbi_taxid($1); |
|
1095
|
|
|
|
|
|
|
} |
|
1096
|
|
|
|
|
|
|
} |
|
1097
|
818
|
|
|
|
|
1526
|
my $source = $self->format; |
|
1098
|
|
|
|
|
|
|
|
|
1099
|
818
|
|
|
|
|
851
|
my $seqid = ${ $self->get_params('accession_number') }{'accession_number'}; |
|
|
818
|
|
|
|
|
1519
|
|
|
1100
|
|
|
|
|
|
|
|
|
1101
|
818
|
|
|
|
|
1097
|
my $loc; |
|
1102
|
818
|
|
|
|
|
1048
|
eval { |
|
1103
|
818
|
|
|
|
|
3152
|
$loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'}); |
|
1104
|
|
|
|
|
|
|
}; |
|
1105
|
818
|
50
|
|
|
|
1810
|
if(! $loc) { |
|
1106
|
|
|
|
|
|
|
$self->warn("exception while parsing location line [" . |
|
1107
|
|
|
|
|
|
|
$data->{'LOCATION'} . |
|
1108
|
|
|
|
|
|
|
"] in reading $source, ignoring feature " . |
|
1109
|
0
|
|
|
|
|
0
|
$data->{'primary_tag'}. |
|
1110
|
|
|
|
|
|
|
" (seqid=" . $seqid . "): " . $@); |
|
1111
|
0
|
|
|
|
|
0
|
return; |
|
1112
|
|
|
|
|
|
|
} |
|
1113
|
818
|
50
|
33
|
|
|
1652
|
if($seqid && (! $loc->is_remote())) { |
|
1114
|
0
|
|
|
|
|
0
|
$loc->seq_id($seqid); # propagates if it is a split location |
|
1115
|
|
|
|
|
|
|
} |
|
1116
|
818
|
|
|
|
|
2697
|
my $sf = Bio::SeqFeature::Generic->direct_new(); |
|
1117
|
818
|
|
|
|
|
1872
|
$sf->location($loc); |
|
1118
|
818
|
|
|
|
|
2030
|
$sf->primary_tag($primary_tag); |
|
1119
|
818
|
|
|
|
|
1697
|
$sf->seq_id($seqid); |
|
1120
|
818
|
|
|
|
|
1598
|
$sf->source_tag($source); |
|
1121
|
818
|
|
|
|
|
1207
|
delete $data->{'FEATURE_KEY'}; |
|
1122
|
818
|
|
|
|
|
905
|
delete $data->{'LOCATION'}; |
|
1123
|
818
|
|
|
|
|
973
|
delete $data->{'NAME'}; |
|
1124
|
818
|
|
|
|
|
850
|
delete $data->{'DATA'}; |
|
1125
|
818
|
|
|
|
|
2490
|
$sf->set_attributes(-tag => $data); |
|
1126
|
818
|
|
|
|
|
1040
|
push @{ $self->{'_params'}->{'-features'} }, $sf; |
|
|
818
|
|
|
|
|
3440
|
|
|
1127
|
|
|
|
|
|
|
} |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
####################### ODDS AND ENDS ####################### |
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
# Those things that don't fit anywhere else. If a specific name |
|
1132
|
|
|
|
|
|
|
# maps to the below table, that class and method are used, otherwise |
|
1133
|
|
|
|
|
|
|
# it goes into a SimpleValue (I think this is a good argument for why |
|
1134
|
|
|
|
|
|
|
# we need a generic mechanism for storing annotation) |
|
1135
|
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
sub _generic_simplevalue { |
|
1137
|
16
|
|
|
16
|
|
29
|
my ($self, $data) = @_; |
|
1138
|
|
|
|
|
|
|
$self->annotation_collection->add_Annotation( |
|
1139
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}), |
|
1140
|
|
|
|
|
|
|
-value => $data->{DATA}) |
|
1141
|
16
|
|
|
|
|
61
|
); |
|
1142
|
|
|
|
|
|
|
} |
|
1143
|
|
|
|
|
|
|
|
|
1144
|
|
|
|
15
|
0
|
|
sub noop {} |
|
1145
|
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
sub _debug { |
|
1147
|
0
|
|
|
0
|
|
|
my ($self, $data) = @_; |
|
1148
|
0
|
|
|
|
|
|
$self->debug(Dumper($data)); |
|
1149
|
|
|
|
|
|
|
} |
|
1150
|
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
1; |