File Coverage

blib/lib/Bio/GenBankParser.pm

Criterion	Covered	Total	%
statement	37	62	59.6
branch	8	26	30.7
condition	1	15	6.6
subroutine	11	14	78.5
pod	7	7	100.0
total	64	124	51.6

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::GenBankParser;
2
3							# $Id: GenBankParser.pm 27 2010-06-23 16:45:50Z kyclark $
4
5	1			1		1062	use warnings;
	1					3
	1					36
6	1			1		6	use strict;
	1					3
	1					40
7	1			1		16	use Carp qw( croak );
	1					2
	1					69
8	1			1		6	use File::Spec::Functions;
	1					2
	1					103
9	1			1		1887	use Parse::RecDescent;
	1					87487
	1					11
10	1			1		44	use Readonly;
	1					3
	1					863
11
12							Readonly my $GENBANK_RECORD_SEPARATOR => "//\n";
13
14							=pod
15
16							=head1 NAME
17
18							Bio::GenBankParser - Parse::RecDescent parser for a GenBank record
19
20							=head1 VERSION
21
22							Version 0.05
23
24							=cut
25
26							our $VERSION = '0.05';
27
28							=head1 SYNOPSIS
29
30							This module aims to improve on the BioPerl GenBank parser by using
31							a grammar-based approach with Parse::RecDescent.
32
33							use Bio::GenBankParser;
34
35							my $parser = Bio::GenBankParser->new();
36
37							local $/ = "//\n";
38							while ( my $rec = <$input> ) {
39							my $gb_rec = $parser->parse( $rec );
40							}
41
42							Or:
43
44							my $parser = Bio::GenBankParser->new( file => $file );
45							while ( my $seq = $parser->next_seq ) {
46							...
47							}
48
49							=head1 METHODS
50
51							=cut
52
53							# ----------------------------------------------------------------
54							sub new {
55
56							=pod
57
58							=head2 new
59
60							use Bio::GenBankParser;
61							my $parser = Bio::GenBankParser->new;
62
63							=cut
64
65	1			1	1	392	my $class = shift;
66	1	50	33			9	my %args = ( @_ && ref $_[0] eq 'HASH' ) ? %{ $_[0] } : @_;
	0					0
67	1					3	my $self = bless \%args, $class;
68
69	1	50				4	if ( $args{'file'} ) {
70	0					0	$self->file( $args{'file'} );
71							}
72
73	1					4	return $self;
74							}
75
76							# ----------------------------------------------------------------
77							sub DESTROY {
78	1			1		72822	my $self = shift;
79
80	1	50				16	if ( my $fh = $self->{'fh'} ) {
81	0					0	close $fh;
82							}
83							}
84
85							# ----------------------------------------------------------------
86							sub file {
87
88							=pod
89
90							=head2 file
91
92							$parser->file('/path/to/file');
93
94							Informs the parser to read sequentially from a file.
95
96							=cut
97
98	0			0	1	0	my $self = shift;
99
100	0	0				0	if ( my $file = shift ) {
101	0					0	$file = canonpath( $file );
102
103	0	0	0			0	if ( -e $file && -s _ && -r _ ) {
			0
104	0	0				0	open my $fh, '<', $file or croak("Can't read file '$file'; $!\n");
105
106	0					0	$self->{'file'} = $file;
107	0					0	$self->{'fh'} = $fh;
108							}
109							else {
110	0					0	croak("Non-existent, empty or unreadable file: '$file'");
111							}
112							}
113
114	0					0	return 1;
115							}
116
117							# ----------------------------------------------------------------
118							sub current_record {
119
120							=pod
121
122							=head2 current_record
123
124							my $genbank_record = $parser->current_record;
125
126							Returns the current unparsed GenBank record.
127
128							=cut
129
130	0			0	1	0	my $self = shift;
131
132	0					0	return $self->{'current_record'};
133							}
134
135							# ----------------------------------------------------------------
136							sub next_seq {
137
138							=pod
139
140							=head2 next_seq
141
142							my $seq = $parser->next_seq;
143
144							Returns the next sequence from the C.
145
146							=cut
147
148	0			0	1	0	my $self = shift;
149
150	0	0				0	if ( my $fh = $self->{'fh'} ) {
151	0					0	local $/ = $GENBANK_RECORD_SEPARATOR;
152
153	0					0	my $rec;
154	0					0	for (;;) {
155	0					0	$rec = <$fh>;
156	0	0	0			0	last if !defined $rec \|\| $rec =~ /\S+/;
157							}
158
159	0	0	0			0	if ( defined $rec && $rec =~ /\S+/ ) {
160	0					0	return $self->parse( $rec );
161							}
162							else {
163	0					0	return undef;
164							}
165							}
166							else {
167	0					0	croak("Can't call 'next_seq' without a 'file' argument");
168							}
169							}
170
171							# ----------------------------------------------------------------
172							sub parse {
173
174							=pod
175
176							=head2 parse
177
178							my $rec = $parser->parse( $text );
179							print $rec->{'ACCESSION'};
180
181							Parses a (single) GenBank record into a hash reference.
182
183							=cut
184
185	2			2	1	78128	my $self = shift;
186	2	50				12	my $text = shift() or croak('No input to parse');
187	2	50				10	my $parser = $self->parser or croak('No parser');
188
189	2					7	$self->{'current_record'} = $text;
190
191	2					25	return $parser->startrule( $text );
192							}
193
194							# ----------------------------------------------------------------
195							sub parser {
196
197							=pod
198
199							=head2 parser
200
201							Returns the Parse::RecDescent object.
202
203							=cut
204
205	2			2	1	4	my $self = shift;
206
207	2	100				11	if ( !defined $self->{'parser'} ) {
208	1	50				5	my $grammar = $self->grammar or croak('No grammar');
209	1					7	$self->{'parser'} = Parse::RecDescent->new( $grammar );
210							}
211
212	2					342084	return $self->{'parser'};
213							}
214
215							# ----------------------------------------------------------------
216							sub grammar {
217
218							=pod
219
220							=head2 grammar
221
222							Returns the Parse::RecDescent grammar for a GenBank record.
223
224							=cut
225
226	1			1	1	1	my $self = shift;
227	1					5	return <<'END_OF_GRAMMAR';
228							{
229							my $ref_num = 1;
230							my %record = ();
231							my %ATTRIBUTE_PROMOTE = map { $_, 1 } qw[
232							mol_type
233							cultivar
234							variety
235							strain
236							];
237
238							$::RD_ERRORS; # report fatal errors
239							# $::RD_TRACE = 0;
240							# $::RD_WARN = 0; # Enable warnings. This will warn on unused rules &c.
241							# $::RD_HINT = 0; # Give out hints to help fix problems.
242							}
243
244							startrule: section(s) eofile
245							{
246							if ( !$record{'ACCESSION'} ) {
247							$record{'ACCESSION'} = $record{'LOCUS'}->{'genbank_accession'};
248							}
249
250							if ( ref $record{'SEQUENCE'} eq 'ARRAY' ) {
251							$record{'SEQUENCE'} = join('', @{ $record{'SEQUENCE'} });
252							}
253
254							$return = { %record };
255							%record = ();
256							}
257							\|
258
259							section: commented_line
260							\| header
261							\| locus
262							\| dbsource
263							\| definition
264							\| accession_line
265							\| project_line
266							\| version_line
267							\| keywords
268							\| source_line
269							\| organism
270							\| reference
271							\| features
272							\| base_count
273							\| contig
274							\| origin
275							\| comment
276							\| record_delimiter
277							\|
278
279							header: /.+(?=\nLOCUS)/xms
280
281							locus: /LOCUS/xms locus_name sequence_length molecule_type
282							genbank_division(?) modification_date
283							{
284							$record{'LOCUS'} = {
285							locus_name => $item{'locus_name'},
286							sequence_length => $item{'sequence_length'},
287							molecule_type => $item{'molecule_type'},
288							genbank_division => $item{'genbank_division(?)'}[0],
289							modification_date => $item{'modification_date'},
290							}
291							}
292
293							locus_name: /\w+/
294
295							space: /\s+/
296
297							sequence_length: /\d+/ /(aa\|bp)/ { $return = "$item[1] $item[2]" }
298
299							molecule_type: /\w+/ (/[a-zA-Z]{4,}/)(?)
300							{
301							$return = join(' ', map { $_ \|\| () } $item[1], $item[2][0] )
302							}
303
304							genbank_division:
305							/(PRI\|CON\|ROD\|MAM\|VRT\|INV\|PLN\|BCT\|VRL\|PHG\|SYN\|UNA\|EST\|PAT\|STS\|GSS\|HTG\|HTC\|ENV)/
306
307							modification_date: /\d+-[A-Z]{3}-\d{4}/
308
309							definition: /DEFINITION/ section_continuing_indented
310							{
311							( $record{'DEFINITION'} = $item[2] ) =~ s/\n\s+/ /g;
312							}
313
314							section_continuing_indented: /.*?(?=\n[A-Z]+\s+)/xms
315
316							section_continuing_indented: /.*?(?=\n\/\/)/xms
317
318							accession_line: /ACCESSION/ section_continuing_indented
319							{
320							my @accs = split /\s+/, $item[2];
321							$record{'ACCESSION'} = shift @accs;
322							push @{ $record{'VERSION'} }, @accs;
323							}
324
325							version_line: /VERSION/ /(.+)(?=\n)/
326							{
327							push @{ $record{'VERSION'} }, split /\s+/, $item[2];
328							}
329
330							project_line: /PROJECT/ section_continuing_indented
331							{
332							$record{'PROJECT'} = $item[2];
333							}
334
335							keywords: /KEYWORDS/ keyword_value
336							{
337							$record{'KEYWORDS'} = $item[2];
338							}
339
340							keyword_value: section_continuing_indented
341							{
342							( my $str = $item[1] ) =~ s/\.$//;
343							$return = [ split(/,\s*/, $str ) ];
344							}
345							\| PERIOD { $return = [] }
346
347							dbsource: /DBSOURCE/ /\w+/ /[^\n]+/xms
348							{
349							push @{ $record{'DBSOURCE'} }, {
350							$item[2], $item[3]
351							};
352							}
353
354							source_line: /SOURCE/ source_value
355							{
356							( my $src = $item[2] ) =~ s/\.$//;
357							$src =~ s/\bsp$/sp./;
358							$record{'SOURCE'} = $src;
359							}
360
361							source_value: /(.+?)(?=\n\s{0,2}[A-Z]+)/xms { $return = $1 }
362
363							organism: organism_line classification_line
364							{
365							$record{'ORGANISM'} = $item[1];
366							$record{'CLASSIFICATION'} = $item[2];
367							}
368
369							organism_line: /ORGANISM/ organism_value { $return = $item[2] }
370
371							organism_value: /([^\n]+)(?=\n)/xms { $return = $1 }
372
373							classification_line: /([^.]+)[.]/xms { $return = [ split(/;\s*/, $1) ] }
374
375							word: /\w+/
376
377							reference: /REFERENCE/ NUMBER(?) parenthetical_phrase(?) authors(?) consrtm(?) title journal remark(?) pubmed(?) remark(?)
378							{
379							my $num = $item[2][0] \|\| $ref_num++;
380							my $remark = join(' ', map { $_ \|\| () } $item[8][0], $item[10][0]);
381							$remark = undef if $remark !~ /\S+/;
382
383							push @{ $record{'REFERENCES'} }, {
384							number => $num,
385							authors => $item{'authors(?)'}[0],
386							title => $item{'title'},
387							journal => $item{'journal'},
388							pubmed => $item[9][0],
389							note => $item[3][0],
390							remark => $remark,
391							consrtm => $item[5][0],
392							};
393
394							}
395
396							parenthetical_phrase: /$ ([^)]+) $/xms
397							{
398							$return = $1;
399							}
400
401							authors: /AUTHORS/ author_value { $return = $item[2] }
402
403							author_value: /(.+?)(?=\n\s{0,2}[A-Z]+)/xms
404							{
405							$return = [
406							grep { !/and/ }
407							map { s/,$//; $_ }
408							split /\s+/, $1
409							];
410							}
411
412							title: /TITLE/ /.*?(?=\n\s{0,2}[A-Z]+)/xms
413							{ ( $return = $item[2] ) =~ s/\n\s+/ /; }
414
415							journal: /JOURNAL/ journal_value
416							{
417							$return = $item[2]
418							}
419
420							journal_value: /(.+)(?=\n\s{3}PUBMED)/xms
421							{
422							$return = $1;
423							$return =~ s/\n\s+/ /g;
424							}
425							\| /(.+?)(?=\n\s{0,2}[A-Z]+)/xms
426							{
427							$return = $1;
428							$return =~ s/\n\s+/ /g;
429							}
430
431							pubmed: /PUBMED/ NUMBER
432							{ $return = $item[2] }
433
434							remark: /REMARK/ section_continuing_indented
435							{ $return = $item[2] }
436
437							consrtm: /CONSRTM/ /[^\n]+/xms { $return = $item[2] }
438
439							features: /FEATURES/ section_continuing_indented
440							{
441							my ( $location, $cur_feature_name, %cur_features, $cur_key );
442							for my $fline ( split(/\n/, $item[2]) ) {
443							next if $fline =~ m{^\s*Location/Qualifiers};
444							next if $fline !~ /\S+/;
445
446							if ( $fline =~ /^\s{21}\/ (\w+?) = (.+)$/xms ) {
447							my ( $key, $value ) = ( $1, $2 );
448							$value =~ s/^"\|"$//g;
449							$cur_key = $key;
450							$cur_features{ $key } = $value;
451
452							if ( $key eq 'db_xref' && $value =~ /^taxon:(\d+)$/ ) {
453							$record{'NCBI_TAXON_ID'} = $1;
454							}
455
456							if ( $ATTRIBUTE_PROMOTE{ $key } ) {
457							$record{ uc $key } = $value;
458							}
459							}
460							elsif ( $fline =~ /^\s{5}(\S+) \s+ (.+)$/xms ) {
461							my ( $this_feature_name, $this_location ) = ( $1, $2 );
462							$cur_key = '';
463
464							if ( $cur_feature_name ) {
465							push @{ $record{'FEATURES'} }, {
466							name => $cur_feature_name,
467							location => $location,
468							feature => { %cur_features },
469							};
470
471							%cur_features = ();
472							}
473
474							( $cur_feature_name, $location ) =
475							( $this_feature_name, $this_location );
476							}
477							elsif ( $fline =~ /^\s{21}([^"]+)["]?$/ ) {
478							if ( $cur_key ) {
479							$cur_features{ $cur_key } .=
480							$cur_key eq 'translation'
481							? $1
482							: ' ' . $1;
483							}
484							}
485							}
486
487							push @{ $record{'FEATURES'} }, {
488							name => $cur_feature_name,
489							location => $location,
490							feature => { %cur_features },
491							};
492							}
493
494							base_count: /BASE COUNT/ base_summary(s)
495							{
496							for my $sum ( @{ $item[2] } ) {
497							$record{'BASE_COUNT'}{ $sum->[0] } = $sum->[1];
498							}
499							}
500
501							base_summary: /\d+/ /[a-zA-Z]+/
502							{
503							$return = [ $item[2], $item[1] ];
504							}
505
506							origin: /ORIGIN/ origin_value
507							{
508							$record{'ORIGIN'} = $item[2]
509							}
510
511							origin_value: /(.*?)(?=\n\/\/)/xms
512							{
513							my $seq = $1;
514							$record{'SEQUENCE'} = [];
515							while ( $seq =~ /([actg]+)/gc ) {
516							push @{ $record{'SEQUENCE'} }, $1;
517							}
518
519							$return = $seq;
520							}
521
522							comment: /COMMENT/ comment_value
523
524							comment_value: /(.+?)(?=\n[A-Z]+)/xms
525							{
526							$record{'COMMENT'} = $1;
527							}
528
529							contig: /CONTIG/ section_continuing_indented
530							{
531							$record{'CONTIG'} = $item[2];
532							}
533
534							commented_line: /#[^\n]+/
535
536							NUMBER: /\d+/
537
538							PERIOD: /\./
539
540							record_delimiter: /\/\/\s*/xms
541
542							eofile: /^\Z/
543
544							END_OF_GRAMMAR
545							}
546
547							# ----------------------------------------------------------------
548							=pod
549
550							=head1 AUTHOR
551
552							Ken Youens-Clark Ekclark at cpan.orgE.
553
554							=head1 BUGS
555
556							Please report any bugs or feature requests to C
557							at rt.cpan.org>, or through the web interface at
558							L.
559							I will be notified, and then you'll automatically be notified of
560							progress on your bug as I make changes.
561
562							=head1 SUPPORT
563
564							You can find documentation for this module with the perldoc command.
565
566							perldoc Bio::GenBankParser
567
568							=over 4
569
570							=item * RT: CPAN's request tracker
571
572							L
573
574							=item * AnnoCPAN: Annotated CPAN documentation
575
576							L
577
578							=item * CPAN Ratings
579
580							L
581
582							=item * Search CPAN
583
584							L
585
586							=back
587
588							=head1 ACKNOWLEDGEMENTS
589
590							Lincoln Stein, Doreen Ware and everyone at Cold Spring Harbor Lab.
591
592							=head1 COPYRIGHT & LICENSE
593
594							Copyright 2008 Ken Youens-Clark, all rights reserved.
595
596							This program is free software; you can redistribute it and/or modify it
597							under the same terms as Perl itself.
598
599							=cut
600
601							1; # End of Bio::GenBankParser