File Coverage

blib/lib/Bio/Gonzales/Feat/IO/SWISS.pm

Criterion	Covered	Total	%
statement	108	116	93.1
branch	68	90	75.5
condition	7	17	41.1
subroutine	14	14	100.0
pod	0	2	0.0
total	197	239	82.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Gonzales::Feat::IO::SWISS;
2
3							# https://github.com/biopython/biopython/blob/master/Bio/SwissProt/__init__.py
4							# http://web.expasy.org/docs/userman.html#GN_line
5							# https://metacpan.org/pod/Bio::SeqIO::swiss
6	1			1		114219	use Mouse;
	1					29983
	1					4
7
8	1			1		368	use warnings;
	1					2
	1					25
9	1			1		5	use strict;
	1					2
	1					17
10
11	1			1		19	use 5.010;
	1					3
12
13	1			1		687	use List::MoreUtils qw/zip/;
	1					14591
	1					6
14	1			1		1606	use Bio::Gonzales::Feat;
	1					4
	1					54
15	1			1		8	use Data::Dumper;
	1					2
	1					48
16	1			1		6	use Carp;
	1					2
	1					50
17	1			1		11	use Scalar::Util qw/blessed/;
	1					11
	1					48
18	1			1		8	use Bio::Gonzales::Seq::Util qw/crc64/;
	1					2
	1					37
19	1			1		23	use Bio::Gonzales::MiniFeat;
	1					3
	1					1669
20
21							extends 'Bio::Gonzales::Feat::IO::Base';
22
23							#my @lineObjects = ('IDs', 'ACs', 'DTs', 'DEs', 'GNs',
24							#'OSs', 'OGs', 'OCs', 'OXs', 'OHs',
25							#'Refs', 'CCs', 'DRs', 'PE', 'KWs', 'FTs', 'Stars', 'SQs');
26
27							our $VERSION = '0.083'; # VERSION
28							has check_crc64 => ( is => 'rw' );
29
30							sub next_feat {
31	26			26	0	84	my ($self) = @_;
32	26					76	my $fhi = $self->_fhi;
33
34	26					42	my $l;
35							my @entry;
36	26					83	while ( defined( $l = $fhi->() ) ) {
37	1060	50	33			4004	next if ( !$l \|\| $l =~ /^\s*$/ );
38	1060					1802	push @entry, $l;
39	1060	100				2684	last if $l =~ m{^//};
40							}
41	26	100				73	if ( @entry > 0 ) {
42	25					60	return Parse_entry( \@entry );
43							} else {
44	1					4	return;
45							}
46							}
47
48							sub Parse_entry {
49	25			25	0	41	my $data = shift;
50
51	25					37	my $lines;
52	25	50				59	if ( ref $data eq 'ARRAY' ) {
		0
53	25					39	$lines = $data;
54							} elsif ( ref $data eq 'SCALAR' ) {
55	0					0	$lines = [ split /\n/, $$data ];
56							} else {
57	0					0	$lines = [ split /\n/, $data ];
58							}
59
60	25					46	my $ide = shift @$lines;
61
62	25	50				175	confess "could not parse ID field: $ide"
63							unless (
64							$ide =~ m{^
65							ID \s+ #
66							(\S+) \s+ # $1 entryname
67							([^\s;]+); \s+ # $2 DataClass
68							(?:PRT;)? \s+ # Molecule Type (optional)
69							[0-9]+[ ]AA \. # Sequencelength (capture?)
70							$
71							}ox
72							);
73
74	25					104	my ( $name, $seq_div ) = ( $1, $2 );
75	25	50	33			132	my $src
		50	33
76							= ( $seq_div eq 'Reviewed' \|\| $seq_div eq 'STANDARD' ) ? 'sp'
77							: ( $seq_div eq 'Unreviewed' \|\| $seq_div eq 'PRELIMINARY' ) ? 'tr'
78							: $seq_div;
79	25					355	my $mfeat = Bio::Gonzales::MiniFeat->new(
80							type => 'polypeptide',
81							source => $src,
82							attributes => { 'Name' => [$name] }
83							);
84	25					97	my $sequence;
85	25					68	my %description = (main => []);
86	25					45	my $description_level = 'main';
87
88	25					33	my @seq;
89	25					60	for my $e (@$lines) {
90
91	1035					1808	my $key = substr $e, 0, 2;
92
93							# FIXME also parse these parts of the data entry
94							#last if ( $key eq 'FT' );
95							#last if ( $key eq 'SQ' );
96
97	1035	100				1829	last if ( $key eq '//' );
98
99	1010					1628	my $val = substr $e, 5;
100
101	1010	50	33			2827	die "too short: $e" unless ( $key && $val );
102	1010					3301	$val =~ s/[.;]\s*$//;
103
104	1010	50				6412	if ( $key eq '**' ) {
		100
		100
		100
		100
		100
		50
		100
		100
		50
		100
		100
		100
		100
		100
		100
		50
		100
		100
		100
		100
		100
		100
		100
		50
		0
105							#See Bug 2353, some files from the EBI have extra lines
106							#starting "**" (two asterisks/stars). They appear
107							#to be unofficial automated annotations. e.g.
108							#**
109							#** ################# INTERNAL SECTION ##################
110							#**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003.
111	0					0	next;
112							} elsif ( $key eq 'AC' ) {
113	25					117	$mfeat->add_attr( 'accession_number' => [ split /;\s+/, $val ] );
114							} elsif ( $key eq 'DT' ) {
115							} elsif ( $key eq 'DE' ) {
116	49	50				206	if($val =~ /^(\w+):\s*$/) {
117	0					0	$description_level = lc($1);
118	0					0	next;
119							}
120	49	100				141	if($val =~ s/^Flags:\s*//) {
121	23					40	push @{$description{main}}, [ 'flags', undef, split( /;\s*/, $val )];
	23					78
122	23					61	next;
123							}
124	26	50				140	die $val unless($val =~ /^(?:(\w+):)?\s(\w+)=\s(.*)$/);
125	26		66			104	my $cat = $1 // $description{$description_level}[-1][0];
126	26					38	push @{$description{$description_level}}, [ lc($cat), lc($2), $3 ];
	26					146
127							} elsif ( $key eq 'GN' ) {
128	24	50				49	next if ( $val eq 'and' );
129	24					66	for my $a ( split /;\s+/, $val ) {
130	24					79	my ( $ak, $av ) = split /=/, $a, 2;
131	24	50				124	$mfeat->add_attr( "gene_" . lc($ak) => [ $av ? split( /\s,\s/, $av ) : '' ] );
132							}
133							} elsif ( $key eq 'OS' ) {
134							} elsif ( $key eq 'OG' ) {
135							} elsif ( $key eq 'OC' ) {
136							} elsif ( $key eq 'OX' ) {
137	25	50				92	if ( $val =~ /NCBI_TaxID=(\w+)/ ) {
138	25					78	$mfeat->add_attr( 'ncbi_taxid' => $1 );
139							} else {
140	0					0	confess "$val doesn't look like NCBI_TaxID";
141							}
142							} elsif ( $key eq 'OH' ) {
143							} elsif ( $key eq 'RN' ) {
144
145							} elsif ( $key eq 'RP' ) {
146							# rn required
147							} elsif ( $key eq 'RC' ) {
148							# rn required
149							} elsif ( $key eq 'RX' ) {
150							# rn required
151							} elsif ( $key eq 'RL' ) {
152							# rn required
153							# In UniProt release 1.12 of 6/21/04, there is a new RG
154							# (Reference Group) line, which references a group instead of
155							# an author. Each block must have at least 1 RA or RG line.
156							} elsif ( $key eq 'RA' ) {
157							# rn required
158							} elsif ( $key eq 'RG' ) {
159							# rn required
160							} elsif ( $key eq "RT" ) {
161							} elsif ( $key eq 'CC' ) {
162							} elsif ( $key eq 'DR' ) {
163	261					1308	my ( $database, $primaryid, $optional, @comment ) = split /;\s+/, $val;
164	261					636	my $comment = join " ", @comment;
165
166							# drop leading and training spaces and trailing .
167	261					410	$comment =~ s/\.\s*$//;
168
169	261					1061	$mfeat->add_attr(
170							'dbxref' => {
171							db => $database,
172							id => $primaryid,
173							optional_id => $optional,
174							comment => $comment
175							}
176							);
177							} elsif ( $key eq 'PE' ) {
178							} elsif ( $key eq 'KW' ) {
179							#cols = value.rstrip(";.").split('; ')
180							#record.keywords.extend(cols)
181							} elsif ( $key eq 'FT' ) {
182							#_read_ft($feat, line)
183							} elsif ( $key eq 'SQ' ) {
184							#SQ SEQUENCE XXXX AA; XXXXX MW; XXXXXXXXXXXXXXXX CRC64;
185	25					74	$val =~ s/^SEQUENCE\s+//;
186	25					125	my ( $length, $weight, $crc64 ) = split /;\s+/, $val;
187	25					98	$length =~ s/\s+AA$//;
188	25					75	$weight =~ s/\s+MW$//;
189	25					78	$crc64 =~ s/\s+CRC64$//;
190	25					173	$mfeat->add_attr(
191							'seq' => { length => int($length), molecular_weight => $weight + 0.0, crc64 => $crc64 } );
192							} elsif ( $key eq ' ' ) {
193	92					180	$val =~ y/A-Za-z//cd;
194	92					229	$sequence .= $val;
195							} elsif ( $key eq '//' ) {
196							# Join multiline data into one string
197							#record.description = " ".join(record.description)
198							#record.organism = " ".join(record.organism)
199							#record.organelle = record.organelle.rstrip()
200							#for reference in record.references:
201							#reference.authors = " ".join(reference.authors).rstrip(";")
202							#reference.title = " ".join(reference.title).rstrip(";")
203							#if reference.title.startswith('"') and reference.title.endswith('"'):
204							#reference.title = reference.title[1:-1] # remove quotes
205							#reference.location = " ".join(reference.location)
206							#record.sequence = "".join(_sequence_lines)
207							#return record
208	0					0	last;
209							} else {
210	0					0	die sprintf( "Unknown keyword '%s' found", $key );
211							}
212							}
213
214	25					58	$mfeat->add_attr(description => _merge_descriptions(\%description));
215	25					70	$mfeat->add_attr( ID => $mfeat->attr_first('accession_number') );
216
217							die "no sequence object found in " . $mfeat->id . Dumper($mfeat)
218	25	50				68	unless ( $mfeat->attr->{seq} );
219							die "CRC64 does not match for " . $mfeat->id
220	25	50				81	unless ( crc64($sequence) eq $mfeat->attr->{seq}[0]{crc64} );
221	25					69	$mfeat->attr->{seq}[0]{data} = $sequence;
222
223	25					234	return $mfeat;
224							}
225
226							sub _merge_descriptions {
227	25			25		48	my $desc = shift;
228	25					30	my %desc_new;
229	25					80	while(my ($lvl, $data) = each %$desc) {
230	25					49	for my $d (@$data) {
231	49					89	my $cat = shift @$d;
232	49					78	my $scat = shift @$d;
233
234	49	100				103	$cat .= "_" . $scat if($scat);
235
236	49		50			251	$desc_new{$lvl}{$cat} //= [];
237
238	49					69	push @{$desc_new{$lvl}{$cat}}, @$d;
	49					198
239							}
240
241							}
242	25					43	return { %{delete $desc_new{main}}, %desc_new };
	25					179
243
244							}
245
246							__PACKAGE__->meta->make_immutable();