File Coverage

Bio/SearchIO/gmap_f9.pm

Criterion	Covered	Total	%
statement	78	111	70.2
branch	15	64	23.4
condition	5	9	55.5
subroutine	8	10	80.0
pod	1	1	100.0
total	107	195	54.8

line	stmt	bran	cond	sub	pod	time	code
1							# $Id: gmap_f9.pm 15987 2009-08-18 21:08:55Z lstein $
2							#
3							# BioPerl module for Bio::SearchIO::gmap_f9
4							#
5							# Cared for by George Hartzell
6							#
7							# Copyright George Hartzell
8							#
9							# You may distribute this module under the same terms as perl itself
10
11							# POD documentation - main docs before the code
12
13							=head1 NAME
14
15							Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format)
16
17							=head1 SYNOPSIS
18
19							# Do not use this object directly - it is used as part of the
20							# Bio::SearchIO system.
21
22							use Bio::SearchIO;
23							my $searchio = Bio::SearchIO->new(-format => 'gmap',
24							-file => 't/data/her2.gmapz');
25							while( my $result = $searchio->next_result ) {
26							while( my $hit = $result->next_hit ) {
27							while( my $hsp = $hit->next_hsp ) {
28							# ...
29							}
30							}
31							}
32
33
34							=head1 DESCRIPTION
35
36							This object encapsulated the necessary methods for generating events
37							suitable for building Bio::Search objects from a GMAP "compressed"
38							report (from gmap run with -Z flag) Read the L for more
39							information about how to use this.
40
41							=head2 REVERSE STRAND AND BIOPERL COORDINATES
42
43							I believe that I'm doing the correct thing when reporting hits on the
44							negative strand of the genome. In particular, I've compared the
45							"exons" this code generates with the set returned by ncbi's megablast
46							web service. NCBI's hsp's are ordered differently and have a
47							different genomic location (off by ~18,000,000 bases, padding?) but
48							the starts, ends, and lengths were similar and my strand handling
49							matches theirs. E.g.
50
51							CDNA GENOME
52							start end strand start end strand
53
54							blast
55							1913 2989 1 86236731 86237808 -1
56							1 475 1 86260509 86260983 -1
57							1510 1727 1 86240259 86240476 -1
58							841 989 1 86243034 86243182 -1
59							1381 1514 1 86240630 86240763 -1
60							989 1122 1 86242457 86242590 -1
61							599 729 1 86247470 86247600 -1
62							473 608 1 86259972 86260107 -1
63							1255 1382 1 86240837 86240964 -1
64							730 842 1 86244040 86244152 -1
65							1813 1921 1 86238123 86238231 -1
66							1725 1814 1 86239747 86239836 -1
67							1167 1256 1 86241294 86241383 -1
68							1120 1188 1 86242319 86242387 -1
69
70							gmap
71							1 475 1 104330509 104330983 -1
72							476 600 1 104329980 104330104 -1
73							601 729 1 104317470 104317598 -1
74							730 841 1 104314041 104314152 -1
75							842 989 1 104313034 104313181 -1
76							990 1121 1 104312458 104312589 -1
77							1122 1187 1 104312320 104312385 -1
78							1188 1256 1 104311294 104311362 -1
79							1257 1382 1 104310837 104310962 -1
80							1383 1511 1 104310633 104310761 -1
81							1512 1726 1 104310260 104310474 -1
82							1727 1814 1 104309747 104309834 -1
83							1815 1917 1 104308127 104308229 -1
84							1918 2989 1 104306731 104307802 -1
85
86							=head1 FEEDBACK
87
88							=head2 Mailing Lists
89
90							User feedback is an integral part of the evolution of this and other
91							Bioperl modules. Send your comments and suggestions preferably to
92							the Bioperl mailing list. Your participation is much appreciated.
93
94							bioperl-l@bioperl.org - General discussion
95							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
96
97							=head2 Reporting Bugs
98
99							Report bugs to the Bioperl bug tracking system to help us keep track
100							of the bugs and their resolution. Bug reports can be submitted via
101							the web:
102
103							https://github.com/bioperl/bioperl-live/issues
104
105							=head1 AUTHOR - George Hartzell
106
107							Email hartzell@alerce.com
108
109							=head1 CONTRIBUTORS
110
111							Additional contributors names and emails here
112
113							=head1 APPENDIX
114
115							The rest of the documentation details each of the object methods.
116							Internal methods are usually preceded with an underscore (_).
117
118							=cut
119
120
121							# Let the code begin...
122
123
124							package Bio::SearchIO::gmap_f9;
125	1			1		6	use strict;
	1					2
	1					25
126	1			1		4	use warnings;
	1					2
	1					29
127
128	1			1		361	use Bio::Search::Hit::GenericHit;
	1					3
	1					34
129	1			1		439	use Bio::Search::HSP::GenericHSP;
	1					4
	1					59
130
131	1			1		11	use base qw(Bio::SearchIO );
	1					2
	1					113
132
133	1			1		6	use Data::Dumper;
	1					1
	1					1105
134
135							=head2 next_result
136
137							Title : next_result
138							Usage : $result = stream->next_result
139							Function: Reads the next ResultI object from the stream and returns it.
140							Returns : A Bio::Search::Result::ResultI object
141							Args : n/a
142
143							=cut
144
145							sub next_result {
146	62			62	1	163	my $self = shift;
147
148	62					192	my $info = [];
149	62					252	my $result;
150							my $hit;
151	62					0	my @hsp_info;
152	62					0	my $previous_hit_pos;
153
154	62					296	while ( $_ = $self->_readline ) {
155	86654	100				129965	if ( $_ =~ /^>/ ) { # looking at the start of a result
156	118	100				304	if ($result) { # and done if there's one in progress
157	57					281	$self->_pushback($_);
158	57					910	goto DONE;
159							}
160							else { # otherwise start a new one.
161	61					644	my ($id, $desc, $md5) = m\|>([^ ])\s(.)\s(?:md5:(.*))?\|;
162
163	61					419	$result = Bio::Search::Result::GenericResult->new();
164	61					254	$result->algorithm('gmap');
165	61					263	$result->query_name($id);
166	61					190	$result->query_accession($id);
167	61					285	$result->query_description($desc);
168							#$self->warn("Take care of MD5!\n");
169
170	61		33			471	$hit \|\|= Bio::Search::Hit::GenericHit->new( -name =>
171							"NONE_SPECIFIED");
172							}
173							}
174							else { # add another position to the hit/hsp
175							# 468 H 1956 C -14:104307764 2298317517 C H
176							# 468 1957 A -14:104307763 2298317516 A
177	86536					76862	my $c; # info about a column
178							($c->{query_aa_pos}, $c->{query_aa}, $c->{query_pos},
179							$c->{query_base},
180							$c->{hit_strand}, $c->{hit_chromo}, $c->{hit_pos},
181							$c->{hit_concat_pos}, $c->{hit_base}, $c->{hit_aa})
182	86536					674091	= ($_ =~
183							m\|
184							(\d+)[ ]?(.)?[\t]
185							(\d+)[ ]?(.)?[\t]
186							# TODO chromosome isn't a number... X, Y, MT....
187							(\+\\|\-)([\dxXyY]+\\|MT):(\d+)[ ](\d+)[ ](.)
188							[\t]?(.)?
189							\|xo
190							);
191
192	86536	100	100			304362	if ($previous_hit_pos &&
193							(abs($c->{hit_pos} - $previous_hit_pos) > 1)) {
194	541		33			1593	$hit \|\|= Bio::Search::Hit::GenericHit->new( -name =>
195							"NONE_SPECIFIED",
196							);
197	541					2415	$hit->add_hsp( $self->_hsp_from_info(\@hsp_info) );
198	541					99593	@hsp_info = ();
199							}
200	86536					108959	push @hsp_info, $c;
201	86536					177890	$previous_hit_pos = $c->{hit_pos};
202							}
203							}
204
205							DONE:
206	62	100				316	if ($result) {
207	61	50				489	$hit->add_hsp( $self->_hsp_from_info(\@hsp_info) ) if (@hsp_info);
208
209	61					141	my ($hit_length,$query_length);
210	61					307	for my $hsp ($hit->hsps) {
211	602					1382	$hit_length += $hsp->length();
212	602					1025	$query_length += $hsp->length('query');
213							}
214	61					419	$hit->length($hit_length);
215	61					264	$hit->query_length($query_length);
216							# update this now that we actually know something useful.q
217	61					427	$hit->name($hsp_info[0]->{hit_chromo});
218
219	61	50				465	$result->add_hit($hit) if ($hit);
220							}
221
222	62					25944	return($result);
223							}
224
225
226
227							sub _hsp_from_info {
228	602			602		1200	my $self = shift;
229	602					1139	my $info = shift;
230	602					1005	my $a = {}; # args w/ which we'll create hsp
231	602					1256	my $hsp;
232							my $identical;
233
234	602					1583	$a->{-algorithm} = 'GMAP';
235
236	602					915	for my $c (@{$info}) {
	602					1950
237	86536					100834	$a->{-query_seq} .= $c->{query_base};
238	86536					95273	$a->{-hit_seq} .= $c->{hit_base};
239	86536	100				108745	$a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' ';
240	86536	100				119914	$identical++ if ( $c->{query_base} eq $c->{hit_base} );
241							}
242
243	602					1853	$a->{-query_seq} =~ s\| \|\-\|g; # switch to bioperl gaps.
244	602					1794	$a->{-hit_seq} =~ s\| \|\-\|g;
245
246	602					1867	$a->{-conserved} = $a->{-identical} = $identical;
247
248							# use the coordinates from from gmap's -f 9 output to
249							# determine whether gmap revcomped the query sequence
250							# to generate the alignment. Note that this is not
251							# the same as the cDNA's sense/anti-sense-ness.
252	602					1322	$a->{-stranded} = 'both';
253
254	602					2307	$a->{-query_start} = $info->[0]->{query_pos};
255	602					1537	$a->{-query_end} = $info->[-1]->{query_pos};
256	602					2173	$a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1;
257
258							# hit can be either strand, -f 9 output tells us which.
259							# we don't have to worry about it here, but telling the generichsp code
260							# that this hit is 'stranded', it compares the start and end positions
261							# sets it for us.
262	602					1582	$a->{-hit_start} = $info->[0]->{hit_pos};
263	602					1413	$a->{-hit_end} = $info->[-1]->{hit_pos};
264
265	602					2063	$a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1;
266
267							$a->{-hsp_length} =
268							$a->{-query_length} > $a->{-hit_length} ?
269	602	50				2032	$a->{-query_length} : $a->{-hit_length};
270
271	602					6690	$hsp = Bio::Search::HSP::GenericHSP->new( %$a );
272
273	602					5302	return $hsp;
274							}
275
276							# TODO (adjust regexp to swallow lines w/out md5 sig's.
277							sub _parse_path_header {
278	0			0			my $self = shift;
279	0						my $path_line = shift;
280	0						my $path = {};
281
282							(
283							$path->{query},
284							$path->{db},
285							$path->{path_num},
286							$path->{path_total_num},
287							$path->{query_length},
288							$path->{exon_count},
289							$path->{trimmed_coverage},
290							$path->{percent_identity},
291							$path->{query_start},
292							$path->{query_end},
293							$path->{whole_genome_start},
294							$path->{whole_genome_end},
295							$path->{chromosome},
296							$path->{chromo_start},
297							$path->{chromo_end},
298							$path->{strand},
299							$path->{sense},
300							$path->{md5},
301	0						) =
302							($_ =~ qr\|
303							>
304							([^ ]*)[ ] # the query id}, followed by a space
305							([^ ]*)[ ] # the genome database, followed by a space
306							(\d+)/(\d+)[ ] # path_num/path_total_num (e.g. 3/12)
307							(\d+)[ ] # query length, followed by a space
308							(\d+)[ ] # hsp/exon count, followed by a space
309							(\d+\.\d*)[ ] # trimmed coverage
310							(\d+\.\d*)[ ] # percent identity
311							(\d+)\.\.(\d+)[ ] # query start .. query end, followed by space
312							(\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space
313							(\d+): # chromosome number
314							(\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space
315							([+-])[ ] # strand, followed by a space
316							dir:(.*) # dir:sense or dir:antisense
317							[ ]md5:([\dabcdefg]+) # md5 signature
318							\|x
319							);
320
321	0	0					$path->{query} or $self->throw("query was not found in path line.");
322	0	0					$path->{db} or $self->throw("db was not found in path line.");
323	0	0					$path->{path_num} or $self->throw("path_num was not found in path line.");
324							$path->{path_total_num} or
325	0	0					$self->throw("path_total_num was not found in path line.");
326							$path->{query_length} or
327	0	0					$self->throw("query_length was not found in path line.");
328							$path->{exon_count} or
329	0	0					$self->throw("exon_count was not found in path line.");
330							$path->{trimmed_coverage} or
331	0	0					$self->throw("trimmed_coverage was not found in path line.");
332							$path->{percent_identity} or
333	0	0					$self->throw("percent_identity was not found in path line.");
334							$path->{query_start} or
335	0	0					$self->throw("query_start was not found in path line.");
336							$path->{query_end} or
337	0	0					$self->throw("query_end was not found in path line.");
338							$path->{whole_genome_start} or
339	0	0					$self->throw("whole_genome_start was not found in path line.");
340							$path->{whole_genome_end} or
341	0	0					$self->throw("whole_genome_end was not found in path line.");
342							$path->{chromosome} or
343	0	0					$self->throw("chromosome was not found in path line.");
344							$path->{chromo_start} or
345	0	0					$self->throw("chromo_start was not found in path line.");
346							$path->{chromo_end} or
347	0	0					$self->throw("chromo_end was not found in path line.");
348	0	0					$path->{strand} or $self->throw("strand was not found in path line.");
349	0	0					$path->{sense} or $self->throw("sense was not found in path line.");
350
351	0						return $path;
352							}
353
354							sub _parse_alignment_line {
355	0			0			my $self = shift;
356	0						my $a_line = shift;
357	0						my $align = {};
358
359							(
360							$align->{chromo_start},
361							$align->{chromo_end},
362							$align->{query_start},
363							$align->{query_end},
364							$align->{percent_identity},
365							$align->{align_length},
366							$align->{intron_length},
367	0						) =
368							($_ =~ qr\|
369							[\t]
370							([\d]+)[ ] # start in chromosome coord.
371							([\d]+)[ ] # end in chromosome coord.
372							([\d]+)[ ] # start in query coord.
373							([\d]+)[ ] # end in query coord.
374							([\d]+) # percent identity (as integer)
375							[\t].*[\t] # skip the edit script
376							([\d]+) # length of alignment block.
377							[\t]([\d]+) # length of following intron.
378							\|x
379							);
380
381							$align->{chromo_start}
382	0	0					or $self->throw("chromo_start missing in alignment line.");
383							$align->{chromo_end},
384	0	0					or $self->throw("chromo_end was missing in alignment line.");
385							$align->{query_start},
386	0	0					or $self->throw("query_start was missing in alignment line.");
387							$align->{query_end},
388	0	0					or $self->throw("query_end was missing in alignment line.");
389							$align->{percent_identity},
390	0	0					or $self->throw("percent_identity was missing in alignment line.");
391							$align->{align_length},
392	0	0					or $self->throw("align_length was missing in alignment line.");
393
394	0						return $align;
395							}
396
397							1;