File Coverage

Bio/SearchIO/gmap_f9.pm
Criterion Covered Total %
statement 78 111 70.2
branch 15 64 23.4
condition 5 9 55.5
subroutine 8 10 80.0
pod 1 1 100.0
total 107 195 54.8


line stmt bran cond sub pod time code
1             # $Id: gmap_f9.pm 15987 2009-08-18 21:08:55Z lstein $
2             #
3             # BioPerl module for Bio::SearchIO::gmap_f9
4             #
5             # Cared for by George Hartzell
6             #
7             # Copyright George Hartzell
8             #
9             # You may distribute this module under the same terms as perl itself
10              
11             # POD documentation - main docs before the code
12              
13             =head1 NAME
14              
15             Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format)
16              
17             =head1 SYNOPSIS
18              
19             # Do not use this object directly - it is used as part of the
20             # Bio::SearchIO system.
21              
22             use Bio::SearchIO;
23             my $searchio = Bio::SearchIO->new(-format => 'gmap',
24             -file => 't/data/her2.gmapz');
25             while( my $result = $searchio->next_result ) {
26             while( my $hit = $result->next_hit ) {
27             while( my $hsp = $hit->next_hsp ) {
28             # ...
29             }
30             }
31             }
32              
33              
34             =head1 DESCRIPTION
35              
36             This object encapsulated the necessary methods for generating events
37             suitable for building Bio::Search objects from a GMAP "compressed"
38             report (from gmap run with -Z flag) Read the L for more
39             information about how to use this.
40              
41             =head2 REVERSE STRAND AND BIOPERL COORDINATES
42              
43             I believe that I'm doing the correct thing when reporting hits on the
44             negative strand of the genome. In particular, I've compared the
45             "exons" this code generates with the set returned by ncbi's megablast
46             web service. NCBI's hsp's are ordered differently and have a
47             different genomic location (off by ~18,000,000 bases, padding?) but
48             the starts, ends, and lengths were similar and my strand handling
49             matches theirs. E.g.
50              
51             CDNA GENOME
52             start end strand start end strand
53              
54             blast
55             1913 2989 1 86236731 86237808 -1
56             1 475 1 86260509 86260983 -1
57             1510 1727 1 86240259 86240476 -1
58             841 989 1 86243034 86243182 -1
59             1381 1514 1 86240630 86240763 -1
60             989 1122 1 86242457 86242590 -1
61             599 729 1 86247470 86247600 -1
62             473 608 1 86259972 86260107 -1
63             1255 1382 1 86240837 86240964 -1
64             730 842 1 86244040 86244152 -1
65             1813 1921 1 86238123 86238231 -1
66             1725 1814 1 86239747 86239836 -1
67             1167 1256 1 86241294 86241383 -1
68             1120 1188 1 86242319 86242387 -1
69              
70             gmap
71             1 475 1 104330509 104330983 -1
72             476 600 1 104329980 104330104 -1
73             601 729 1 104317470 104317598 -1
74             730 841 1 104314041 104314152 -1
75             842 989 1 104313034 104313181 -1
76             990 1121 1 104312458 104312589 -1
77             1122 1187 1 104312320 104312385 -1
78             1188 1256 1 104311294 104311362 -1
79             1257 1382 1 104310837 104310962 -1
80             1383 1511 1 104310633 104310761 -1
81             1512 1726 1 104310260 104310474 -1
82             1727 1814 1 104309747 104309834 -1
83             1815 1917 1 104308127 104308229 -1
84             1918 2989 1 104306731 104307802 -1
85              
86             =head1 FEEDBACK
87              
88             =head2 Mailing Lists
89              
90             User feedback is an integral part of the evolution of this and other
91             Bioperl modules. Send your comments and suggestions preferably to
92             the Bioperl mailing list. Your participation is much appreciated.
93              
94             bioperl-l@bioperl.org - General discussion
95             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
96              
97             =head2 Reporting Bugs
98              
99             Report bugs to the Bioperl bug tracking system to help us keep track
100             of the bugs and their resolution. Bug reports can be submitted via
101             the web:
102              
103             https://github.com/bioperl/bioperl-live/issues
104              
105             =head1 AUTHOR - George Hartzell
106              
107             Email hartzell@alerce.com
108              
109             =head1 CONTRIBUTORS
110              
111             Additional contributors names and emails here
112              
113             =head1 APPENDIX
114              
115             The rest of the documentation details each of the object methods.
116             Internal methods are usually preceded with an underscore (_).
117              
118             =cut
119              
120              
121             # Let the code begin...
122              
123              
124             package Bio::SearchIO::gmap_f9;
125 1     1   6 use strict;
  1         2  
  1         25  
126 1     1   4 use warnings;
  1         2  
  1         29  
127              
128 1     1   361 use Bio::Search::Hit::GenericHit;
  1         3  
  1         34  
129 1     1   439 use Bio::Search::HSP::GenericHSP;
  1         4  
  1         59  
130              
131 1     1   11 use base qw(Bio::SearchIO );
  1         2  
  1         113  
132              
133 1     1   6 use Data::Dumper;
  1         1  
  1         1105  
134              
135             =head2 next_result
136              
137             Title : next_result
138             Usage : $result = stream->next_result
139             Function: Reads the next ResultI object from the stream and returns it.
140             Returns : A Bio::Search::Result::ResultI object
141             Args : n/a
142              
143             =cut
144              
145             sub next_result {
146 62     62 1 163 my $self = shift;
147              
148 62         192 my $info = [];
149 62         252 my $result;
150             my $hit;
151 62         0 my @hsp_info;
152 62         0 my $previous_hit_pos;
153              
154 62         296 while ( $_ = $self->_readline ) {
155 86654 100       129965 if ( $_ =~ /^>/ ) { # looking at the start of a result
156 118 100       304 if ($result) { # and done if there's one in progress
157 57         281 $self->_pushback($_);
158 57         910 goto DONE;
159             }
160             else { # otherwise start a new one.
161 61         644 my ($id, $desc, $md5) = m|>([^ ]*)\s*(.*)\s*(?:md5:(.*))?|;
162              
163 61         419 $result = Bio::Search::Result::GenericResult->new();
164 61         254 $result->algorithm('gmap');
165 61         263 $result->query_name($id);
166 61         190 $result->query_accession($id);
167 61         285 $result->query_description($desc);
168             #$self->warn("Take care of MD5!\n");
169              
170 61   33     471 $hit ||= Bio::Search::Hit::GenericHit->new( -name =>
171             "NONE_SPECIFIED");
172             }
173             }
174             else { # add another position to the hit/hsp
175             # 468 H 1956 C -14:104307764 2298317517 C H
176             # 468 1957 A -14:104307763 2298317516 A
177 86536         76862 my $c; # info about a column
178             ($c->{query_aa_pos}, $c->{query_aa}, $c->{query_pos},
179             $c->{query_base},
180             $c->{hit_strand}, $c->{hit_chromo}, $c->{hit_pos},
181             $c->{hit_concat_pos}, $c->{hit_base}, $c->{hit_aa})
182 86536         674091 = ($_ =~
183             m|
184             (\d+)[ ]?(.)?[\t]
185             (\d+)[ ]?(.)?[\t]
186             # TODO chromosome isn't a number... X, Y, MT....
187             (\+\|\-)([\dxXyY]+\|MT):(\d+)[ ](\d+)[ ](.)
188             [\t]?(.)?
189             |xo
190             );
191              
192 86536 100 100     304362 if ($previous_hit_pos &&
193             (abs($c->{hit_pos} - $previous_hit_pos) > 1)) {
194 541   33     1593 $hit ||= Bio::Search::Hit::GenericHit->new( -name =>
195             "NONE_SPECIFIED",
196             );
197 541         2415 $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) );
198 541         99593 @hsp_info = ();
199             }
200 86536         108959 push @hsp_info, $c;
201 86536         177890 $previous_hit_pos = $c->{hit_pos};
202             }
203             }
204              
205             DONE:
206 62 100       316 if ($result) {
207 61 50       489 $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) ) if (@hsp_info);
208              
209 61         141 my ($hit_length,$query_length);
210 61         307 for my $hsp ($hit->hsps) {
211 602         1382 $hit_length += $hsp->length();
212 602         1025 $query_length += $hsp->length('query');
213             }
214 61         419 $hit->length($hit_length);
215 61         264 $hit->query_length($query_length);
216             # update this now that we actually know something useful.q
217 61         427 $hit->name($hsp_info[0]->{hit_chromo});
218              
219 61 50       465 $result->add_hit($hit) if ($hit);
220             }
221              
222 62         25944 return($result);
223             }
224              
225              
226              
227             sub _hsp_from_info {
228 602     602   1200 my $self = shift;
229 602         1139 my $info = shift;
230 602         1005 my $a = {}; # args w/ which we'll create hsp
231 602         1256 my $hsp;
232             my $identical;
233              
234 602         1583 $a->{-algorithm} = 'GMAP';
235              
236 602         915 for my $c (@{$info}) {
  602         1950  
237 86536         100834 $a->{-query_seq} .= $c->{query_base};
238 86536         95273 $a->{-hit_seq} .= $c->{hit_base};
239 86536 100       108745 $a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' ';
240 86536 100       119914 $identical++ if ( $c->{query_base} eq $c->{hit_base} );
241             }
242              
243 602         1853 $a->{-query_seq} =~ s| |\-|g; # switch to bioperl gaps.
244 602         1794 $a->{-hit_seq} =~ s| |\-|g;
245              
246 602         1867 $a->{-conserved} = $a->{-identical} = $identical;
247              
248             # use the coordinates from from gmap's -f 9 output to
249             # determine whether gmap revcomped the query sequence
250             # to generate the alignment. Note that this is not
251             # the same as the cDNA's sense/anti-sense-ness.
252 602         1322 $a->{-stranded} = 'both';
253              
254 602         2307 $a->{-query_start} = $info->[0]->{query_pos};
255 602         1537 $a->{-query_end} = $info->[-1]->{query_pos};
256 602         2173 $a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1;
257              
258             # hit can be either strand, -f 9 output tells us which.
259             # we don't have to worry about it here, but telling the generichsp code
260             # that this hit is 'stranded', it compares the start and end positions
261             # sets it for us.
262 602         1582 $a->{-hit_start} = $info->[0]->{hit_pos};
263 602         1413 $a->{-hit_end} = $info->[-1]->{hit_pos};
264              
265 602         2063 $a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1;
266              
267             $a->{-hsp_length} =
268             $a->{-query_length} > $a->{-hit_length} ?
269 602 50       2032 $a->{-query_length} : $a->{-hit_length};
270              
271 602         6690 $hsp = Bio::Search::HSP::GenericHSP->new( %$a );
272              
273 602         5302 return $hsp;
274             }
275              
276             # TODO (adjust regexp to swallow lines w/out md5 sig's.
277             sub _parse_path_header {
278 0     0     my $self = shift;
279 0           my $path_line = shift;
280 0           my $path = {};
281              
282             (
283             $path->{query},
284             $path->{db},
285             $path->{path_num},
286             $path->{path_total_num},
287             $path->{query_length},
288             $path->{exon_count},
289             $path->{trimmed_coverage},
290             $path->{percent_identity},
291             $path->{query_start},
292             $path->{query_end},
293             $path->{whole_genome_start},
294             $path->{whole_genome_end},
295             $path->{chromosome},
296             $path->{chromo_start},
297             $path->{chromo_end},
298             $path->{strand},
299             $path->{sense},
300             $path->{md5},
301 0           ) =
302             ($_ =~ qr|
303             >
304             ([^ ]*)[ ] # the query id}, followed by a space
305             ([^ ]*)[ ] # the genome database, followed by a space
306             (\d+)/(\d+)[ ] # path_num/path_total_num (e.g. 3/12)
307             (\d+)[ ] # query length, followed by a space
308             (\d+)[ ] # hsp/exon count, followed by a space
309             (\d+\.\d*)[ ] # trimmed coverage
310             (\d+\.\d*)[ ] # percent identity
311             (\d+)\.\.(\d+)[ ] # query start .. query end, followed by space
312             (\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space
313             (\d+): # chromosome number
314             (\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space
315             ([+-])[ ] # strand, followed by a space
316             dir:(.*) # dir:sense or dir:antisense
317             [ ]md5:([\dabcdefg]+) # md5 signature
318             |x
319             );
320              
321 0 0         $path->{query} or $self->throw("query was not found in path line.");
322 0 0         $path->{db} or $self->throw("db was not found in path line.");
323 0 0         $path->{path_num} or $self->throw("path_num was not found in path line.");
324             $path->{path_total_num} or
325 0 0         $self->throw("path_total_num was not found in path line.");
326             $path->{query_length} or
327 0 0         $self->throw("query_length was not found in path line.");
328             $path->{exon_count} or
329 0 0         $self->throw("exon_count was not found in path line.");
330             $path->{trimmed_coverage} or
331 0 0         $self->throw("trimmed_coverage was not found in path line.");
332             $path->{percent_identity} or
333 0 0         $self->throw("percent_identity was not found in path line.");
334             $path->{query_start} or
335 0 0         $self->throw("query_start was not found in path line.");
336             $path->{query_end} or
337 0 0         $self->throw("query_end was not found in path line.");
338             $path->{whole_genome_start} or
339 0 0         $self->throw("whole_genome_start was not found in path line.");
340             $path->{whole_genome_end} or
341 0 0         $self->throw("whole_genome_end was not found in path line.");
342             $path->{chromosome} or
343 0 0         $self->throw("chromosome was not found in path line.");
344             $path->{chromo_start} or
345 0 0         $self->throw("chromo_start was not found in path line.");
346             $path->{chromo_end} or
347 0 0         $self->throw("chromo_end was not found in path line.");
348 0 0         $path->{strand} or $self->throw("strand was not found in path line.");
349 0 0         $path->{sense} or $self->throw("sense was not found in path line.");
350              
351 0           return $path;
352             }
353              
354             sub _parse_alignment_line {
355 0     0     my $self = shift;
356 0           my $a_line = shift;
357 0           my $align = {};
358              
359             (
360             $align->{chromo_start},
361             $align->{chromo_end},
362             $align->{query_start},
363             $align->{query_end},
364             $align->{percent_identity},
365             $align->{align_length},
366             $align->{intron_length},
367 0           ) =
368             ($_ =~ qr|
369             [\t]
370             ([\d]+)[ ] # start in chromosome coord.
371             ([\d]+)[ ] # end in chromosome coord.
372             ([\d]+)[ ] # start in query coord.
373             ([\d]+)[ ] # end in query coord.
374             ([\d]+) # percent identity (as integer)
375             [\t].*[\t] # skip the edit script
376             ([\d]+) # length of alignment block.
377             [\t]*([\d]+)* # length of following intron.
378             |x
379             );
380              
381             $align->{chromo_start}
382 0 0         or $self->throw("chromo_start missing in alignment line.");
383             $align->{chromo_end},
384 0 0         or $self->throw("chromo_end was missing in alignment line.");
385             $align->{query_start},
386 0 0         or $self->throw("query_start was missing in alignment line.");
387             $align->{query_end},
388 0 0         or $self->throw("query_end was missing in alignment line.");
389             $align->{percent_identity},
390 0 0         or $self->throw("percent_identity was missing in alignment line.");
391             $align->{align_length},
392 0 0         or $self->throw("align_length was missing in alignment line.");
393              
394 0           return $align;
395             }
396              
397             1;