File Coverage

blib/lib/Bio/WebService/LANL/SequenceLocator.pm
Criterion Covered Total %
statement 27 178 15.1
branch 0 74 0.0
condition 0 33 0.0
subroutine 9 25 36.0
pod 1 6 16.6
total 37 316 11.7


line stmt bran cond sub pod time code
1 1     1   1754 use strictures 1;
  1         902  
  1         36  
2 1     1   1987 use utf8;
  1         14  
  1         6  
3 1     1   75 use 5.018;
  1         3  
  1         70  
4              
5             =head1 NAME
6              
7             Bio::WebService::LANL::SequenceLocator - Locate sequences within HIV using LANL's web tool
8              
9             =head1 SYNOPSIS
10              
11             use Bio::WebService::LANL::SequenceLocator;
12            
13             my $locator = Bio::WebService::LANL::SequenceLocator->new(
14             agent_string => 'Your Organization - you@example.com',
15             );
16             my @sequences = $locator->find([
17             "agcaatcagatggtcagccaaaattgccctatagtgcagaacatccaggggcaagtggtacatcaggccatatcacctagaactttaaatgca",
18             ]);
19              
20             See L below.
21              
22             =head1 DESCRIPTION
23              
24             This library provides simple programmatic access to
25             L
26             web tool and is also used to power
27             L
28             for the same tool (via L).
29              
30             Nearly all of the information output by LANL's sequence locator is parsed and
31             provided by this library, though the results do vary slightly depending on the
32             base type of the query sequence. Multiple query sequences can be located at
33             the same time and results will be returned for all.
34              
35             Results are extracted from both tab-delimited files provided by LANL as well as
36             the HTML itself.
37              
38             =head1 EXAMPLE RESULTS
39              
40             # Using @sequences from the SYNOPSIS above
41             use JSON;
42             print encode_json(\@sequences);
43            
44             __END__
45             [
46             {
47             "query" : "sequence_1",
48             "query_sequence" : "AGCAATCAGATGGTCAGCCAAAATTGCCCTATAGTGCAGAACATCCAGGGGCAAGTGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
49             "base_type" : "nucleotide",
50             "reverse_complement" : "0",
51             "alignment" : "\n Query AGCAATCAGA TGGTCAGCCA AAATTGCCCT ATAGTGCAGA ACATCCAGGG 50\n :::::::: ::::::::: ::::: :::: :::::::::: :::::::::: \n HXB2 AGCAATCA-- -GGTCAGCCA AAATTACCCT ATAGTGCAGA ACATCCAGGG 1208\n\n Query GCAAGTGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 93\n :::: ::::: :::::::::: :::::::::: :::::::::: ::: \n HXB2 GCAAATGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 1251\n\n ",
52             "hxb2_sequence" : "AGCAATCA---GGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
53             "similarity_to_hxb2" : "94.6",
54             "start" : "373",
55             "end" : "462",
56             "genome_start" : "1162",
57             "genome_end" : "1251",
58             "polyprotein" : "Gag",
59             "region_names" : [
60             "Gag",
61             "p17",
62             "p24"
63             ],
64             "regions" : [
65             {
66             "cds" : "Gag",
67             "aa_from_protein_start" : [ "125", "154" ],
68             "na_from_cds_start" : [ "373", "462" ],
69             "na_from_hxb2_start" : [ "1162", "1251" ],
70             "na_from_query_start" : [ "1", "93" ],
71             "protein_translation" : "SNQMVSQNCPIVQNIQGQVVHQAISPRTLNA"
72             },
73             {
74             "cds" : "p17",
75             "aa_from_protein_start" : [ "125", "132" ],
76             "na_from_cds_start" : [ "373", "396" ],
77             "na_from_hxb2_start" : [ "1162", "1185" ],
78             "na_from_query_start" : [ "1", "27" ],
79             "protein_translation" : "SNQMVSQNC"
80             },
81             {
82             "cds" : "p24",
83             "aa_from_protein_start" : [ "1", "22" ],
84             "na_from_cds_start" : [ "1", "66" ],
85             "na_from_hxb2_start" : [ "1186", "1251" ],
86             "na_from_query_start" : [ "28", "93" ],
87             "protein_translation" : "PIVQNIQGQVVHQAISPRTLNA"
88             }
89             ]
90             }
91             ]
92              
93             =cut
94              
95             package Bio::WebService::LANL::SequenceLocator;
96              
97 1     1   954 use Moo;
  1         17906  
  1         6  
98 1     1   2924 use HTML::LinkExtor;
  1         21120  
  1         49  
99 1     1   1391 use HTML::TableExtract;
  1         11499  
  1         8  
100 1     1   962 use HTML::TokeParser;
  1         2395  
  1         29  
101 1     1   17439 use HTTP::Request::Common;
  1         37638  
  1         128  
102 1     1   1243 use List::AllUtils qw< pairwise part min max >;
  1         3235  
  1         3420  
103              
104             our $VERSION = 20140624;
105              
106             =head1 METHODS
107              
108             =head2 new
109              
110             Returns a new instance of this class. An optional parameter C
111             should be provided to identify yourself to LANL out of politeness. See the
112             L for an example.
113              
114             =cut
115              
116             has agent_string => (
117             is => 'ro',
118             lazy => 1,
119 0     0     builder => sub { '' },
120             );
121              
122             has agent => (
123             is => 'ro',
124             lazy => 1,
125             builder => sub {
126 0     0     require LWP::UserAgent;
127 0           my $self = shift;
128 0           my $agent = LWP::UserAgent->new(
129             agent => join(" ", __PACKAGE__ . "/$VERSION", $self->agent_string),
130             );
131 0           $agent->env_proxy;
132 0           return $agent;
133             },
134             );
135              
136             has lanl_base => (
137             is => 'ro',
138             lazy => 1,
139 0     0     builder => sub { 'http://www.hiv.lanl.gov' },
140             );
141              
142             has lanl_endpoint => (
143             is => 'ro',
144             lazy => 1,
145 0     0     builder => sub { shift->lanl_base . '/cgi-bin/LOCATE/locate.cgi' },
146             );
147              
148             has _bogus_slug => (
149             is => 'ro',
150             default => sub { 'BOGUS_SEQ_SO_TABULAR_FILES_ARE_LINKED_IN_OUTPUT' },
151             );
152              
153             sub _request {
154 0     0     my $self = shift;
155 0           my $req = shift;
156 0           my $response = $self->agent->request($req);
157              
158 0 0         if (not $response->is_success) {
159 0           warn sprintf "Request failed: %s %s -> %s\n",
160             $req->method, $req->uri, $response->status_line;
161 0           return;
162             }
163              
164 0           return $response->decoded_content;
165             }
166              
167             =head2 find
168              
169             Takes an array ref of sequence strings. Sequences may be in amino acids or
170             nucleotides and mixed freely. Sequences should not be in FASTA format.
171              
172             If sequence bases are not clearly nucleotides or clearly amino acids, LANL
173             seems to default to nucleotides. This can be an issue for some sequences since
174             the full alphabet for nucleotides overlaps with the alphabet for amino acids.
175             To overcome this problem, you may specify C<< base => 'nucleotide' >>
176             or C<< base => 'amino acid' >> after the array ref of sequences. This forces
177             every sequence to be interpreted as nucleotides or amino acids, so you cannot
178             mix base types in your sequences if you use this option. C, C, and
179             C are accepted aliases for C. C, C, C,
180             and C are accepted aliases for C.
181              
182             Returns a list of hashrefs when called in list context, otherwise returns an
183             arrayref of hashrefs.
184              
185             See L for the structure of the data returned.
186              
187             =cut
188              
189             sub find {
190 0     0 1   my ($self, $sequences, %args) = @_;
191              
192 0 0         my $content = $self->submit_sequences($sequences, %args)
193             or return;
194              
195 0           return $self->parse_html($content);
196             }
197              
198             sub submit_sequences {
199 0     0 0   my ($self, $sequences, %args) = @_;
200              
201 0 0         if (defined $args{base}) {
202 0           my $base = lc $args{base};
203 0 0         if ($base =~ /^n(uc(leotides?)?)?$/i) {
    0          
204 0           $args{base} = 1;
205             } elsif ($base =~ /^(a(mino( acids?)?)?|aa)$/i) {
206 0           $args{base} = 0;
207             } else {
208 0           warn "Unknown base type <$args{base}>, ignoring";
209 0           delete $args{base};
210             }
211             }
212              
213             # Submit multiple sequences at once using FASTA
214 0           my $fasta = join "\n", map {
215 0           ("> sequence_$_", $sequences->[$_ - 1])
216             } 1 .. @$sequences;
217              
218             # LANL only presents the parseable table.txt we want if there's more
219             # than a single sequence. We always add it so we can reliably skip it.
220 0           $fasta .= "\n> " . $self->_bogus_slug . "\n";
221              
222 0 0         return $self->_request(
223             POST $self->lanl_endpoint,
224             Content_Type => 'form-data',
225             Content => [
226             organism => 'HIV',
227             DoReverseComplement => 0,
228             SEQ => $fasta,
229             (defined $args{base}
230             ? ( base => $args{base} )
231             : ()),
232             ],
233             );
234             }
235              
236             sub parse_html {
237 0     0 0   my ($self, $content) = @_;
238              
239             # Fetch and parse the two tables provided as links which removes the need
240             # to parse all of the HTML.
241 0           my @results = $self->parse_tsv($content);
242              
243             # Now parse the table data from the HTML
244 0           my @tables = $self->parse_tables($content);
245              
246             # Extract the alignments, parsing the HTML a third time!
247 0           my @alignments = $self->parse_alignments($content);
248              
249 0 0 0       return unless @results and @tables and @alignments;
      0        
250              
251 0 0 0       unless (@results == @tables and @results == @alignments) {
252 0           warn "Tab-delimited results count doesn't match parsed HTML result count. Bug!\n";
253 0           warn "TSV: ", scalar @results, "\n";
254 0           warn "HTML tables: ", scalar @tables, "\n";
255 0           warn "HTML alignments: ", scalar @alignments, "\n";
256 0           return;
257             }
258              
259             @results = pairwise {
260 0           my $new = {
261             %$a,
262             base_type => $b->{base_type},
263             regions => $b->{rows},
264 0     0     region_names => [ map { $_->{cds} } @{$b->{rows}} ],
  0            
265             };
266 0           delete $new->{$_} for qw(protein protein_start protein_end);
267 0           $new;
268 0           } @results, @tables;
269              
270 0     0     @results = pairwise { +{ %$b, %$a } } @results, @alignments;
  0            
271              
272             # Fill in genome start/end for amino acid sequences
273 0           for my $r (@results) {
274 0 0         next unless $r->{base_type} eq 'amino acid';
275              
276 0 0 0       if ($r->{genome_start} or $r->{genome_end}) {
277 0           warn "Amino acid sequence with genome start/end already?!",
278             " query <$r->{query_sequence}>";
279 0           next;
280             }
281              
282 0           $r->{genome_start} = min map { $_->{na_from_hxb2_start}[0] } @{$r->{regions}};
  0            
  0            
283 0           $r->{genome_end} = max map { $_->{na_from_hxb2_start}[1] } @{$r->{regions}};
  0            
  0            
284             }
285              
286 0 0         return wantarray ? @results : \@results;
287             }
288              
289             sub parse_tsv {
290 0     0 0   my ($self, $content) = @_;
291 0           my @results;
292             my %urls;
293              
294             my $extract = HTML::LinkExtor->new(
295             sub {
296 0     0     my ($tag, %attr) = @_;
297 0 0 0       return unless $tag eq 'a' and $attr{href};
298 0 0         return unless $attr{href} =~ m{/(table|simple_results)\.txt$};
299 0           $urls{$1} = $attr{href};
300             },
301 0           $self->lanl_base,
302             );
303 0           $extract->parse($content);
304              
305 0           for my $table_name (qw(table simple_results)) {
306 0 0         next unless $urls{$table_name};
307 0 0         my $table = $self->_request(GET $urls{$table_name})
308             or next;
309              
310 0           my (@these_results, %seen);
311 0           my @lines = split "\n", $table;
312 0           my @fields = map {
313 0           s/^SeqName$/query/; # standard key
314 0           s/(?<=[a-z])(?=[A-Z])/_/g; # undo CamelCase
315 0           s/ +/_/g; # no spaces
316 0           y/A-Z/a-z/; # normalize to lowercase
317             # Account for the same field twice in the same data table
318 0 0         if ($seen{$_}++) {
319 0 0         $_ = /^(start|end)$/
320             ? "protein_$_"
321             : join "_", $_, $seen{$_};
322             }
323 0           $_;
324             } split "\t", shift @lines;
325              
326 0           for (@lines) {
327 0           my @values = split "\t";
328 0           my %data;
329 0           @data{@fields} = @values;
330              
331 0 0         next if $data{query} eq $self->_bogus_slug;
332              
333 0 0         $data{query_sequence} =~ s/\s+//g
334             if $data{query_sequence};
335 0           push @these_results, \%data;
336             }
337              
338             # Merge with existing results, if any
339             @results = @results
340 0 0   0     ? pairwise { +{ %$a, %$b } } @results, @these_results
  0            
341             : @these_results;
342             }
343              
344 0           return @results;
345             }
346              
347             sub parse_tables {
348 0     0 0   my ($self, $content) = @_;
349 0           my @tables;
350              
351 0           my %columns_for = (
352             'amino acid' => [
353             "CDS" => "cds",
354             "AA position relative to protein start in HXB2" => "aa_from_protein_start",
355             "AA position relative to query sequence start" => "aa_from_query_start",
356             "AA position relative to polyprotein start in HXB2" => "aa_from_polyprotein_start",
357             "NA position relative to CDS start in HXB2" => "aa_from_cds_start",
358             "NA position relative to HXB2 genome start" => "na_from_hxb2_start",
359             ],
360             'nucleotide' => [
361             "CDS" => "cds",
362             "Nucleotide position relative to CDS start in HXB2" => "na_from_cds_start",
363             "Nucleotide position relative to query sequence start" => "na_from_query_start",
364             "Nucleotide position relative to HXB2 genome start" => "na_from_hxb2_start",
365             "Amino Acid position relative to protein start in HXB2" => "aa_from_protein_start",
366             ],
367             );
368              
369 0           for my $base_type (sort keys %columns_for) {
370             my ($their_cols, $our_cols) = part {
371 0     0     state $i = 0;
372 0           $i++ % 2
373 0           } @{ $columns_for{$base_type} };
  0            
374              
375 0           my $extract = HTML::TableExtract->new( headers => $their_cols );
376 0           $extract->parse($content);
377              
378             # Examine all matching tables
379 0           for my $table ($extract->tables) {
380 0           my %table = (
381             coords => [$table->coords],
382             base_type => $base_type,
383             columns => $our_cols,
384             rows => [],
385             );
386 0           for my $row ($table->rows) {
387 0 0         @$row = map { defined $_ ? s/^\s+|\s*$//gr : $_ } @$row;
  0            
388              
389             # An empty row with only a sequence string in the first column.
390 0 0 0       if ( $row->[0]
  0 0 0        
391             and $row->[0] =~ /^[A-Za-z]+$/
392             and not grep { defined and length } @$row[1 .. scalar @$row - 1])
393             {
394 0           $table{rows}->[-1]{protein_translation} = $row->[0];
395 0           next;
396             }
397              
398             # Not all rows are data, some are informational sentences.
399 0 0         next if grep { not defined } @$row;
  0            
400              
401 0           my %row;
402 0 0 0       @row{@$our_cols} =
403 0 0 0       map { ($_ and $_ eq "NA") ? undef : $_ }
404 0           map { ($_ and /(\d+) → (\d+)/) ? [$1, $2] : $_ }
405             @$row;
406              
407 0           push @{$table{rows}}, \%row;
  0            
408             }
409 0           push @tables, \%table
410 0 0         if @{$table{rows}};
411             }
412             }
413              
414             # Sort by depth, then within each depth by count
415 0 0         @tables = sort {
416 0           $a->{coords}[0] <=> $b->{coords}[0]
417             or $a->{coords}[1] <=> $b->{coords}[1]
418             } @tables;
419              
420 0           return @tables;
421             }
422              
423             sub parse_alignments {
424 0     0 0   my ($self, $content) = @_;
425 0           my @alignments;
426              
427 0           my $doc = HTML::TokeParser->new(
428             \$content,
429             unbroken_text => 1,
430             );
431              
432 0           my $expect_alignment = 0;
433              
434 0           while (my $tag = $doc->get_tag("b", "pre")) {
435 0           my $name = lc $tag->[0];
436 0           my $text = $doc->get_text;
437 0 0         next unless defined $text;
438              
439             #
s are preceeded by a bold header, which we use as an indicator 
440 0 0         if ($name eq 'b') {
    0          
441 0           $expect_alignment = $text =~ /Alignment\s+of\s+the\s+query\s+sequence\s+to\s+HXB2/i;
442             } elsif ($name eq 'pre') {
443 0 0 0       if ($text =~ /^\s*Query\b/m and $text =~ /^\s*HXB2\b/m) {
    0 0        
444 0           push @alignments, $text;
445 0 0         warn "Not expecting alignment, but found one‽"
446             unless $expect_alignment;
447             }
448             elsif ($text =~ /^\s+$/ and $expect_alignment) {
449 0           push @alignments, undef; # We appear to have found an unaligned sequence.
450             }
451 0           $expect_alignment = 0;
452             }
453             }
454              
455 0 0         if (defined $alignments[-1]) {
456 0           warn "Last alignment is non-null! It should be the empty alignment of the bogus sequence.";
457 0           warn "Alignment is <$alignments[-1]>\n";
458 0           return;
459             } else {
460 0           pop @alignments;
461             }
462              
463 0           my @results;
464 0           for (@alignments) {
465 0           my @hxb2;
466 0 0         if (defined) {
467 0           push @hxb2, $1 =~ s/\s+//gr
468             while /^\s*HXB2\b\s+(.+?)(?:\s+\d+|\s*)$/gm;
469             }
470 0 0         push @results, {
471             alignment => $_,
472             hxb2_sequence => @hxb2 ? join("", @hxb2) : undef,
473             };
474             }
475              
476 0           return @results;
477             }
478              
479             =head1 AUTHOR
480              
481             Thomas Sibley Etrsibley@uw.eduE
482              
483             =head1 COPYRIGHT
484              
485             Copyright 2014 by the Mullins Lab, Department of Microbiology, University of
486             Washington.
487              
488             =head1 LICENSE
489              
490             Licensed under the same terms as Perl 5 itself.
491              
492             =cut
493              
494             42;