File Coverage

blib/lib/Bio/WebService/LANL/SequenceLocator.pm

Criterion	Covered	Total	%
statement	27	178	15.1
branch	0	74	0.0
condition	0	33	0.0
subroutine	9	25	36.0
pod	1	6	16.6
total	37	316	11.7

line	stmt	bran	cond	sub	pod	time	code
1	1			1		1754	use strictures 1;
	1					902
	1					36
2	1			1		1987	use utf8;
	1					14
	1					6
3	1			1		75	use 5.018;
	1					3
	1					70
4
5							=head1 NAME
6
7							Bio::WebService::LANL::SequenceLocator - Locate sequences within HIV using LANL's web tool
8
9							=head1 SYNOPSIS
10
11							use Bio::WebService::LANL::SequenceLocator;
12
13							my $locator = Bio::WebService::LANL::SequenceLocator->new(
14							agent_string => 'Your Organization - you@example.com',
15							);
16							my @sequences = $locator->find([
17							"agcaatcagatggtcagccaaaattgccctatagtgcagaacatccaggggcaagtggtacatcaggccatatcacctagaactttaaatgca",
18							]);
19
20							See L below.
21
22							=head1 DESCRIPTION
23
24							This library provides simple programmatic access to
25							L
26							web tool and is also used to power
27							L
28							for the same tool (via L).
29
30							Nearly all of the information output by LANL's sequence locator is parsed and
31							provided by this library, though the results do vary slightly depending on the
32							base type of the query sequence. Multiple query sequences can be located at
33							the same time and results will be returned for all.
34
35							Results are extracted from both tab-delimited files provided by LANL as well as
36							the HTML itself.
37
38							=head1 EXAMPLE RESULTS
39
40							# Using @sequences from the SYNOPSIS above
41							use JSON;
42							print encode_json(\@sequences);
43
44							__END__
45							[
46							{
47							"query" : "sequence_1",
48							"query_sequence" : "AGCAATCAGATGGTCAGCCAAAATTGCCCTATAGTGCAGAACATCCAGGGGCAAGTGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
49							"base_type" : "nucleotide",
50							"reverse_complement" : "0",
51							"alignment" : "\n Query AGCAATCAGA TGGTCAGCCA AAATTGCCCT ATAGTGCAGA ACATCCAGGG 50\n :::::::: ::::::::: ::::: :::: :::::::::: :::::::::: \n HXB2 AGCAATCA-- -GGTCAGCCA AAATTACCCT ATAGTGCAGA ACATCCAGGG 1208\n\n Query GCAAGTGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 93\n :::: ::::: :::::::::: :::::::::: :::::::::: ::: \n HXB2 GCAAATGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 1251\n\n ",
52							"hxb2_sequence" : "AGCAATCA---GGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
53							"similarity_to_hxb2" : "94.6",
54							"start" : "373",
55							"end" : "462",
56							"genome_start" : "1162",
57							"genome_end" : "1251",
58							"polyprotein" : "Gag",
59							"region_names" : [
60							"Gag",
61							"p17",
62							"p24"
63							],
64							"regions" : [
65							{
66							"cds" : "Gag",
67							"aa_from_protein_start" : [ "125", "154" ],
68							"na_from_cds_start" : [ "373", "462" ],
69							"na_from_hxb2_start" : [ "1162", "1251" ],
70							"na_from_query_start" : [ "1", "93" ],
71							"protein_translation" : "SNQMVSQNCPIVQNIQGQVVHQAISPRTLNA"
72							},
73							{
74							"cds" : "p17",
75							"aa_from_protein_start" : [ "125", "132" ],
76							"na_from_cds_start" : [ "373", "396" ],
77							"na_from_hxb2_start" : [ "1162", "1185" ],
78							"na_from_query_start" : [ "1", "27" ],
79							"protein_translation" : "SNQMVSQNC"
80							},
81							{
82							"cds" : "p24",
83							"aa_from_protein_start" : [ "1", "22" ],
84							"na_from_cds_start" : [ "1", "66" ],
85							"na_from_hxb2_start" : [ "1186", "1251" ],
86							"na_from_query_start" : [ "28", "93" ],
87							"protein_translation" : "PIVQNIQGQVVHQAISPRTLNA"
88							}
89							]
90							}
91							]
92
93							=cut
94
95							package Bio::WebService::LANL::SequenceLocator;
96
97	1			1		954	use Moo;
	1					17906
	1					6
98	1			1		2924	use HTML::LinkExtor;
	1					21120
	1					49
99	1			1		1391	use HTML::TableExtract;
	1					11499
	1					8
100	1			1		962	use HTML::TokeParser;
	1					2395
	1					29
101	1			1		17439	use HTTP::Request::Common;
	1					37638
	1					128
102	1			1		1243	use List::AllUtils qw< pairwise part min max >;
	1					3235
	1					3420
103
104							our $VERSION = 20140624;
105
106							=head1 METHODS
107
108							=head2 new
109
110							Returns a new instance of this class. An optional parameter C
111							should be provided to identify yourself to LANL out of politeness. See the
112							L for an example.
113
114							=cut
115
116							has agent_string => (
117							is => 'ro',
118							lazy => 1,
119	0			0			builder => sub { '' },
120							);
121
122							has agent => (
123							is => 'ro',
124							lazy => 1,
125							builder => sub {
126	0			0			require LWP::UserAgent;
127	0						my $self = shift;
128	0						my $agent = LWP::UserAgent->new(
129							agent => join(" ", __PACKAGE__ . "/$VERSION", $self->agent_string),
130							);
131	0						$agent->env_proxy;
132	0						return $agent;
133							},
134							);
135
136							has lanl_base => (
137							is => 'ro',
138							lazy => 1,
139	0			0			builder => sub { 'http://www.hiv.lanl.gov' },
140							);
141
142							has lanl_endpoint => (
143							is => 'ro',
144							lazy => 1,
145	0			0			builder => sub { shift->lanl_base . '/cgi-bin/LOCATE/locate.cgi' },
146							);
147
148							has _bogus_slug => (
149							is => 'ro',
150							default => sub { 'BOGUS_SEQ_SO_TABULAR_FILES_ARE_LINKED_IN_OUTPUT' },
151							);
152
153							sub _request {
154	0			0			my $self = shift;
155	0						my $req = shift;
156	0						my $response = $self->agent->request($req);
157
158	0	0					if (not $response->is_success) {
159	0						warn sprintf "Request failed: %s %s -> %s\n",
160							$req->method, $req->uri, $response->status_line;
161	0						return;
162							}
163
164	0						return $response->decoded_content;
165							}
166
167							=head2 find
168
169							Takes an array ref of sequence strings. Sequences may be in amino acids or
170							nucleotides and mixed freely. Sequences should not be in FASTA format.
171
172							If sequence bases are not clearly nucleotides or clearly amino acids, LANL
173							seems to default to nucleotides. This can be an issue for some sequences since
174							the full alphabet for nucleotides overlaps with the alphabet for amino acids.
175							To overcome this problem, you may specify C<< base => 'nucleotide' >>
176							or C<< base => 'amino acid' >> after the array ref of sequences. This forces
177							every sequence to be interpreted as nucleotides or amino acids, so you cannot
178							mix base types in your sequences if you use this option. C, C, and
179							C are accepted aliases for C. C, C, C,
180							and C are accepted aliases for C.
181
182							Returns a list of hashrefs when called in list context, otherwise returns an
183							arrayref of hashrefs.
184
185							See L for the structure of the data returned.
186
187							=cut
188
189							sub find {
190	0			0	1		my ($self, $sequences, %args) = @_;
191
192	0	0					my $content = $self->submit_sequences($sequences, %args)
193							or return;
194
195	0						return $self->parse_html($content);
196							}
197
198							sub submit_sequences {
199	0			0	0		my ($self, $sequences, %args) = @_;
200
201	0	0					if (defined $args{base}) {
202	0						my $base = lc $args{base};
203	0	0					if ($base =~ /^n(uc(leotides?)?)?$/i) {
		0
204	0						$args{base} = 1;
205							} elsif ($base =~ /^(a(mino( acids?)?)?\|aa)$/i) {
206	0						$args{base} = 0;
207							} else {
208	0						warn "Unknown base type <$args{base}>, ignoring";
209	0						delete $args{base};
210							}
211							}
212
213							# Submit multiple sequences at once using FASTA
214	0						my $fasta = join "\n", map {
215	0						("> sequence_$_", $sequences->[$_ - 1])
216							} 1 .. @$sequences;
217
218							# LANL only presents the parseable table.txt we want if there's more
219							# than a single sequence. We always add it so we can reliably skip it.
220	0						$fasta .= "\n> " . $self->_bogus_slug . "\n";
221
222	0	0					return $self->_request(
223							POST $self->lanl_endpoint,
224							Content_Type => 'form-data',
225							Content => [
226							organism => 'HIV',
227							DoReverseComplement => 0,
228							SEQ => $fasta,
229							(defined $args{base}
230							? ( base => $args{base} )
231							: ()),
232							],
233							);
234							}
235
236							sub parse_html {
237	0			0	0		my ($self, $content) = @_;
238
239							# Fetch and parse the two tables provided as links which removes the need
240							# to parse all of the HTML.
241	0						my @results = $self->parse_tsv($content);
242
243							# Now parse the table data from the HTML
244	0						my @tables = $self->parse_tables($content);
245
246							# Extract the alignments, parsing the HTML a third time!
247	0						my @alignments = $self->parse_alignments($content);
248
249	0	0	0				return unless @results and @tables and @alignments;
			0
250
251	0	0	0				unless (@results == @tables and @results == @alignments) {
252	0						warn "Tab-delimited results count doesn't match parsed HTML result count. Bug!\n";
253	0						warn "TSV: ", scalar @results, "\n";
254	0						warn "HTML tables: ", scalar @tables, "\n";
255	0						warn "HTML alignments: ", scalar @alignments, "\n";
256	0						return;
257							}
258
259							@results = pairwise {
260	0						my $new = {
261							%$a,
262							base_type => $b->{base_type},
263							regions => $b->{rows},
264	0			0			region_names => [ map { $_->{cds} } @{$b->{rows}} ],
	0
265							};
266	0						delete $new->{$_} for qw(protein protein_start protein_end);
267	0						$new;
268	0						} @results, @tables;
269
270	0			0			@results = pairwise { +{ %$b, %$a } } @results, @alignments;
	0
271
272							# Fill in genome start/end for amino acid sequences
273	0						for my $r (@results) {
274	0	0					next unless $r->{base_type} eq 'amino acid';
275
276	0	0	0				if ($r->{genome_start} or $r->{genome_end}) {
277	0						warn "Amino acid sequence with genome start/end already?!",
278							" query <$r->{query_sequence}>";
279	0						next;
280							}
281
282	0						$r->{genome_start} = min map { $_->{na_from_hxb2_start}[0] } @{$r->{regions}};
	0
	0
283	0						$r->{genome_end} = max map { $_->{na_from_hxb2_start}[1] } @{$r->{regions}};
	0
	0
284							}
285
286	0	0					return wantarray ? @results : \@results;
287							}
288
289							sub parse_tsv {
290	0			0	0		my ($self, $content) = @_;
291	0						my @results;
292							my %urls;
293
294							my $extract = HTML::LinkExtor->new(
295							sub {
296	0			0			my ($tag, %attr) = @_;
297	0	0	0				return unless $tag eq 'a' and $attr{href};
298	0	0					return unless $attr{href} =~ m{/(table\|simple_results)\.txt$};
299	0						$urls{$1} = $attr{href};
300							},
301	0						$self->lanl_base,
302							);
303	0						$extract->parse($content);
304
305	0						for my $table_name (qw(table simple_results)) {
306	0	0					next unless $urls{$table_name};
307	0	0					my $table = $self->_request(GET $urls{$table_name})
308							or next;
309
310	0						my (@these_results, %seen);
311	0						my @lines = split "\n", $table;
312	0						my @fields = map {
313	0						s/^SeqName$/query/; # standard key
314	0						s/(?<=[a-z])(?=[A-Z])/_/g; # undo CamelCase
315	0						s/ +/_/g; # no spaces
316	0						y/A-Z/a-z/; # normalize to lowercase
317							# Account for the same field twice in the same data table
318	0	0					if ($seen{$_}++) {
319	0	0					$_ = /^(start\|end)$/
320							? "protein_$_"
321							: join "_", $_, $seen{$_};
322							}
323	0						$_;
324							} split "\t", shift @lines;
325
326	0						for (@lines) {
327	0						my @values = split "\t";
328	0						my %data;
329	0						@data{@fields} = @values;
330
331	0	0					next if $data{query} eq $self->_bogus_slug;
332
333	0	0					$data{query_sequence} =~ s/\s+//g
334							if $data{query_sequence};
335	0						push @these_results, \%data;
336							}
337
338							# Merge with existing results, if any
339							@results = @results
340	0	0		0			? pairwise { +{ %$a, %$b } } @results, @these_results
	0
341							: @these_results;
342							}
343
344	0						return @results;
345							}
346
347							sub parse_tables {
348	0			0	0		my ($self, $content) = @_;
349	0						my @tables;
350
351	0						my %columns_for = (
352							'amino acid' => [
353							"CDS" => "cds",
354							"AA position relative to protein start in HXB2" => "aa_from_protein_start",
355							"AA position relative to query sequence start" => "aa_from_query_start",
356							"AA position relative to polyprotein start in HXB2" => "aa_from_polyprotein_start",
357							"NA position relative to CDS start in HXB2" => "aa_from_cds_start",
358							"NA position relative to HXB2 genome start" => "na_from_hxb2_start",
359							],
360							'nucleotide' => [
361							"CDS" => "cds",
362							"Nucleotide position relative to CDS start in HXB2" => "na_from_cds_start",
363							"Nucleotide position relative to query sequence start" => "na_from_query_start",
364							"Nucleotide position relative to HXB2 genome start" => "na_from_hxb2_start",
365							"Amino Acid position relative to protein start in HXB2" => "aa_from_protein_start",
366							],
367							);
368
369	0						for my $base_type (sort keys %columns_for) {
370							my ($their_cols, $our_cols) = part {
371	0			0			state $i = 0;
372	0						$i++ % 2
373	0						} @{ $columns_for{$base_type} };
	0
374
375	0						my $extract = HTML::TableExtract->new( headers => $their_cols );
376	0						$extract->parse($content);
377
378							# Examine all matching tables
379	0						for my $table ($extract->tables) {
380	0						my %table = (
381							coords => [$table->coords],
382							base_type => $base_type,
383							columns => $our_cols,
384							rows => [],
385							);
386	0						for my $row ($table->rows) {
387	0	0					@$row = map { defined $_ ? s/^\s+\|\s*$//gr : $_ } @$row;
	0
388
389							# An empty row with only a sequence string in the first column.
390	0	0	0				if ( $row->[0]
	0	0	0
391							and $row->[0] =~ /^[A-Za-z]+$/
392							and not grep { defined and length } @$row[1 .. scalar @$row - 1])
393							{
394	0						$table{rows}->[-1]{protein_translation} = $row->[0];
395	0						next;
396							}
397
398							# Not all rows are data, some are informational sentences.
399	0	0					next if grep { not defined } @$row;
	0
400
401	0						my %row;
402	0	0	0				@row{@$our_cols} =
403	0	0	0				map { ($_ and $_ eq "NA") ? undef : $_ }
404	0						map { ($_ and /(\d+) → (\d+)/) ? [$1, $2] : $_ }
405							@$row;
406
407	0						push @{$table{rows}}, \%row;
	0
408							}
409	0						push @tables, \%table
410	0	0					if @{$table{rows}};
411							}
412							}
413
414							# Sort by depth, then within each depth by count
415	0	0					@tables = sort {
416	0						$a->{coords}[0] <=> $b->{coords}[0]
417							or $a->{coords}[1] <=> $b->{coords}[1]
418							} @tables;
419
420	0						return @tables;
421							}
422
423							sub parse_alignments {
424	0			0	0		my ($self, $content) = @_;
425	0						my @alignments;
426
427	0						my $doc = HTML::TokeParser->new(
428							\$content,
429							unbroken_text => 1,
430							);
431
432	0						my $expect_alignment = 0;
433
434	0						while (my $tag = $doc->get_tag("b", "pre")) {
435	0						my $name = lc $tag->[0];
436	0						my $text = $doc->get_text;
437	0	0					next unless defined $text;
438
439							# s are preceeded by a bold header, which we use as an indicator
440	0	0					if ($name eq 'b') {
		0
441	0						$expect_alignment = $text =~ /Alignment\s+of\s+the\s+query\s+sequence\s+to\s+HXB2/i;
442							} elsif ($name eq 'pre') {
443	0	0	0				if ($text =~ /^\sQuery\b/m and $text =~ /^\sHXB2\b/m) {
		0	0
444	0						push @alignments, $text;
445	0	0					warn "Not expecting alignment, but found one‽"
446							unless $expect_alignment;
447							}
448							elsif ($text =~ /^\s+$/ and $expect_alignment) {
449	0						push @alignments, undef; # We appear to have found an unaligned sequence.
450							}
451	0						$expect_alignment = 0;
452							}
453							}
454
455	0	0					if (defined $alignments[-1]) {
456	0						warn "Last alignment is non-null! It should be the empty alignment of the bogus sequence.";
457	0						warn "Alignment is <$alignments[-1]>\n";
458	0						return;
459							} else {
460	0						pop @alignments;
461							}
462
463	0						my @results;
464	0						for (@alignments) {
465	0						my @hxb2;
466	0	0					if (defined) {
467	0						push @hxb2, $1 =~ s/\s+//gr
468							while /^\sHXB2\b\s+(.+?)(?:\s+\d+\|\s)$/gm;
469							}
470	0	0					push @results, {
471							alignment => $_,
472							hxb2_sequence => @hxb2 ? join("", @hxb2) : undef,
473							};
474							}
475
476	0						return @results;
477							}
478
479							=head1 AUTHOR
480
481							Thomas Sibley Etrsibley@uw.eduE
482
483							=head1 COPYRIGHT
484
485							Copyright 2014 by the Mullins Lab, Department of Microbiology, University of
486							Washington.
487
488							=head1 LICENSE
489
490							Licensed under the same terms as Perl 5 itself.
491
492							=cut
493
494							42;