File Coverage

blib/lib/TIGR/HmmTools.pm

Criterion	Covered	Total	%
statement	17	422	4.0
branch	0	190	0.0
condition	0	33	0.0
subroutine	6	14	42.8
pod	0	7	0.0
total	23	666	3.4

line	stmt	bran	cond	sub	pod	time	code
1							package TIGR::HmmTools;
2							require Exporter;
3	1			1		52796	use 5.006;
	1					3
4	1			1		4	use strict;
	1					2
	1					19
5	1			1		8	use warnings;
	1					7
	1					53
6
7							=head1 NAME
8
9							TIGR::HmmTools - Functions for processing files from HMMer
10
11							=head1 VERSION
12
13							Version 0.01
14
15							=cut
16
17							our $VERSION = '0.03';
18
19
20							=head1 SYNOPSIS
21
22							Ages ago developers at The Institute for Genomic Research (RIP) produced code to process
23							raw alignment files from the HMMer suite. Many of these functions were embedded within a
24							variety of scripts which do things like format conversion. Installing this package also
25							adds the utility script, 'convert_hmmscan_to_htab.pl'
26
27							=head1 EXPORT
28
29							A list of functions that can be exported. You can delete this section
30							if you don't export anything, such as for a purely object-oriented module.
31							The following functions are exported:
32
33							- read_hmmer3_output
34							- read_hmmer2_output
35							- print_htab
36							- hmm_database_info
37							- get_cutoffs_for_hmm_accession
38							- build_alignment
39							- read_hmmer3_output2
40
41							=head1 AUTHOR
42
43							Joshua Orvis, C<< >>
44
45							The actual author of the code is currently unknown. I just packaged it for CPAN.
46
47							=head1 BUGS
48
49							Please report any bugs or feature requests to C, or through
50							the web interface at L. I will be notified, and then you'll
51							automatically be notified of progress on your bug as I make changes.
52
53							=head1 SUPPORT
54
55							You can find documentation for this module with the perldoc command.
56
57							perldoc TIGR::HmmTools
58
59
60							You can also look for information at:
61
62							=over 4
63
64							=item * RT: CPAN's request tracker (report bugs here)
65
66							L
67
68							=item * AnnoCPAN: Annotated CPAN documentation
69
70							L
71
72							=item * CPAN Ratings
73
74							L
75
76							=item * Search CPAN
77
78							L
79
80							=back
81
82
83							=head1 ACKNOWLEDGEMENTS
84
85
86							=head1 LICENSE AND COPYRIGHT
87
88							Copyright 2017 Joshua Orvis.
89
90							This program is free software; you can redistribute it and/or modify it
91							under the terms of the the Artistic License (2.0). You may obtain a
92							copy of the full license at:
93
94							L
95
96							Any use, modification, and distribution of the Standard or Modified
97							Versions is governed by this Artistic License. By using, modifying or
98							distributing the Package, you accept this license. Do not use, modify,
99							or distribute the Package, if you do not accept this license.
100
101							If your Modified Version has been derived from a Modified Version made
102							by someone other than you, you are nevertheless required to ensure that
103							your Modified Version complies with the requirements of this license.
104
105							This license does not grant you the right to use any trademark, service
106							mark, tradename, or logo of the Copyright Holder.
107
108							This license includes the non-exclusive, worldwide, free-of-charge
109							patent license to make, have made, use, offer to sell, sell, import and
110							otherwise transfer the Package with respect to any patent claims
111							licensable by the Copyright Holder that are necessarily infringed by the
112							Package. If you institute patent litigation (including a cross-claim or
113							counterclaim) against any party alleging that the Package constitutes
114							direct or contributory patent infringement, then this Artistic License
115							to you shall terminate on the date that such litigation is filed.
116
117							Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
118							AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
119							THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
120							PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
121							YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
122							CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
123							CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
124							EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
125
126
127							=cut
128
129	1			1		5	use Carp;
	1					2
	1					54
130	1			1		440	use Data::Dumper;
	1					6228
	1					4374
131							our @ISA = qw (Exporter);
132							our @EXPORT =
133							qw(read_hmmer3_output read_hmmer2_output print_htab hmm_database_info get_cutoffs_for_hmm_accession build_alignment read_hmmer3_output2);
134							our @EXPORT_OK = qw ();
135
136							sub read_hmmer3_output {
137	0			0	0		my ($path) = @_;
138	0						my $retval = {};
139
140	0						my $in_result = 0;
141	0						my $in_hit_scores = 0;
142	0						my $in_domain_scores = 0;
143
144	0	0					if ( $path ne '' ) {
145	0						chomp $path;
146	0						my @statd = stat $path;
147	0						$retval->{'info'}->{ 'search_date' } = ( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-"
148							. ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-"
149							. ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 );
150							} else {
151	0						$retval->{'info'}->{ 'search_date' } = ( ( localtime )[ 3 ] ) . "-"
152							. ( ( localtime )[ 4 ] + 1 ) . "-"
153							. ( ( localtime )[ 5 ] + 1900 );
154							}
155
156	0	0					open(my $fh, "< $path") or die("Unable to open $path: $!");
157	0						while( my $line = <$fh> ) {
158	0	0	0				next if( $line =~ /^\s*$/ \|\| $line =~ /inclusion threshold/ );
159
160							# The program line
161	0	0					if ( $line =~ /^\#\s((hmmscan)\.)/ ) {
		0
		0
		0
		0
162	0						$retval->{'info'}->{'program'} = $1;
163
164							# The version
165							} elsif( $line =~ /^\#\s*HMMER ([\d\.]+)\s+$([^$]+)\)/ ) {
166	0						( $retval->{'info'}->{'version'}, $retval->{'info'}->{'release'} ) = ( $1, $2 );
167
168							# The hmm database searched
169							} elsif( $line =~ /^\#\s+target HMM database:\s+(\S+)/ ) {
170	0						$retval->{'info'}->{'hmm_file'} = $1;
171
172							# The query file
173							} elsif( $line =~ /^\#\squery sequence (file\|database):\s+(.+)/ ) {
174	0						$retval->{'info'}->{'sequence_file'} = $2;
175
176							# This indicates we're parsing a hit.
177							} elsif( $line =~ /^Query\:\s+(\S+)/ ) {
178	0						my $data = &_parse_hmmpfam3_hit( $fh );
179	0						$retval->{'queries'}->{$1} = $data;
180
181							}
182							}
183
184	0						close($fh);
185	0						return $retval;
186							}
187
188							sub _parse_hmmpfam3_hit {
189	0			0			my ($fh) = @_;
190	0						my $data = {};
191
192	0						my $in_hit_scores = 0;
193	0						my $in_domain_scores = 0;
194	0						my $hit_acc;
195
196	0						while( my $line = <$fh> ) {
197	0						chomp($line);
198	0	0					last if( $line =~ m\|//\| );
199	0	0					next if( $line =~ /^\s*$/ );
200
201	0	0	0				if( $line =~ /^Domain annotation/ ) {
		0	0
		0	0
		0	0
		0
		0
202	0						$in_hit_scores = 0;
203
204							} elsif( $in_hit_scores && $line !~ /^\s+--/ && $line !~ /inclusion_threshold/ ) {
205	0	0					if( $line =~ /No hits detected/ ) {
206	0						$data->{'hits'} = {};
207	0						last;
208							}
209
210	0						my @c = split(/\s+/, $line);
211	0						my $t_hit_acc = $c[9];
212	0						$data->{'hits'}->{$t_hit_acc}->{total_evalue} = $c[1];
213	0						$data->{'hits'}->{$t_hit_acc}->{total_score} = $c[2];
214	0						$data->{'hits'}->{$t_hit_acc}->{accession} = $c[9];
215	0						$data->{'hits'}->{$t_hit_acc}->{hit_description} = join(" ",@c[10..(@c-1)]);
216	0						$data->{'hits'}->{$t_hit_acc}->{domain_count} = $c[8];
217	0						$data->{'hits'}->{$t_hit_acc}->{frame} = "";
218							} elsif( $line =~ /^>>\s*(\S+)/ ) {
219	0						$in_domain_scores = 1;
220	0						$hit_acc = $1;
221
222							} elsif( $in_domain_scores && $line =~ /Alignments for each/ ) {
223	0						undef $hit_acc;
224	0						$in_domain_scores = 0;
225
226							} elsif( $in_domain_scores && $line !~ /^\s*[>\#-]/ ) {
227	0	0					die("Didn't parse hit accession from header line before getting to domain table")
228							unless( $hit_acc );
229	0	0					die("The hit accession [$hit_acc] didn't exist in lookup") unless( exists( $data->{'hits'}->{$hit_acc} ) );
230
231	0	0					if( $line =~ /\[No individual domains that/ ) {
232	0						$data->{'hits'}->{$hit_acc}->{'domains'} = {};
233	0						$in_domain_scores = 0;
234	0						next;
235							}
236
237	0						my @c = split(/\s+/, $line);
238
239	0	0	0				if( $c[1] eq 'targets' \|\| $c[1] eq 'reported' \|\| $c[1] eq 'Fwd') {
			0
240	0						print Dumper( $data->{'hits'}->{$hit_acc} );
241	0						print "LINE: $line\n";
242	0						print Dumper( @c );
243	0						die("Issue parsing");
244							}
245
246	0						my $a = $c[1];
247	0						my $b = $a + 0;
248	0	0					if( $b ne $a ) {
249	0						print Dumper( $data->{'hits'}->{$hit_acc} );
250	0						print "LINE: $line\n";
251	0						print Dumper( \@c );
252	0						die("c[1] not numeric");
253							}
254
255
256	0						$data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'seq_f'} = $c[10];
257	0						$data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'seq_t'} = $c[11];
258	0						$data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'hmm_f'} = $c[7];
259	0						$data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'hmm_t'} = $c[8];
260	0						$data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'domain_score'} = $c[3];
261	0						$data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'domain_evalue'} = $c[6];
262							} elsif( $line =~ /^\s+E-value\s+score/ ) {
263	0						$in_hit_scores = 1;
264							}
265							}
266
267	0						return $data;
268							}
269
270							## Subroutine to parse hmmscan (Hmmer3.0) output
271							sub read_hmmer3_output_old {
272	0			0	0		my $path = shift;
273	0						my $data = {};
274	0						my @lines;
275	0	0					if ( $path ne '' ) {
276	0						chomp $path;
277	0						my @statd = stat $path;
278	0						$data->{ 'search_date' } = ( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-"
279							. ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-"
280							. ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 );
281	0	0					open( FH, "$path" ) \|\| die "Can't open $path for reading: $!\n";
282	0						chomp( @lines = );
283	0						close FH;
284							} else {
285	0						chomp( @lines = );
286	0						$data->{ 'search_date' } = ( ( localtime )[ 3 ] ) . "-"
287							. ( ( localtime )[ 4 ] + 1 ) . "-"
288							. ( ( localtime )[ 5 ] + 1900 );
289							}
290	0	0					if ( !@lines ) {
291	0						carp "No data read from input $path";
292	0						return undef;
293							}
294	0						my $i = 0;
295	0						while ($i < @lines) {
296	0	0					if ( $lines[ $i ] =~ /^#\s((hmmscan)\.)/ ) {
297	0						$data->{ 'program' } = $1;
298	0						my $version_line = $lines[ ++$i ];
299	0						$version_line =~ /^#\s*HMMER (\d+)\.(\S+)/;
300	0						( $data->{ 'version' }, $data->{ 'release' } ) = ( $1, $2 );
301	0						$i +=2;
302	0						last;
303							}
304	0						++$i;
305							}
306	0						until ( $lines[ $i ] =~ /^\s*$/ ) {
307	0	0					if ( $lines[ $i ] =~ /^#\s*target HMM database:\s+(\S+)/ ) {
		0
308	0						$data->{ 'hmm_file' } = $1;
309							} elsif ( $lines[ $i ] =~ /^#\s*query sequence (file\|database):\s+(.+)/ ) {
310	0						$data->{ 'sequence_file' } = $2;
311							}
312	0						$i++;
313	0	0					die "Failure to parse" if $i > @lines;
314							}
315	0						$i++;
316	0	0					if ( $lines[ $i ] =~ /^Query:\s+(\S+)/ ) {
317	0						$data->{ 'query' } = $1;
318	0						print "Found query: $1\n";
319	0						$i++;
320							}
321	0						until ( $lines[$i] =~ /^\s*$/ ) {
322	0	0					if ( $lines[ $i ] =~ /^Scores for/ ) {
		0
323	0						$i++; # this skips the separator row
324	0						my $headers = $lines[ ++$i ];
325	0						$i++; # this skips the separator row
326							} elsif ( $lines[ $i ] !~ /No hits detected that satisfy reporting thresholds/i ) {
327	0	0					$i++ if ($lines[$i] =~ /inclusion threshold/g);
328	0						my @c = split /\s+/, $lines[ $i ],11;
329	0						my $hit_index = $c[9];
330	0						$data->{hit}{$hit_index}{total_evalue} = $c[1];
331	0						$data->{hit}{$hit_index}{total_score} = $c[2];
332	0						$data->{hit}{$hit_index}{accession} = $c[9];
333	0						$data->{hit}{$hit_index}{hit_description} = $c[10];
334	0						$data->{hit}{$hit_index}{domain_count} = $c[8];
335	0						$data->{hit}{$hit_index}{frame} = "";
336							} else {
337	0						return $data;
338							}
339	0						$i++;
340	0	0					die "Failure to parse" if $i > @lines;
341							}
342	0						$i++;
343	0	0					if ( $lines[ $i ] =~ /^Domain annotation for each model/ ) {
344	0						$i++;
345							}
346	0						until ($lines[$i] =~ /^\/\/$/) {
347	0	0					if ( $lines[ $i ] !~ /No targets detected that satisfy reporting thresholds/ ) {
348	0	0					if($lines[$i] =~ />>/) {
349	0						my @c = split /\s+/, $lines[ $i ];
350	0						my $hit_index = $c[ 1 ];
351	0						$i += 3;
352
353	0	0					if ( !defined $data->{ 'hit' }->{ $hit_index } ) {
354	0						warn "Why doesn't '$hit_index' match an existing identifier?";
355							} else {
356	0						until ($lines[$i] =~ /^\s*$/) {
357	0						my @res = split /\s+/, $lines[ $i ];
358	0						$data->{hit}{$hit_index}{domain}{$res[1]}{seq_f} = $res[10];
359	0						$data->{hit}{$hit_index}{domain}{$res[1]}{seq_t} = $res[11];
360	0						$data->{hit}{$hit_index}{domain}{$res[1]}{hmm_f} = $res[7];
361	0						$data->{hit}{$hit_index}{domain}{$res[1]}{hmm_t} = $res[8];
362	0						$data->{hit}{$hit_index}{domain}{$res[1]}{domain_score} = $res[3];
363	0						$data->{hit}{$hit_index}{domain}{$res[1]}{domain_evalue} = $res[6];
364	0						$i++;
365							}
366							}
367							}
368							} else {
369	0						return $data;
370							}
371	0						$i++;
372							}
373	0						return $data;
374							}
375							## Use with caution. Or just don't use this method.
376							sub read_hmmer2_output {
377	0			0	0		my $path = shift;
378	0						my $data = {};
379							# die "Bad Parser under reconstruction.\n\n";
380							# drink in data
381	0						my @lines;
382
383							# drink in the output from file or stdin
384	0	0					if ( $path ne '' ) {
385	0						chomp $path;
386	0						my @statd = stat $path;
387	0						$data->{ 'search_date' } =
388							( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-"
389							. ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-"
390							. ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 );
391	0	0					open( FH, "$path" )
392							\|\| die "Can't open $path for reading: $!\n";
393	0						chomp( @lines = );
394	0						close FH;
395							}
396							else {
397	0						chomp( @lines = );
398	0						$data->{ 'search_date' } =
399							( ( localtime )[ 3 ] ) . "-"
400							. ( ( localtime )[ 4 ] + 1 ) . "-"
401							. ( ( localtime )[ 5 ] + 1900 );
402							}
403	0	0					if ( !@lines ) {
404	0						carp "No data read from input $path";
405	0						return undef;
406							}
407	0						my $i = 0;
408
409							# first line grouping is company, package and license info
410							# warn "Parsing License. Line $i\n";
411							# amahurkar:1/15/08 Seems like the current output does not have licesning info, so commenting this
412							# so commenting out these lines
413							#until ( $lines[ $i ] eq "" )
414							# $data->{ 'header' } .= $lines[ $i ] . "\n";
415							# $i++;
416
417							#$i++;
418
419							# next group is program and version
420							# warn "Parsing Program and version. Line $i\n";
421							# amahurkar:1/15/08 the format has changed and now there is no blank space
422							# after program name, so we are using '- -' as tha match param
423							# to stop parsing for program name
424							#until ( $lines[ $i ] eq "" )
425							#until ( $lines[ $i ] =~ m/^-\s-/) # It doesn't work with LDhmmpfam v1.5.4
426							#until ( $lines[ $i ] =~ m/^-{8}\s+-/)
427	0						while ($i < @lines) {
428	0	0					if ( $lines[ $i ] =~ /^((hmmpfam\|hmmsearch)\.*)/ ) {
429	0						$data->{ 'program' } = $1;
430	0						my $version_line = $lines[ ++$i ];
431	0						$version_line =~ /^HMMER (\d+)\.(\S+)/;
432	0						( $data->{ 'version' }, $data->{ 'release' } ) = ( $1, $2 );
433	0						$i +=2;
434	0						last;
435							}
436	0						++$i;
437							}
438
439							# next group is program parameters
440							# warn "Parsing Parameters. Line $i\n";
441							# amahurkar:1/15/08 the format has changed and now there is no blank space
442							# after program name, so we are using '- -' as tha match param
443							# to stop parsing for program parameters
444	0						until ( $lines[ $i ] =~ /^\s*$/ ) {
445	0	0					if ( $lines[ $i ] =~ /^HMM file:\s+(\S+)/ ) {
		0
		0
		0
		0
		0
446	0						$data->{ 'hmm_file' } = $1;
447							} elsif ( $lines[ $i ] =~ /^Sequence (file\|database):\s+(.+)/ ) {
448	0						$data->{ 'sequence_file' } = $2;
449							} elsif ( $lines[ $i ] =~ /^per-sequence score cutoff:\s+(.+)/ ) {
450	0						$data->{ 'total_score_cutoff' } = $1;
451							} elsif ( $lines[ $i ] =~ /^per-domain score cutoff:\s+(.+)/ ) {
452	0						$data->{ 'domain_score_cutoff' } = $1;
453							} elsif ( $lines[ $i ] =~ /^per-sequence E-value cutoff:\s+(.+)/ ) {
454	0						$data->{ 'total_evalue_cutoff' } = $1;
455							} elsif ( $lines[ $i ] =~ /^per-domain E-value cutoff:\s+(.+)/ ) {
456	0						$data->{ 'domain_evalue_cutoff' } = $1;
457							}
458	0						$i++;
459	0	0					die "Failure to parse" if $i > @lines;
460							}
461	0						$i++;
462
463							# get query info
464							# warn "Parsing Query Info. Line $i\n";
465							#until ( $lines[ $i ] eq "" )
466	0						until ( $lines[$i] =~ /^\s*$/ ) {
467	0	0					if ( $lines[ $i ] =~ /^Query (?:HMM\|sequence):\s+(.+)/ ) {
		0
		0
468	0						$data->{ 'query' } = $1;
469							}
470							elsif ( $lines[ $i ] =~ /^Accession:\s+(.+)/ ) {
471	0						$data->{ 'query_accession' } = $1;
472							}
473							elsif ( $lines[ $i ] =~ /^Description:\s+(.+)/ ) {
474	0						$data->{ 'query_description' } = $1;
475							}
476	0						$i++;
477	0	0					die "Failure to parse" if $i > @lines;
478							}
479	0						$i++;
480
481							# next section is global search results
482							# warn "Parsing Global Search Results. Line $i\n";
483	0						my $find_frame = 0; # is datbase nucleotide sequence?
484	0						my $hit_index;
485	0						until ( $lines[$i] =~ /^\s*$/ ) {
486	0	0					if ( $lines[ $i ] =~ /^Scores for/ ) {
		0
487	0						my $headers = $lines[ ++$i ];
488	0	0					if ( $headers =~ /\bFr\b/ ) {
489	0						$data->{ 'program' } .= "-frames";
490	0						$find_frame = 1;
491							}
492	0						$i++; # this skips the separator row
493							} elsif ( $lines[ $i ] !~ /no hits above thresholds/i ) {
494	0						my @c = split /\s+/, $lines[ $i ];
495	0	0					if ( $find_frame ) {
496	0						$hit_index = $c[ 0 ] . $c[ $#c ];
497	0						$data->{ 'hit' }->{ $hit_index }->{ 'frame' } = pop @c;
498							}
499							else {
500							# $hit_index = $c[ 0 ]; # AP 20090807
501	0						($hit_index = $c[0]) =~ s/\.\d+$//;
502							}
503	0						$data->{hit}{$hit_index}{accession} = shift @c;
504	0						$data->{hit}{$hit_index}{domain_count} = pop @c;
505	0						$data->{hit}{$hit_index}{total_evalue} = pop @c;
506	0						$data->{hit}{$hit_index}{total_score} = pop @c;
507	0						$data->{hit}{$hit_index}{hit_description} = join " ", @c;
508							}
509							else {
510	0						return $data;
511							}
512	0						$i++;
513	0	0					die "Failure to parse" if $i > @lines;
514							}
515	0						$i++;
516
517							# next section is domain breakdown
518							# warn "Parsing Domain Breakdown. Line $i\n";
519							#until ( $lines[ $i ] eq "" )
520	0						until ($lines[$i] =~ /^\s*$/) {
521	0	0					if ( $lines[ $i ] =~ /^Parsed for domains/ ) {
		0
522	0						$i += 2; # to skip header and separator
523							} elsif ( $lines[ $i ] !~ /no hits above thresholds/ ) {
524
525							# $lines[$i] =~ /^(\w+)\s+(\d+)\/\d+\s+(\d+)\s+(\d+)\s[\[\.\]]{2}\s+(\d+)\s+(\d+)\s[\[\.\]]{2}\s+(-?[\.\d]+)\s+([\.\-e\d]+)\s*(\-?\d)?/;
526	0						my @c = split /\s+/, $lines[ $i ];
527	0	0					if ( $find_frame ) {
528	0						$hit_index = $c[ 0 ] . $c[ $#c ];
529							}
530							else {
531							#$hit_index = $c[ 0 ]; # AP 20090807
532	0						($hit_index = $c[0]) =~ s/\.\d+$//;
533							}
534	0	0					if ( !defined $data->{ 'hit' }->{ $hit_index } ) {
535	0						warn
536							"Why doesn't '$hit_index' match an existing identifier?";
537							}
538
539							# $data->{'hit'}->{$hit_index}->{'domain'}->{$2}->{'seq_f'} = $3;
540	0						my ( $d, $t ) = split /\//, $c[ 1 ];
541	0						$data->{hit}{$hit_index}{domain}{$d}{seq_f} = $c[2];
542	0						$data->{hit}{$hit_index}{domain}{$d}{seq_t} = $c[3];
543	0						$data->{hit}{$hit_index}{domain}{$d}{hmm_f} = $c[5];
544	0						$data->{hit}{$hit_index}{domain}{$d}{hmm_t} = $c[6];
545	0						$data->{hit}{$hit_index}{domain}{$d}{domain_score} = $c[8];
546	0						$data->{hit}{$hit_index}{domain}{$d}{domain_evalue} = $c[9];
547							}
548	0						$i++;
549	0	0					die "Failure to parse" if $i > @lines;
550							}
551	0						$i++;
552	0	0					if ( $data->{ 'program' } =~ /hmmsearch/ ) {
553
554							# next section is alignments
555							# warn "Parsing Alignments. Line $i\n";
556	0	0					if ( $lines[ $i ] =~ /^Alignments of top-scoring domains/ ) {
557	0						$i++;
558							## kgalens [11/13/2012]
559							## Not sure how this ever worked? $domain isn't in scope here.
560							## I've added use strict; and this module wouldn't. So adding the next line
561							## just so it will run.
562	0						my $domain;
563							my $hit;
564	0						my $hit_index;
565							## END kgalens [11/13/2012]
566
567	0						until ( $lines[$i] =~ /^\s*$/ ) {
568	0	0					if ( $lines[ $i ] =~ /^(\S+): domain (\d+)/ ) {
569	0						($hit_index, $hit, $domain ) = ( $1, $1, $2 );
570	0						$hit =~ s/(.{10}).*/$1/;
571							# warn " Parsing hit $hit_index. Line $i\n";
572	0	0					if ( $find_frame ) {
573	0	0	0				if ( $lines[ $i ] =~ /Fr = ([\-\d]+)/ \|\| $lines[ $i ] =~ /\. frame ([\-\d]+)/ ) {
574	0						$hit_index .= $1;
575							} else {
576	0						warn "ERROR: Couldn't find frame from:\n '$lines[$i]'";
577							}
578							}
579	0	0					if ( !defined $data->{ 'hit' }->{ $hit_index } ) {
580	0						warn "Why doesn't '$hit_index' match an existing identifier?";
581							}
582	0						$i++;
583							}
584	0	0					if ( $lines[ $i ] =~ /\bRF\b/ ) { ## WHY??
585	0						$i++;
586							}
587
588							# capture aligned hmm consensus
589	0						my $hmm_seq = $lines[ $i ];
590	0						$hmm_seq =~ s/\s+//g;
591	0						$hmm_seq =~ s/[\*\-\>\<]//g;
592
593	0						$data->{ 'hit' }->{ $hit_index }->{ 'domain' }->{ $domain }->{hmm_seq} .= $hmm_seq;
594	0						until ( $lines[ $i ] =~ /^\s+\Q$hit\E/) { ## changed from /\b\Q$hit\E\b/ )
595	0						$i++;
596							}
597	0						my $prot_seq = $lines[ $i ];
598	0	0					if ( $prot_seq =~ /\w+\s+(\d+\|\-)\s+(\S+)\s+(\d+\|\-)/ ) {
599	0						$data->{ 'hit' }->{ $hit_index }->{ 'domain' }->{ $domain }->{prot_seq} .= $2;
600							}
601	0						$i += 2; # skip the blank line and move on to the next
602	0	0					die "Failure to parse" if $i > @lines;
603							}
604							}
605	0						$i++;
606
607							# next (last) section is statistics
608							# warn "Parsing Statistics. Line $i\n";
609	0						my @data;
610	0						while ( $i < @data ) {
611	0	0					if ( $lines[ $i ] =~ /^\s+mu =\s+(-?\d+)/ ) {
		0
		0
		0
		0
		0
612	0						$data->{ 'mu' } = $1;
613							} elsif ( $lines[ $i ] =~ /^\s+lambda =\s(-?\d+)/ ) {
614	0						$data->{ 'lambda' } = $1;
615							} elsif ( $lines[ $i ] =~ /chi-sq statistic =\s(\d+)/ ) {
616	0						$data->{ 'chisq' } = $1;
617							} elsif ( $lines[ $i ] =~ /Total sequences searched:\s*(\d+)/ ) {
618	0						$data->{ 'tot_seq_searched' } = $1;
619							} elsif ( $lines[ $i ] =~ /Whole sequence top hits/ ) {
620	0						$lines[ ++$i ] =~ /(\d+)/;
621	0						$data->{ 'total_hits' } = $1;
622	0						$lines[ ++$i ] =~ /(\d+)/;
623	0						$data->{ 'total_hits_above_evalue_cutoff' } = $1;
624							} elsif ( $lines[ $i ] =~ /Domain top hits/ ) {
625	0						$lines[ ++$i ] =~ /(\d+)/;
626	0						$data->{ 'domain_hits' } = $1;
627	0						$lines[ ++$i ] =~ /(\d+)/;
628	0						$data->{ 'domain_hits_above_evalue_cutoff' } = $1;
629							}
630	0						$i++;
631	0	0					die "Failure to parse" if $i > @lines;
632							}
633							}
634	0						return $data;
635							}
636
637							sub hmm_database_info {
638	0			0	0		my $dbh = shift;
639	0						my $hmm_q =
640							"SELECT hmm_acc, hmm_len, trusted_cutoff, noise_cutoff, hmm_com_name,"
641							. " trusted_cutoff2, noise_cutoff2, gathering_cutoff, gathering_cutoff2"
642							. " FROM hmm2"
643							. " WHERE is_current = 1";
644	0						my $HMM = $dbh->selectall_hashref( $hmm_q, 'hmm_acc' );
645	0						return $HMM;
646							}
647
648							sub print_htab {
649							#NOTE: This will produce results if the data hash was created through the 'read_hmmer3_output' subroutine
650							# This is because of a change in the naming of some of the property keys (ex 'hit' to 'hits')
651	0			0	0		my $data = shift;
652	0						my $HMM = shift;
653	0						my $output = shift;
654	0						foreach my $qry_id ( keys %{$data->{'queries'}} ) {
	0
655	0						foreach my $hit (
656							sort {
657							$data->{'queries'}->{$qry_id}->{'hits'}->{ $b }->{ 'total_score' } <=> $data->{'queries'}->{$qry_id}->
658	0						{'hits'}->{ $a }->{ 'total_score' }
659	0						} keys %{ $data->{'queries'}->{$qry_id}->{'hits'} } )
660							{
661	0						my $h = $data->{'queries'}->{$qry_id}->{'hits'}->{ $hit };
662	0	0					next if (scalar keys %{$h->{'domains'}} == 0); #skip model hits that have no domain hits
	0
663	0						foreach my $domain ( sort { $a <=> $b } keys %{ $h->{ 'domains' } } )
	0
	0
664							{
665							# for convenience
666	0						my $dh = $h->{ 'domains' }->{ $domain };
667	0	0					if ( $data->{'info'}->{ 'program' } =~ /hmmsearch/ ) { #hmmer2 is currently deprecated so this will probably error
		0
668							my $hmm_com_name =
669							$HMM->{ $data->{ 'query' } }->{ 'hmm_com_name' }
670							? $HMM->{ $data->{ 'query' } }->{ 'hmm_com_name' }
671	0	0					: $data->{ 'query_description' };
672	0						print $output "$data->{query}"
673							. "\t$data->{search_date}"
674							. "\t$HMM->{$data->{query}}->{hmm_len}"
675							. "\t$data->{program}"
676							. "\t$data->{sequence_file}"
677							. "\t$h->{accession}"
678							. "\t$dh->{hmm_f}"
679							. "\t$dh->{hmm_t}"
680							. "\t$dh->{seq_f}"
681							. "\t$dh->{seq_t}"
682							. "\t$h->{frame}"
683							. "\t$dh->{domain_score}"
684							. "\t$h->{total_score}"
685							. "\t$domain"
686							. "\t$h->{domain_count}"
687							. "\t$hmm_com_name"
688							. "\t$h->{hit_description}"
689							. "\t$HMM->{$data->{query}}->{trusted_cutoff}"
690							. "\t$HMM->{$data->{query}}->{noise_cutoff}"
691							. "\t$h->{total_evalue}"
692							. "\t$dh->{domain_evalue}"
693							. "\t$HMM->{$data->{query}}->{trusted_cutoff2}"
694							. "\t$HMM->{$data->{query}}->{noise_cutoff2}"
695							. "\t$HMM->{$data->{query}}->{gathering_cutoff}"
696							. "\t$HMM->{$data->{query}}->{gathering_cutoff2}" . "\n";
697							}
698							elsif ( $data->{'info'}->{ 'program' } =~ /hmmscan\|hmmpfam/ ) {
699							my $hmm_com_name =
700							$HMM->{ $hit }->{ 'hmm_com_name' }
701							? $HMM->{ $hit }->{ 'hmm_com_name' }
702	0	0					: $h->{ 'hit_description' };
703	0						print $output "$h->{accession}"
704							. "\t$data->{'info'}->{search_date}"
705							. "\t$HMM->{$hit}->{hmm_len}"
706							. "\t$data->{'info'}->{program}"
707							. "\t$data->{'info'}->{hmm_file}"
708							. "\t$qry_id"
709							. "\t$dh->{hmm_f}"
710							. "\t$dh->{hmm_t}"
711							. "\t$dh->{seq_f}"
712							. "\t$dh->{seq_t}"
713							. "\t$h->{frame}"
714							. "\t$dh->{domain_score}"
715							. "\t$h->{total_score}"
716							. "\t$domain"
717							. "\t$h->{domain_count}"
718							. "\t$hmm_com_name"
719							. "\t$h->{hit_description}"
720							. "\t$HMM->{$h->{accession}}->{trusted_cutoff}"
721							. "\t$HMM->{$h->{accession}}->{noise_cutoff}"
722							. "\t$h->{total_evalue}"
723							. "\t$dh->{domain_evalue}"
724							. "\t$HMM->{$h->{accession}}->{trusted_cutoff2}"
725							. "\t$HMM->{$h->{accession}}->{noise_cutoff2}"
726							. "\t$HMM->{$h->{accession}}->{gathering_cutoff}"
727							. "\t$HMM->{$h->{accession}}->{gathering_cutoff2}" . "\n";
728							}
729							}
730							}
731							}
732							}
733
734							sub build_alignment {
735	0			0	0		my $data = shift;
736	0						my $instructions = shift;
737
738							# build output file name
739	0						my $output_file;
740							$output_file =
741	0						$instructions->{file_prefix} . "." . $instructions->{file_format};
742	0	0					open my $OUT, ">$output_file"
743							or croak "Can't open '$output_file' as output file: $!\n";
744	0						select $OUT;
745
746							# retrieve aligned sequences
747	0						my %screened;
748	0						foreach my $hit ( keys %{ $data->{ 'hit' } } ) {
	0
749
750							# screen for total score cutoffs
751	0	0	0				if ( $data->{hit}->{ $hit }->{total_score} >=
752							$instructions->{total_bit_cutoff}
753							&& $data->{hit}->{ $hit }->{total_evalue} <=
754							$instructions->{total_evalue_cutoff} )
755							{
756	0						foreach my $domain ( keys %{ $data->{hit}->{ $hit }->{domain} } )
	0
757							{
758	0	0	0				if ( $data->{hit}->{ $hit }->{domain}->{ $domain }
759							->{domain_score} >= $instructions->{domain_bit_cutoff}
760							&& $data->{hit}->{ $hit }->{domain}->{ $domain }
761							->{domain_evalue} <=
762							$instructions->{domain_evalue_cutoff} )
763							{
764	0						$screened{ $hit } = $domain;
765							}
766							}
767							}
768							}
769
770							# Now that we have sequences aligned to hmm sequence, we have to translate
771							# this into a multiple alignment. Assign each position in each alignment to
772							# a position on the hmm 'sequence', and keep track of gaps in the hmm alignment
773	0						my %DIST;
774							my $ref;
775	0						foreach my $hit ( keys %screened ) {
776							$ref =
777	0						$data->{ 'hit' }->{ $hit }->{ 'domain' }->{ $screened{ $hit } };
778
779							# split aligned hmm seq and aligned protein seq into arrays
780	0						my @hmma = split / */, $ref->{hmm_seq};
781	0						my @prota = split / */, $ref->{prot_seq};
782
783							# these should be the same length. If not, there's an error.
784	0	0					if ( @hmma != @prota ) {
785	0						croak "Length of hmm alignment (" . @hmma . ")"
786							. " is not equal to protein alignment ("
787							. @prota . ")"
788							. ": $data->{query}/$data->{hit}->{$hit}->{accession}\n"
789							. "$ref->{hmm_seq}\n@hmma\n$ref->{prot_seq}\n@prota\n";
790							}
791
792							# assign each position in the protein alignment to its hmm alignment position,
793							# if one exists
794	0						my $hmm_pos = $ref->{hmm_f};
795	0						my $gap = 0;
796	0						for ( my $i = 0 ; $i < @hmma ; $i++ ) {
797	0	0					if ( $hmma[ $i ] ne "." ) {
798
799							# assign position in the protein alignment to its hmm alignment position,
800	0						$prota[ $i ] = $hmm_pos;
801
802							# record max gap distance between hmm alignment positions.
803							$DIST{ $hmm_pos } = $gap
804	0	0					if ( $gap >= $DIST{ $hmm_pos } );
805	0						$gap = 0;
806	0						$hmm_pos++;
807							}
808							else {
809	0						$gap++;
810							}
811							}
812	0						$ref->{aln_map} = \@prota;
813							}
814
815							# Now go back through (now that we've fully expanded the hmm alignment
816							# to include any and all gaps) and make aligned protein sequence
817	0						foreach my $hit ( keys %screened ) {
818							$ref =
819	0						$data->{ 'hit' }->{ $hit }->{ 'domain' }->{ $screened{ $hit } };
820	0						my @prot_seq = split / */, $ref->{prot_seq};
821
822							# start our aligned protein with any gap resulting from a partial HMM hit.
823	0						my $aln_prot = "." x ( $ref->{hmm_f} - 1 );
824	0						my $insert = 0;
825	0						for ( my $i = 0 ; $i < @prot_seq ; $i++ ) {
826
827							# grab the hmm alignment position for each protein alignment position
828	0						my $pos = $ref->{aln_map}->[ $i ];
829	0	0					if ( $pos =~ /\d+/ ) {
830
831							# if it maps to a position, first insert any gap from the hmm alignment
832	0						$aln_prot .= "." x ( $DIST{ $pos } - $insert );
833
834							# then add the aa.
835	0						$aln_prot .= $prot_seq[ $i ];
836	0						$insert = 0;
837							}
838
839							# if it is an insertion (ie the hmm alignment shows gap), insert a gap
840							else {
841	0						$aln_prot .= $prot_seq[ $i ];
842	0						$insert++;
843							}
844							}
845	0						$aln_prot =~ s/\-/\./g;
846	0						$ref->{aln_prot} = $aln_prot;
847							}
848
849							# Now print out in selected format
850							# Stockholm format
851	0	0	0				if ( $instructions->{file_format} eq "mul" ) {
		0
		0
852	0						print "# STOCKHOLM 1.0\n";
853	0						foreach my $hit (
854							sort {
855							$data->{ 'hit' }->{ $b }
856							->{ 'total_score' } <=> $data->{ 'hit' }->{ $a }
857	0						->{ 'total_score' }
858							} keys %screened
859							)
860							{
861	0						my $domain = $screened{ $hit };
862	0						my $hit_ref = $data->{hit}->{ $hit };
863	0						my $domain_ref = $hit_ref->{domain}->{ $domain };
864
865							# each line should look like:
866							# prot_acc/coord-coord sequence
867							printf "%-40s%s\n",
868							(
869							"$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}",
870							$domain_ref->{aln_prot}
871	0						);
872							}
873							}
874
875							# FASTA format
876							elsif ($instructions->{file_format} eq "fasta"
877							\|\| $instructions->{file_format} eq "fa" )
878							{
879	1			1		412	use TIGR::FASTA::Record;
	1					2
	1					580
880	0						foreach my $hit (
881							sort {
882							$data->{ 'hit' }->{ $b }
883							->{ 'total_score' } <=> $data->{ 'hit' }->{ $a }
884	0						->{ 'total_score' }
885							} keys %screened
886							)
887							{
888	0						my $domain = $screened{ $hit };
889	0						my $hit_ref = $data->{hit}->{ $hit };
890	0						my $domain_ref = $hit_ref->{domain}->{ $domain };
891	0						my $aln_prot = $domain_ref->{aln_prot};
892	0						$aln_prot =~ s/(.{60})/$1\n/g;
893	0						$aln_prot =~ s/\n+/\n/g;
894	0						chomp $aln_prot;
895	0						my $header =
896							">$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}\n";
897	0						print $header . $aln_prot . "\n";
898							}
899							}
900
901							# MSF format
902							elsif ( $instructions->{file_format} eq "msf" ) {
903	0						my $head_len = 40;
904	0						my ( %new_acc, %tmp_seq );
905	0						my $header_line = 0;
906	0						my @alignment;
907	0						foreach my $hit (
908							sort {
909							$data->{ 'hit' }->{ $b }
910							->{ 'total_score' } <=> $data->{ 'hit' }->{ $a }
911	0						->{ 'total_score' }
912							} keys %screened
913							)
914							{
915	0						my $domain = $screened{ $hit };
916	0						my $hit_ref = $data->{hit}->{ $hit };
917	0						my $domain_ref = $hit_ref->{domain}->{ $domain };
918	0						my $len = length( $domain_ref->{aln_prot} );
919	0	0					if ( $header_line == 0 ) {
920
921							#added DUMMY checksum for HMMER 2.2g hmmbuild. DHH.
922	0						print
923							"PileUp\n\n MSF: $len Type: P Check: 1111 ..\n\n";
924	0						$header_line = 1;
925							}
926
927							# print accession list at top
928	0						printf " Name: %-40s Len: $len Check: 0 Weight: 1.0\n",
929							"$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}";
930
931							# prepare alignment for bottom
932	0						my @tmp_pep = split //, $domain_ref->{aln_prot};
933	0						for ( my $i = 0 ; $i < ( $len / 50 ) ; $i++ ) {
934	0						$alignment[ $i ] .= sprintf "%-40s",
935							"$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}";
936	0						for ( my $b = 0 ; $b < 5 ; $b++ ) {
937	0						for ( my $a = 0 ; $a < 10 ; $a++ ) {
938	0						$alignment[ $i ] .=
939							$tmp_pep[ $a + ( $b * 10 ) + ( 50 * $i ) ];
940							}
941	0						$alignment[ $i ] .= " ";
942							}
943	0						$alignment[ $i ] .= "\n";
944							}
945							}
946	0						print "\n//\n\n\n";
947	0						foreach my $block ( @alignment ) {
948	0						print "$block\n\n";
949							}
950							}
951							else {
952	0						croak
953							"Don't recognize alignment file format $instructions->{file_format}\n";
954							}
955	0						select STDOUT;
956							}
957
958							sub get_cutoffs_for_hmm_accession {
959	0			0	0		my $dbh = shift;
960	0						my $accession = shift;
961	0						my $hmm_q =
962							"select trusted_cutoff, trusted_cutoff2, noise_cutoff from egad..hmm2 where hmm_acc = '$accession'";
963	0						return $dbh->selectrow_hashref( $hmm_q );
964							}
965
966							1; # End of TIGR::HmmTools