File Coverage

blib/lib/TIGR/HmmTools.pm
Criterion Covered Total %
statement 17 422 4.0
branch 0 190 0.0
condition 0 33 0.0
subroutine 6 14 42.8
pod 0 7 0.0
total 23 666 3.4


line stmt bran cond sub pod time code
1             package TIGR::HmmTools;
2             require Exporter;
3 1     1   52796 use 5.006;
  1         3  
4 1     1   4 use strict;
  1         2  
  1         19  
5 1     1   8 use warnings;
  1         7  
  1         53  
6              
7             =head1 NAME
8              
9             TIGR::HmmTools - Functions for processing files from HMMer
10              
11             =head1 VERSION
12              
13             Version 0.01
14              
15             =cut
16              
17             our $VERSION = '0.03';
18              
19              
20             =head1 SYNOPSIS
21              
22             Ages ago developers at The Institute for Genomic Research (RIP) produced code to process
23             raw alignment files from the HMMer suite. Many of these functions were embedded within a
24             variety of scripts which do things like format conversion. Installing this package also
25             adds the utility script, 'convert_hmmscan_to_htab.pl'
26              
27             =head1 EXPORT
28              
29             A list of functions that can be exported. You can delete this section
30             if you don't export anything, such as for a purely object-oriented module.
31             The following functions are exported:
32              
33             - read_hmmer3_output
34             - read_hmmer2_output
35             - print_htab
36             - hmm_database_info
37             - get_cutoffs_for_hmm_accession
38             - build_alignment
39             - read_hmmer3_output2
40            
41             =head1 AUTHOR
42              
43             Joshua Orvis, C<< >>
44              
45             The actual author of the code is currently unknown. I just packaged it for CPAN.
46              
47             =head1 BUGS
48              
49             Please report any bugs or feature requests to C, or through
50             the web interface at L. I will be notified, and then you'll
51             automatically be notified of progress on your bug as I make changes.
52              
53             =head1 SUPPORT
54              
55             You can find documentation for this module with the perldoc command.
56              
57             perldoc TIGR::HmmTools
58              
59              
60             You can also look for information at:
61              
62             =over 4
63              
64             =item * RT: CPAN's request tracker (report bugs here)
65              
66             L
67              
68             =item * AnnoCPAN: Annotated CPAN documentation
69              
70             L
71              
72             =item * CPAN Ratings
73              
74             L
75              
76             =item * Search CPAN
77              
78             L
79              
80             =back
81              
82              
83             =head1 ACKNOWLEDGEMENTS
84              
85              
86             =head1 LICENSE AND COPYRIGHT
87              
88             Copyright 2017 Joshua Orvis.
89              
90             This program is free software; you can redistribute it and/or modify it
91             under the terms of the the Artistic License (2.0). You may obtain a
92             copy of the full license at:
93              
94             L
95              
96             Any use, modification, and distribution of the Standard or Modified
97             Versions is governed by this Artistic License. By using, modifying or
98             distributing the Package, you accept this license. Do not use, modify,
99             or distribute the Package, if you do not accept this license.
100              
101             If your Modified Version has been derived from a Modified Version made
102             by someone other than you, you are nevertheless required to ensure that
103             your Modified Version complies with the requirements of this license.
104              
105             This license does not grant you the right to use any trademark, service
106             mark, tradename, or logo of the Copyright Holder.
107              
108             This license includes the non-exclusive, worldwide, free-of-charge
109             patent license to make, have made, use, offer to sell, sell, import and
110             otherwise transfer the Package with respect to any patent claims
111             licensable by the Copyright Holder that are necessarily infringed by the
112             Package. If you institute patent litigation (including a cross-claim or
113             counterclaim) against any party alleging that the Package constitutes
114             direct or contributory patent infringement, then this Artistic License
115             to you shall terminate on the date that such litigation is filed.
116              
117             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
118             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
119             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
120             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
121             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
122             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
123             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
124             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
125              
126              
127             =cut
128              
129 1     1   5 use Carp;
  1         2  
  1         54  
130 1     1   440 use Data::Dumper;
  1         6228  
  1         4374  
131             our @ISA = qw (Exporter);
132             our @EXPORT =
133             qw(read_hmmer3_output read_hmmer2_output print_htab hmm_database_info get_cutoffs_for_hmm_accession build_alignment read_hmmer3_output2);
134             our @EXPORT_OK = qw ();
135              
136             sub read_hmmer3_output {
137 0     0 0   my ($path) = @_;
138 0           my $retval = {};
139              
140 0           my $in_result = 0;
141 0           my $in_hit_scores = 0;
142 0           my $in_domain_scores = 0;
143              
144 0 0         if ( $path ne '' ) {
145 0           chomp $path;
146 0           my @statd = stat $path;
147 0           $retval->{'info'}->{ 'search_date' } = ( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-"
148             . ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-"
149             . ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 );
150             } else {
151 0           $retval->{'info'}->{ 'search_date' } = ( ( localtime )[ 3 ] ) . "-"
152             . ( ( localtime )[ 4 ] + 1 ) . "-"
153             . ( ( localtime )[ 5 ] + 1900 );
154             }
155              
156 0 0         open(my $fh, "< $path") or die("Unable to open $path: $!");
157 0           while( my $line = <$fh> ) {
158 0 0 0       next if( $line =~ /^\s*$/ || $line =~ /inclusion threshold/ );
159              
160             # The program line
161 0 0         if ( $line =~ /^\#\s*((hmmscan)\.*)/ ) {
    0          
    0          
    0          
    0          
162 0           $retval->{'info'}->{'program'} = $1;
163              
164             # The version
165             } elsif( $line =~ /^\#\s*HMMER ([\d\.]+)\s+\(([^\)]+)\)/ ) {
166 0           ( $retval->{'info'}->{'version'}, $retval->{'info'}->{'release'} ) = ( $1, $2 );
167              
168             # The hmm database searched
169             } elsif( $line =~ /^\#\s+target HMM database:\s+(\S+)/ ) {
170 0           $retval->{'info'}->{'hmm_file'} = $1;
171              
172             # The query file
173             } elsif( $line =~ /^\#\squery sequence (file|database):\s+(.+)/ ) {
174 0           $retval->{'info'}->{'sequence_file'} = $2;
175              
176             # This indicates we're parsing a hit.
177             } elsif( $line =~ /^Query\:\s+(\S+)/ ) {
178 0           my $data = &_parse_hmmpfam3_hit( $fh );
179 0           $retval->{'queries'}->{$1} = $data;
180              
181             }
182             }
183              
184 0           close($fh);
185 0           return $retval;
186             }
187              
188             sub _parse_hmmpfam3_hit {
189 0     0     my ($fh) = @_;
190 0           my $data = {};
191            
192 0           my $in_hit_scores = 0;
193 0           my $in_domain_scores = 0;
194 0           my $hit_acc;
195            
196 0           while( my $line = <$fh> ) {
197 0           chomp($line);
198 0 0         last if( $line =~ m|//| );
199 0 0         next if( $line =~ /^\s*$/ );
200              
201 0 0 0       if( $line =~ /^Domain annotation/ ) {
    0 0        
    0 0        
    0 0        
    0          
    0          
202 0           $in_hit_scores = 0;
203              
204             } elsif( $in_hit_scores && $line !~ /^\s+--/ && $line !~ /inclusion_threshold/ ) {
205 0 0         if( $line =~ /No hits detected/ ) {
206 0           $data->{'hits'} = {};
207 0           last;
208             }
209            
210 0           my @c = split(/\s+/, $line);
211 0           my $t_hit_acc = $c[9];
212 0           $data->{'hits'}->{$t_hit_acc}->{total_evalue} = $c[1];
213 0           $data->{'hits'}->{$t_hit_acc}->{total_score} = $c[2];
214 0           $data->{'hits'}->{$t_hit_acc}->{accession} = $c[9];
215 0           $data->{'hits'}->{$t_hit_acc}->{hit_description} = join(" ",@c[10..(@c-1)]);
216 0           $data->{'hits'}->{$t_hit_acc}->{domain_count} = $c[8];
217 0           $data->{'hits'}->{$t_hit_acc}->{frame} = "";
218             } elsif( $line =~ /^>>\s*(\S+)/ ) {
219 0           $in_domain_scores = 1;
220 0           $hit_acc = $1;
221            
222             } elsif( $in_domain_scores && $line =~ /Alignments for each/ ) {
223 0           undef $hit_acc;
224 0           $in_domain_scores = 0;
225              
226             } elsif( $in_domain_scores && $line !~ /^\s*[>\#-]/ ) {
227 0 0         die("Didn't parse hit accession from header line before getting to domain table")
228             unless( $hit_acc );
229 0 0         die("The hit accession [$hit_acc] didn't exist in lookup") unless( exists( $data->{'hits'}->{$hit_acc} ) );
230              
231 0 0         if( $line =~ /\[No individual domains that/ ) {
232 0           $data->{'hits'}->{$hit_acc}->{'domains'} = {};
233 0           $in_domain_scores = 0;
234 0           next;
235             }
236              
237 0           my @c = split(/\s+/, $line);
238              
239 0 0 0       if( $c[1] eq 'targets' || $c[1] eq 'reported' || $c[1] eq 'Fwd') {
      0        
240 0           print Dumper( $data->{'hits'}->{$hit_acc} );
241 0           print "LINE: $line\n";
242 0           print Dumper( @c );
243 0           die("Issue parsing");
244             }
245            
246 0           my $a = $c[1];
247 0           my $b = $a + 0;
248 0 0         if( $b ne $a ) {
249 0           print Dumper( $data->{'hits'}->{$hit_acc} );
250 0           print "LINE: $line\n";
251 0           print Dumper( \@c );
252 0           die("c[1] not numeric");
253             }
254            
255            
256 0           $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'seq_f'} = $c[10];
257 0           $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'seq_t'} = $c[11];
258 0           $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'hmm_f'} = $c[7];
259 0           $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'hmm_t'} = $c[8];
260 0           $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'domain_score'} = $c[3];
261 0           $data->{'hits'}->{$hit_acc}->{'domains'}->{$c[1]}->{'domain_evalue'} = $c[6];
262             } elsif( $line =~ /^\s+E-value\s+score/ ) {
263 0           $in_hit_scores = 1;
264             }
265             }
266            
267 0           return $data;
268             }
269              
270             ## Subroutine to parse hmmscan (Hmmer3.0) output
271             sub read_hmmer3_output_old {
272 0     0 0   my $path = shift;
273 0           my $data = {};
274 0           my @lines;
275 0 0         if ( $path ne '' ) {
276 0           chomp $path;
277 0           my @statd = stat $path;
278 0           $data->{ 'search_date' } = ( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-"
279             . ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-"
280             . ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 );
281 0 0         open( FH, "$path" ) || die "Can't open $path for reading: $!\n";
282 0           chomp( @lines = );
283 0           close FH;
284             } else {
285 0           chomp( @lines = );
286 0           $data->{ 'search_date' } = ( ( localtime )[ 3 ] ) . "-"
287             . ( ( localtime )[ 4 ] + 1 ) . "-"
288             . ( ( localtime )[ 5 ] + 1900 );
289             }
290 0 0         if ( !@lines ) {
291 0           carp "No data read from input $path";
292 0           return undef;
293             }
294 0           my $i = 0;
295 0           while ($i < @lines) {
296 0 0         if ( $lines[ $i ] =~ /^#\s*((hmmscan)\.*)/ ) {
297 0           $data->{ 'program' } = $1;
298 0           my $version_line = $lines[ ++$i ];
299 0           $version_line =~ /^#\s*HMMER (\d+)\.(\S+)/;
300 0           ( $data->{ 'version' }, $data->{ 'release' } ) = ( $1, $2 );
301 0           $i +=2;
302 0           last;
303             }
304 0           ++$i;
305             }
306 0           until ( $lines[ $i ] =~ /^\s*$/ ) {
307 0 0         if ( $lines[ $i ] =~ /^#\s*target HMM database:\s+(\S+)/ ) {
    0          
308 0           $data->{ 'hmm_file' } = $1;
309             } elsif ( $lines[ $i ] =~ /^#\s*query sequence (file|database):\s+(.+)/ ) {
310 0           $data->{ 'sequence_file' } = $2;
311             }
312 0           $i++;
313 0 0         die "Failure to parse" if $i > @lines;
314             }
315 0           $i++;
316 0 0         if ( $lines[ $i ] =~ /^Query:\s+(\S+)/ ) {
317 0           $data->{ 'query' } = $1;
318 0           print "Found query: $1\n";
319 0           $i++;
320             }
321 0           until ( $lines[$i] =~ /^\s*$/ ) {
322 0 0         if ( $lines[ $i ] =~ /^Scores for/ ) {
    0          
323 0           $i++; # this skips the separator row
324 0           my $headers = $lines[ ++$i ];
325 0           $i++; # this skips the separator row
326             } elsif ( $lines[ $i ] !~ /No hits detected that satisfy reporting thresholds/i ) {
327 0 0         $i++ if ($lines[$i] =~ /inclusion threshold/g);
328 0           my @c = split /\s+/, $lines[ $i ],11;
329 0           my $hit_index = $c[9];
330 0           $data->{hit}{$hit_index}{total_evalue} = $c[1];
331 0           $data->{hit}{$hit_index}{total_score} = $c[2];
332 0           $data->{hit}{$hit_index}{accession} = $c[9];
333 0           $data->{hit}{$hit_index}{hit_description} = $c[10];
334 0           $data->{hit}{$hit_index}{domain_count} = $c[8];
335 0           $data->{hit}{$hit_index}{frame} = "";
336             } else {
337 0           return $data;
338             }
339 0           $i++;
340 0 0         die "Failure to parse" if $i > @lines;
341             }
342 0           $i++;
343 0 0         if ( $lines[ $i ] =~ /^Domain annotation for each model/ ) {
344 0           $i++;
345             }
346 0           until ($lines[$i] =~ /^\/\/$/) {
347 0 0         if ( $lines[ $i ] !~ /No targets detected that satisfy reporting thresholds/ ) {
348 0 0         if($lines[$i] =~ />>/) {
349 0           my @c = split /\s+/, $lines[ $i ];
350 0           my $hit_index = $c[ 1 ];
351 0           $i += 3;
352              
353 0 0         if ( !defined $data->{ 'hit' }->{ $hit_index } ) {
354 0           warn "Why doesn't '$hit_index' match an existing identifier?";
355             } else {
356 0           until ($lines[$i] =~ /^\s*$/) {
357 0           my @res = split /\s+/, $lines[ $i ];
358 0           $data->{hit}{$hit_index}{domain}{$res[1]}{seq_f} = $res[10];
359 0           $data->{hit}{$hit_index}{domain}{$res[1]}{seq_t} = $res[11];
360 0           $data->{hit}{$hit_index}{domain}{$res[1]}{hmm_f} = $res[7];
361 0           $data->{hit}{$hit_index}{domain}{$res[1]}{hmm_t} = $res[8];
362 0           $data->{hit}{$hit_index}{domain}{$res[1]}{domain_score} = $res[3];
363 0           $data->{hit}{$hit_index}{domain}{$res[1]}{domain_evalue} = $res[6];
364 0           $i++;
365             }
366             }
367             }
368             } else {
369 0           return $data;
370             }
371 0           $i++;
372             }
373 0           return $data;
374             }
375             ## Use with caution. Or just don't use this method.
376             sub read_hmmer2_output {
377 0     0 0   my $path = shift;
378 0           my $data = {};
379             # die "Bad Parser under reconstruction.\n\n";
380             # drink in data
381 0           my @lines;
382              
383             # drink in the output from file or stdin
384 0 0         if ( $path ne '' ) {
385 0           chomp $path;
386 0           my @statd = stat $path;
387 0           $data->{ 'search_date' } =
388             ( ( localtime( $statd[ 9 ] ) )[ 3 ] ) . "-"
389             . ( ( localtime( $statd[ 9 ] ) )[ 4 ] + 1 ) . "-"
390             . ( ( localtime( $statd[ 9 ] ) )[ 5 ] + 1900 );
391 0 0         open( FH, "$path" )
392             || die "Can't open $path for reading: $!\n";
393 0           chomp( @lines = );
394 0           close FH;
395             }
396             else {
397 0           chomp( @lines = );
398 0           $data->{ 'search_date' } =
399             ( ( localtime )[ 3 ] ) . "-"
400             . ( ( localtime )[ 4 ] + 1 ) . "-"
401             . ( ( localtime )[ 5 ] + 1900 );
402             }
403 0 0         if ( !@lines ) {
404 0           carp "No data read from input $path";
405 0           return undef;
406             }
407 0           my $i = 0;
408              
409             # first line grouping is company, package and license info
410             # warn "Parsing License. Line $i\n";
411             # amahurkar:1/15/08 Seems like the current output does not have licesning info, so commenting this
412             # so commenting out these lines
413             #until ( $lines[ $i ] eq "" )
414             # $data->{ 'header' } .= $lines[ $i ] . "\n";
415             # $i++;
416              
417             #$i++;
418              
419             # next group is program and version
420             # warn "Parsing Program and version. Line $i\n";
421             # amahurkar:1/15/08 the format has changed and now there is no blank space
422             # after program name, so we are using '- -' as tha match param
423             # to stop parsing for program name
424             #until ( $lines[ $i ] eq "" )
425             #until ( $lines[ $i ] =~ m/^-\s-/) # It doesn't work with LDhmmpfam v1.5.4
426             #until ( $lines[ $i ] =~ m/^-{8}\s+-/)
427 0           while ($i < @lines) {
428 0 0         if ( $lines[ $i ] =~ /^((hmmpfam|hmmsearch)\.*)/ ) {
429 0           $data->{ 'program' } = $1;
430 0           my $version_line = $lines[ ++$i ];
431 0           $version_line =~ /^HMMER (\d+)\.(\S+)/;
432 0           ( $data->{ 'version' }, $data->{ 'release' } ) = ( $1, $2 );
433 0           $i +=2;
434 0           last;
435             }
436 0           ++$i;
437             }
438              
439             # next group is program parameters
440             # warn "Parsing Parameters. Line $i\n";
441             # amahurkar:1/15/08 the format has changed and now there is no blank space
442             # after program name, so we are using '- -' as tha match param
443             # to stop parsing for program parameters
444 0           until ( $lines[ $i ] =~ /^\s*$/ ) {
445 0 0         if ( $lines[ $i ] =~ /^HMM file:\s+(\S+)/ ) {
    0          
    0          
    0          
    0          
    0          
446 0           $data->{ 'hmm_file' } = $1;
447             } elsif ( $lines[ $i ] =~ /^Sequence (file|database):\s+(.+)/ ) {
448 0           $data->{ 'sequence_file' } = $2;
449             } elsif ( $lines[ $i ] =~ /^per-sequence score cutoff:\s+(.+)/ ) {
450 0           $data->{ 'total_score_cutoff' } = $1;
451             } elsif ( $lines[ $i ] =~ /^per-domain score cutoff:\s+(.+)/ ) {
452 0           $data->{ 'domain_score_cutoff' } = $1;
453             } elsif ( $lines[ $i ] =~ /^per-sequence E-value cutoff:\s+(.+)/ ) {
454 0           $data->{ 'total_evalue_cutoff' } = $1;
455             } elsif ( $lines[ $i ] =~ /^per-domain E-value cutoff:\s+(.+)/ ) {
456 0           $data->{ 'domain_evalue_cutoff' } = $1;
457             }
458 0           $i++;
459 0 0         die "Failure to parse" if $i > @lines;
460             }
461 0           $i++;
462              
463             # get query info
464             # warn "Parsing Query Info. Line $i\n";
465             #until ( $lines[ $i ] eq "" )
466 0           until ( $lines[$i] =~ /^\s*$/ ) {
467 0 0         if ( $lines[ $i ] =~ /^Query (?:HMM|sequence):\s+(.+)/ ) {
    0          
    0          
468 0           $data->{ 'query' } = $1;
469             }
470             elsif ( $lines[ $i ] =~ /^Accession:\s+(.+)/ ) {
471 0           $data->{ 'query_accession' } = $1;
472             }
473             elsif ( $lines[ $i ] =~ /^Description:\s+(.+)/ ) {
474 0           $data->{ 'query_description' } = $1;
475             }
476 0           $i++;
477 0 0         die "Failure to parse" if $i > @lines;
478             }
479 0           $i++;
480              
481             # next section is global search results
482             # warn "Parsing Global Search Results. Line $i\n";
483 0           my $find_frame = 0; # is datbase nucleotide sequence?
484 0           my $hit_index;
485 0           until ( $lines[$i] =~ /^\s*$/ ) {
486 0 0         if ( $lines[ $i ] =~ /^Scores for/ ) {
    0          
487 0           my $headers = $lines[ ++$i ];
488 0 0         if ( $headers =~ /\bFr\b/ ) {
489 0           $data->{ 'program' } .= "-frames";
490 0           $find_frame = 1;
491             }
492 0           $i++; # this skips the separator row
493             } elsif ( $lines[ $i ] !~ /no hits above thresholds/i ) {
494 0           my @c = split /\s+/, $lines[ $i ];
495 0 0         if ( $find_frame ) {
496 0           $hit_index = $c[ 0 ] . $c[ $#c ];
497 0           $data->{ 'hit' }->{ $hit_index }->{ 'frame' } = pop @c;
498             }
499             else {
500             # $hit_index = $c[ 0 ]; # AP 20090807
501 0           ($hit_index = $c[0]) =~ s/\.\d+$//;
502             }
503 0           $data->{hit}{$hit_index}{accession} = shift @c;
504 0           $data->{hit}{$hit_index}{domain_count} = pop @c;
505 0           $data->{hit}{$hit_index}{total_evalue} = pop @c;
506 0           $data->{hit}{$hit_index}{total_score} = pop @c;
507 0           $data->{hit}{$hit_index}{hit_description} = join " ", @c;
508             }
509             else {
510 0           return $data;
511             }
512 0           $i++;
513 0 0         die "Failure to parse" if $i > @lines;
514             }
515 0           $i++;
516              
517             # next section is domain breakdown
518             # warn "Parsing Domain Breakdown. Line $i\n";
519             #until ( $lines[ $i ] eq "" )
520 0           until ($lines[$i] =~ /^\s*$/) {
521 0 0         if ( $lines[ $i ] =~ /^Parsed for domains/ ) {
    0          
522 0           $i += 2; # to skip header and separator
523             } elsif ( $lines[ $i ] !~ /no hits above thresholds/ ) {
524              
525             # $lines[$i] =~ /^(\w+)\s+(\d+)\/\d+\s+(\d+)\s+(\d+)\s[\[\.\]]{2}\s+(\d+)\s+(\d+)\s[\[\.\]]{2}\s+(-?[\.\d]+)\s+([\.\-e\d]+)\s*(\-?\d)?/;
526 0           my @c = split /\s+/, $lines[ $i ];
527 0 0         if ( $find_frame ) {
528 0           $hit_index = $c[ 0 ] . $c[ $#c ];
529             }
530             else {
531             #$hit_index = $c[ 0 ]; # AP 20090807
532 0           ($hit_index = $c[0]) =~ s/\.\d+$//;
533             }
534 0 0         if ( !defined $data->{ 'hit' }->{ $hit_index } ) {
535 0           warn
536             "Why doesn't '$hit_index' match an existing identifier?";
537             }
538              
539             # $data->{'hit'}->{$hit_index}->{'domain'}->{$2}->{'seq_f'} = $3;
540 0           my ( $d, $t ) = split /\//, $c[ 1 ];
541 0           $data->{hit}{$hit_index}{domain}{$d}{seq_f} = $c[2];
542 0           $data->{hit}{$hit_index}{domain}{$d}{seq_t} = $c[3];
543 0           $data->{hit}{$hit_index}{domain}{$d}{hmm_f} = $c[5];
544 0           $data->{hit}{$hit_index}{domain}{$d}{hmm_t} = $c[6];
545 0           $data->{hit}{$hit_index}{domain}{$d}{domain_score} = $c[8];
546 0           $data->{hit}{$hit_index}{domain}{$d}{domain_evalue} = $c[9];
547             }
548 0           $i++;
549 0 0         die "Failure to parse" if $i > @lines;
550             }
551 0           $i++;
552 0 0         if ( $data->{ 'program' } =~ /hmmsearch/ ) {
553              
554             # next section is alignments
555             # warn "Parsing Alignments. Line $i\n";
556 0 0         if ( $lines[ $i ] =~ /^Alignments of top-scoring domains/ ) {
557 0           $i++;
558             ## kgalens [11/13/2012]
559             ## Not sure how this ever worked? $domain isn't in scope here.
560             ## I've added use strict; and this module wouldn't. So adding the next line
561             ## just so it will run.
562 0           my $domain;
563             my $hit;
564 0           my $hit_index;
565             ## END kgalens [11/13/2012]
566              
567 0           until ( $lines[$i] =~ /^\s*$/ ) {
568 0 0         if ( $lines[ $i ] =~ /^(\S+): domain (\d+)/ ) {
569 0           ($hit_index, $hit, $domain ) = ( $1, $1, $2 );
570 0           $hit =~ s/(.{10}).*/$1/;
571             # warn " Parsing hit $hit_index. Line $i\n";
572 0 0         if ( $find_frame ) {
573 0 0 0       if ( $lines[ $i ] =~ /Fr = ([\-\d]+)/ || $lines[ $i ] =~ /\. frame ([\-\d]+)/ ) {
574 0           $hit_index .= $1;
575             } else {
576 0           warn "ERROR: Couldn't find frame from:\n '$lines[$i]'";
577             }
578             }
579 0 0         if ( !defined $data->{ 'hit' }->{ $hit_index } ) {
580 0           warn "Why doesn't '$hit_index' match an existing identifier?";
581             }
582 0           $i++;
583             }
584 0 0         if ( $lines[ $i ] =~ /\bRF\b/ ) { ## WHY??
585 0           $i++;
586             }
587              
588             # capture aligned hmm consensus
589 0           my $hmm_seq = $lines[ $i ];
590 0           $hmm_seq =~ s/\s+//g;
591 0           $hmm_seq =~ s/[\*\-\>\<]//g;
592              
593 0           $data->{ 'hit' }->{ $hit_index }->{ 'domain' }->{ $domain }->{hmm_seq} .= $hmm_seq;
594 0           until ( $lines[ $i ] =~ /^\s+\Q$hit\E/) { ## changed from /\b\Q$hit\E\b/ )
595 0           $i++;
596             }
597 0           my $prot_seq = $lines[ $i ];
598 0 0         if ( $prot_seq =~ /\w+\s+(\d+|\-)\s+(\S+)\s+(\d+|\-)/ ) {
599 0           $data->{ 'hit' }->{ $hit_index }->{ 'domain' }->{ $domain }->{prot_seq} .= $2;
600             }
601 0           $i += 2; # skip the blank line and move on to the next
602 0 0         die "Failure to parse" if $i > @lines;
603             }
604             }
605 0           $i++;
606              
607             # next (last) section is statistics
608             # warn "Parsing Statistics. Line $i\n";
609 0           my @data;
610 0           while ( $i < @data ) {
611 0 0         if ( $lines[ $i ] =~ /^\s+mu =\s+(-?\d+)/ ) {
    0          
    0          
    0          
    0          
    0          
612 0           $data->{ 'mu' } = $1;
613             } elsif ( $lines[ $i ] =~ /^\s+lambda =\s(-?\d+)/ ) {
614 0           $data->{ 'lambda' } = $1;
615             } elsif ( $lines[ $i ] =~ /chi-sq statistic =\s(\d+)/ ) {
616 0           $data->{ 'chisq' } = $1;
617             } elsif ( $lines[ $i ] =~ /Total sequences searched:\s*(\d+)/ ) {
618 0           $data->{ 'tot_seq_searched' } = $1;
619             } elsif ( $lines[ $i ] =~ /Whole sequence top hits/ ) {
620 0           $lines[ ++$i ] =~ /(\d+)/;
621 0           $data->{ 'total_hits' } = $1;
622 0           $lines[ ++$i ] =~ /(\d+)/;
623 0           $data->{ 'total_hits_above_evalue_cutoff' } = $1;
624             } elsif ( $lines[ $i ] =~ /Domain top hits/ ) {
625 0           $lines[ ++$i ] =~ /(\d+)/;
626 0           $data->{ 'domain_hits' } = $1;
627 0           $lines[ ++$i ] =~ /(\d+)/;
628 0           $data->{ 'domain_hits_above_evalue_cutoff' } = $1;
629             }
630 0           $i++;
631 0 0         die "Failure to parse" if $i > @lines;
632             }
633             }
634 0           return $data;
635             }
636              
637             sub hmm_database_info {
638 0     0 0   my $dbh = shift;
639 0           my $hmm_q =
640             "SELECT hmm_acc, hmm_len, trusted_cutoff, noise_cutoff, hmm_com_name,"
641             . " trusted_cutoff2, noise_cutoff2, gathering_cutoff, gathering_cutoff2"
642             . " FROM hmm2"
643             . " WHERE is_current = 1";
644 0           my $HMM = $dbh->selectall_hashref( $hmm_q, 'hmm_acc' );
645 0           return $HMM;
646             }
647              
648             sub print_htab {
649             #NOTE: This will produce results if the data hash was created through the 'read_hmmer3_output' subroutine
650             # This is because of a change in the naming of some of the property keys (ex 'hit' to 'hits')
651 0     0 0   my $data = shift;
652 0           my $HMM = shift;
653 0           my $output = shift;
654 0           foreach my $qry_id ( keys %{$data->{'queries'}} ) {
  0            
655 0           foreach my $hit (
656             sort {
657             $data->{'queries'}->{$qry_id}->{'hits'}->{ $b }->{ 'total_score' } <=> $data->{'queries'}->{$qry_id}->
658 0           {'hits'}->{ $a }->{ 'total_score' }
659 0           } keys %{ $data->{'queries'}->{$qry_id}->{'hits'} } )
660             {
661 0           my $h = $data->{'queries'}->{$qry_id}->{'hits'}->{ $hit };
662 0 0         next if (scalar keys %{$h->{'domains'}} == 0); #skip model hits that have no domain hits
  0            
663 0           foreach my $domain ( sort { $a <=> $b } keys %{ $h->{ 'domains' } } )
  0            
  0            
664             {
665             # for convenience
666 0           my $dh = $h->{ 'domains' }->{ $domain };
667 0 0         if ( $data->{'info'}->{ 'program' } =~ /hmmsearch/ ) { #hmmer2 is currently deprecated so this will probably error
    0          
668             my $hmm_com_name =
669             $HMM->{ $data->{ 'query' } }->{ 'hmm_com_name' }
670             ? $HMM->{ $data->{ 'query' } }->{ 'hmm_com_name' }
671 0 0         : $data->{ 'query_description' };
672 0           print $output "$data->{query}"
673             . "\t$data->{search_date}"
674             . "\t$HMM->{$data->{query}}->{hmm_len}"
675             . "\t$data->{program}"
676             . "\t$data->{sequence_file}"
677             . "\t$h->{accession}"
678             . "\t$dh->{hmm_f}"
679             . "\t$dh->{hmm_t}"
680             . "\t$dh->{seq_f}"
681             . "\t$dh->{seq_t}"
682             . "\t$h->{frame}"
683             . "\t$dh->{domain_score}"
684             . "\t$h->{total_score}"
685             . "\t$domain"
686             . "\t$h->{domain_count}"
687             . "\t$hmm_com_name"
688             . "\t$h->{hit_description}"
689             . "\t$HMM->{$data->{query}}->{trusted_cutoff}"
690             . "\t$HMM->{$data->{query}}->{noise_cutoff}"
691             . "\t$h->{total_evalue}"
692             . "\t$dh->{domain_evalue}"
693             . "\t$HMM->{$data->{query}}->{trusted_cutoff2}"
694             . "\t$HMM->{$data->{query}}->{noise_cutoff2}"
695             . "\t$HMM->{$data->{query}}->{gathering_cutoff}"
696             . "\t$HMM->{$data->{query}}->{gathering_cutoff2}" . "\n";
697             }
698             elsif ( $data->{'info'}->{ 'program' } =~ /hmmscan|hmmpfam/ ) {
699             my $hmm_com_name =
700             $HMM->{ $hit }->{ 'hmm_com_name' }
701             ? $HMM->{ $hit }->{ 'hmm_com_name' }
702 0 0         : $h->{ 'hit_description' };
703 0           print $output "$h->{accession}"
704             . "\t$data->{'info'}->{search_date}"
705             . "\t$HMM->{$hit}->{hmm_len}"
706             . "\t$data->{'info'}->{program}"
707             . "\t$data->{'info'}->{hmm_file}"
708             . "\t$qry_id"
709             . "\t$dh->{hmm_f}"
710             . "\t$dh->{hmm_t}"
711             . "\t$dh->{seq_f}"
712             . "\t$dh->{seq_t}"
713             . "\t$h->{frame}"
714             . "\t$dh->{domain_score}"
715             . "\t$h->{total_score}"
716             . "\t$domain"
717             . "\t$h->{domain_count}"
718             . "\t$hmm_com_name"
719             . "\t$h->{hit_description}"
720             . "\t$HMM->{$h->{accession}}->{trusted_cutoff}"
721             . "\t$HMM->{$h->{accession}}->{noise_cutoff}"
722             . "\t$h->{total_evalue}"
723             . "\t$dh->{domain_evalue}"
724             . "\t$HMM->{$h->{accession}}->{trusted_cutoff2}"
725             . "\t$HMM->{$h->{accession}}->{noise_cutoff2}"
726             . "\t$HMM->{$h->{accession}}->{gathering_cutoff}"
727             . "\t$HMM->{$h->{accession}}->{gathering_cutoff2}" . "\n";
728             }
729             }
730             }
731             }
732             }
733              
734             sub build_alignment {
735 0     0 0   my $data = shift;
736 0           my $instructions = shift;
737              
738             # build output file name
739 0           my $output_file;
740             $output_file =
741 0           $instructions->{file_prefix} . "." . $instructions->{file_format};
742 0 0         open my $OUT, ">$output_file"
743             or croak "Can't open '$output_file' as output file: $!\n";
744 0           select $OUT;
745              
746             # retrieve aligned sequences
747 0           my %screened;
748 0           foreach my $hit ( keys %{ $data->{ 'hit' } } ) {
  0            
749              
750             # screen for total score cutoffs
751 0 0 0       if ( $data->{hit}->{ $hit }->{total_score} >=
752             $instructions->{total_bit_cutoff}
753             && $data->{hit}->{ $hit }->{total_evalue} <=
754             $instructions->{total_evalue_cutoff} )
755             {
756 0           foreach my $domain ( keys %{ $data->{hit}->{ $hit }->{domain} } )
  0            
757             {
758 0 0 0       if ( $data->{hit}->{ $hit }->{domain}->{ $domain }
759             ->{domain_score} >= $instructions->{domain_bit_cutoff}
760             && $data->{hit}->{ $hit }->{domain}->{ $domain }
761             ->{domain_evalue} <=
762             $instructions->{domain_evalue_cutoff} )
763             {
764 0           $screened{ $hit } = $domain;
765             }
766             }
767             }
768             }
769              
770             # Now that we have sequences aligned to hmm sequence, we have to translate
771             # this into a multiple alignment. Assign each position in each alignment to
772             # a position on the hmm 'sequence', and keep track of gaps in the hmm alignment
773 0           my %DIST;
774             my $ref;
775 0           foreach my $hit ( keys %screened ) {
776             $ref =
777 0           $data->{ 'hit' }->{ $hit }->{ 'domain' }->{ $screened{ $hit } };
778              
779             # split aligned hmm seq and aligned protein seq into arrays
780 0           my @hmma = split / */, $ref->{hmm_seq};
781 0           my @prota = split / */, $ref->{prot_seq};
782              
783             # these should be the same length. If not, there's an error.
784 0 0         if ( @hmma != @prota ) {
785 0           croak "Length of hmm alignment (" . @hmma . ")"
786             . " is not equal to protein alignment ("
787             . @prota . ")"
788             . ": $data->{query}/$data->{hit}->{$hit}->{accession}\n"
789             . "$ref->{hmm_seq}\n@hmma\n$ref->{prot_seq}\n@prota\n";
790             }
791              
792             # assign each position in the protein alignment to its hmm alignment position,
793             # if one exists
794 0           my $hmm_pos = $ref->{hmm_f};
795 0           my $gap = 0;
796 0           for ( my $i = 0 ; $i < @hmma ; $i++ ) {
797 0 0         if ( $hmma[ $i ] ne "." ) {
798              
799             # assign position in the protein alignment to its hmm alignment position,
800 0           $prota[ $i ] = $hmm_pos;
801              
802             # record max gap distance between hmm alignment positions.
803             $DIST{ $hmm_pos } = $gap
804 0 0         if ( $gap >= $DIST{ $hmm_pos } );
805 0           $gap = 0;
806 0           $hmm_pos++;
807             }
808             else {
809 0           $gap++;
810             }
811             }
812 0           $ref->{aln_map} = \@prota;
813             }
814              
815             # Now go back through (now that we've fully expanded the hmm alignment
816             # to include any and all gaps) and make aligned protein sequence
817 0           foreach my $hit ( keys %screened ) {
818             $ref =
819 0           $data->{ 'hit' }->{ $hit }->{ 'domain' }->{ $screened{ $hit } };
820 0           my @prot_seq = split / */, $ref->{prot_seq};
821              
822             # start our aligned protein with any gap resulting from a partial HMM hit.
823 0           my $aln_prot = "." x ( $ref->{hmm_f} - 1 );
824 0           my $insert = 0;
825 0           for ( my $i = 0 ; $i < @prot_seq ; $i++ ) {
826              
827             # grab the hmm alignment position for each protein alignment position
828 0           my $pos = $ref->{aln_map}->[ $i ];
829 0 0         if ( $pos =~ /\d+/ ) {
830              
831             # if it maps to a position, first insert any gap from the hmm alignment
832 0           $aln_prot .= "." x ( $DIST{ $pos } - $insert );
833              
834             # then add the aa.
835 0           $aln_prot .= $prot_seq[ $i ];
836 0           $insert = 0;
837             }
838              
839             # if it is an insertion (ie the hmm alignment shows gap), insert a gap
840             else {
841 0           $aln_prot .= $prot_seq[ $i ];
842 0           $insert++;
843             }
844             }
845 0           $aln_prot =~ s/\-/\./g;
846 0           $ref->{aln_prot} = $aln_prot;
847             }
848              
849             # Now print out in selected format
850             # Stockholm format
851 0 0 0       if ( $instructions->{file_format} eq "mul" ) {
    0          
    0          
852 0           print "# STOCKHOLM 1.0\n";
853 0           foreach my $hit (
854             sort {
855             $data->{ 'hit' }->{ $b }
856             ->{ 'total_score' } <=> $data->{ 'hit' }->{ $a }
857 0           ->{ 'total_score' }
858             } keys %screened
859             )
860             {
861 0           my $domain = $screened{ $hit };
862 0           my $hit_ref = $data->{hit}->{ $hit };
863 0           my $domain_ref = $hit_ref->{domain}->{ $domain };
864              
865             # each line should look like:
866             # prot_acc/coord-coord sequence
867             printf "%-40s%s\n",
868             (
869             "$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}",
870             $domain_ref->{aln_prot}
871 0           );
872             }
873             }
874              
875             # FASTA format
876             elsif ($instructions->{file_format} eq "fasta"
877             || $instructions->{file_format} eq "fa" )
878             {
879 1     1   412 use TIGR::FASTA::Record;
  1         2  
  1         580  
880 0           foreach my $hit (
881             sort {
882             $data->{ 'hit' }->{ $b }
883             ->{ 'total_score' } <=> $data->{ 'hit' }->{ $a }
884 0           ->{ 'total_score' }
885             } keys %screened
886             )
887             {
888 0           my $domain = $screened{ $hit };
889 0           my $hit_ref = $data->{hit}->{ $hit };
890 0           my $domain_ref = $hit_ref->{domain}->{ $domain };
891 0           my $aln_prot = $domain_ref->{aln_prot};
892 0           $aln_prot =~ s/(.{60})/$1\n/g;
893 0           $aln_prot =~ s/\n+/\n/g;
894 0           chomp $aln_prot;
895 0           my $header =
896             ">$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}\n";
897 0           print $header . $aln_prot . "\n";
898             }
899             }
900              
901             # MSF format
902             elsif ( $instructions->{file_format} eq "msf" ) {
903 0           my $head_len = 40;
904 0           my ( %new_acc, %tmp_seq );
905 0           my $header_line = 0;
906 0           my @alignment;
907 0           foreach my $hit (
908             sort {
909             $data->{ 'hit' }->{ $b }
910             ->{ 'total_score' } <=> $data->{ 'hit' }->{ $a }
911 0           ->{ 'total_score' }
912             } keys %screened
913             )
914             {
915 0           my $domain = $screened{ $hit };
916 0           my $hit_ref = $data->{hit}->{ $hit };
917 0           my $domain_ref = $hit_ref->{domain}->{ $domain };
918 0           my $len = length( $domain_ref->{aln_prot} );
919 0 0         if ( $header_line == 0 ) {
920              
921             #added DUMMY checksum for HMMER 2.2g hmmbuild. DHH.
922 0           print
923             "PileUp\n\n MSF: $len Type: P Check: 1111 ..\n\n";
924 0           $header_line = 1;
925             }
926              
927             # print accession list at top
928 0           printf " Name: %-40s Len: $len Check: 0 Weight: 1.0\n",
929             "$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}";
930              
931             # prepare alignment for bottom
932 0           my @tmp_pep = split //, $domain_ref->{aln_prot};
933 0           for ( my $i = 0 ; $i < ( $len / 50 ) ; $i++ ) {
934 0           $alignment[ $i ] .= sprintf "%-40s",
935             "$hit_ref->{accession}/$domain_ref->{seq_f}-$domain_ref->{seq_t}";
936 0           for ( my $b = 0 ; $b < 5 ; $b++ ) {
937 0           for ( my $a = 0 ; $a < 10 ; $a++ ) {
938 0           $alignment[ $i ] .=
939             $tmp_pep[ $a + ( $b * 10 ) + ( 50 * $i ) ];
940             }
941 0           $alignment[ $i ] .= " ";
942             }
943 0           $alignment[ $i ] .= "\n";
944             }
945             }
946 0           print "\n//\n\n\n";
947 0           foreach my $block ( @alignment ) {
948 0           print "$block\n\n";
949             }
950             }
951             else {
952 0           croak
953             "Don't recognize alignment file format $instructions->{file_format}\n";
954             }
955 0           select STDOUT;
956             }
957              
958             sub get_cutoffs_for_hmm_accession {
959 0     0 0   my $dbh = shift;
960 0           my $accession = shift;
961 0           my $hmm_q =
962             "select trusted_cutoff, trusted_cutoff2, noise_cutoff from egad..hmm2 where hmm_acc = '$accession'";
963 0           return $dbh->selectrow_hashref( $hmm_q );
964             }
965              
966             1; # End of TIGR::HmmTools