File Coverage

blib/lib/String/Simrank.pm
Criterion Covered Total %
statement 24 375 6.4
branch 0 154 0.0
condition 0 17 0.0
subroutine 8 17 47.0
pod 3 3 100.0
total 35 566 6.1


line stmt bran cond sub pod time code
1             package String::Simrank;
2              
3             # Note to developers:
4             # enable the VERSIONs (here and in the "use Inline C ..."
5             # statement) to match each other when ready to install in sitelib or
6             # when ready to distribute. Otherwise, disable them
7             # with commenting symbol for testing.
8              
9             # use base 'Exporter';
10             # @EXPORT_OK = qw(add subtract);
11 1     1   31740 use strict;
  1         2  
  1         41  
12 1     1   5 use File::Basename;
  1         3  
  1         128  
13 1     1   1163 use IO::File;
  1         12744  
  1         166  
14 1     1   10 use Fcntl;
  1         3  
  1         499  
15 1     1   11025 use Data::Dumper;
  1         35031  
  1         90  
16 1     1   1139 use Storable;
  1         4409  
  1         90  
17 1     1   14 use vars qw($VERSION $NAME);
  1         2  
  1         82  
18              
19              
20             $VERSION = '0.079';
21             $NAME = 'String::Simrank';
22              
23             # Must specify a NAME and VERSION parameter. The NAME must match your module's package
24             # name. The VERSION parameter must match the module's $VERSION
25             # variable and they must both be of the form /^\d\.\d\d$/.
26             # DATA refers to the __DATA__ section near bottom of this file.
27 1     1   1223 use Inline C => 'DATA';
  1         26131  
  1         9  
28              
29             =head1 NAME
30              
31             String::Simrank - Rapid comparisons of many strings for k-mer similarity.
32             =cut
33              
34             =head1 SYNOPSIS
35              
36             use String::Simrank;
37             my $sr = new String::Simrank(
38             { data => 'db.fasta'});
39             $sr->formatdb();
40             $sr->match_oligos({ query => 'query.fasta' });
41              
42             You may utilize k-mers of various sizes, e.g.:
43             $sr->formatdb( { wordlen => 6 } );
44             =cut
45              
46             =head1 DESCRIPTION
47              
48             The String::Simrank module allows rapid searches for similarity between query
49             strings and a database of strings. This module is maintained by molecular
50             biologists who use it for searching for similarities among strings representing
51             contiguous DNA or RNA sequences. This program does not
52             construct an alignment but rather finds the ratio
53             of nmers (A.K.A. ngrams) that are shared between a query and database
54             records.
55             The input file should be fasta formatted (either aligned or unaligned) multiple
56             sequence file.
57             The memory consumption is moderate and grows linearly with the number of sequences
58             and depends on the nmer size defined by the user. Using 7-mers, ~20,000 strings
59             each ~1,500 characters in length requires ~50 Mb.
60              
61             The module can be used from the command line through the script
62             C provided in the examples folder.
63              
64             By default the output is written to STDOUT and
65             represents the similarity of each query string to the top hits in the
66             the database. The format is query_id, tab, best match database_id, colon, percent similarity, space
67             second best match database_id, colon, percent similarity.
68             query1 EscCol36:100.00 EscCol36:100.00 EscCol43:99.59 EscCol29:99.24 EscCol33:99.17 EscCol10:99.02
69              
70             =cut
71              
72             =head1 METHODS
73              
74             =cut
75              
76             =head2 new( { data_path => $path })
77            
78             my $sr = new String::Simrank( { data => 'many_seqs.fasta' });
79              
80             Instantiates a new String::Simrank object.
81             Each simrank database should live within its own String::Simrank object.
82             In other words, each unique combination of a fasta database and n-mer
83             length will have its own instance.
84             If the database has already been formated then the wordlength will be
85             read from the formatted database's .bin file.
86             Parameters:
87              
88             =over 4
89              
90             =item data
91              
92             Sets the path of where the data fasta file is (Database sequences, fasta format).
93             The same terminal directory is where the formated database will reside with the same
94             base name but the .fasta will be substituted with .bin.
95             Sequence names(identifiers) will be first 10 characters matched between the '>' and the first space character.
96             Be sure the sequences names are unique within this string.
97              
98             =back
99              
100             =cut
101              
102             sub new {
103             # Todd Z. DeSantis, March 2008
104            
105 0     0 1   my ($class, $cl_args) = @_;
106             # Parameter hr is called $cl_args since in
107             # many instances these will come directly from the command line.
108 0           my $self = bless {}, $class;
109 0           $self->_check_arguments_for_new($cl_args);
110             # print STDERR "you are using String::Simrank version $VERSION \n";
111 0           return $self;
112             }
113              
114             =head2 _check_arguments_for_new
115              
116             Internally used function.
117             Validates the 'data' argument needed so this instance knows what data
118             file will be linked. Method checks that fasta file exists and is readable
119             and creates the name which corresponds to the .bin file.
120             If errors are found, program exits.
121             Returns a hash or hash ref
122             depending on the context of the method call.
123              
124             =cut
125              
126              
127             sub _check_arguments_for_new {
128             # Niels Larsen, May 2004.
129             # Todd Z. DeSantis, March 2008.
130            
131 0     0     my ($self,
132             $cl_args, # Command Line ARGument hash
133             # not really required to come from the command line
134             ) = @_;
135              
136 0           my ( @errors, $error);
137              
138 0 0         if ( $cl_args->{"data"} ) {
139             # $cl_args->{"data"} = &Common::File::full_file_path( $cl_args->{"data"} );
140              
141 0 0         if ( -r $cl_args->{"data"}) {
142 0           $cl_args->{"binary"} = $cl_args->{"data"};
143 0           $cl_args->{"binary"} =~ s/\.[^\.]+$//;
144 0           $cl_args->{"binary"} .= ".bin";
145            
146 0           $self->{param}{data} = $cl_args->{"data"};
147 0           $self->{param}{binary} = $cl_args->{"binary"};
148 0 0 0       if (-e $cl_args->{"binary"} && -r $cl_args->{"binary"}) {
149 0           print $cl_args->{"binary"} . " has been formatted\n";
150 0           $self->{binary_ready} = 1;
151             } else {
152 0           $self->{binary_ready} = 0;
153             }
154            
155             } else {
156 0           push @errors, qq (Simrank.pm: Input fasta data file not found or not readable -> "$cl_args->{'data'}");
157             }
158            
159             } else {
160 0           push @errors, qq (Fasta sequence file must be specified with *data* argument);
161             }
162            
163             # Print errors if any and exit,
164              
165 0 0         if ( @errors )
166             {
167 0           foreach $error ( @errors )
168             {
169 0           print STDERR "$error\n";
170             # &Common::Messages::echo_red( "ERROR: ");
171             # &Common::Messages::echo( "$error\n" );
172             }
173            
174 0           exit;
175             }
176             else {
177            
178 0 0         wantarray ? return %{ $cl_args } : return $cl_args;
  0            
179             }
180              
181             }
182              
183             =head2 formatdb({ wordlen => $int, minlen => $int, silent = $bool })
184              
185             $sr->formatdb({wordlen => 8, minlen => 500, silent => 0, valid_chars => 'ACGT'});
186              
187             From an input collection of strings (data), generates a binary file
188             optimized for retrieval of k-mer similarities. The file will contain
189             a pre-computed map of which sequences - given by their index
190             number in the input data file - contain a given k-mer. All the
191             valid k-mers of the given length are mapped. This means each
192             k-mer from a query string can be used to look up the database strings
193             it occurs in, very quickly. The memory consumption is moderate
194             and grows linearly with the number of sequences; 20,000 takes
195             50-55 mb.
196             The first characters between the '>' character and the first space
197             is recognized as the string (sequence) identifier and should be unique for each record.
198             The number of sequences found in the input collection
199             is returned and the file xyz.bin is written to disk.
200              
201             Parameters:
202              
203             =over 4
204              
205             =item pre_subst
206              
207             An integer that determines how to interpret some special characters in the strings.
208             The substitution is done before k-mers are extracted.
209             Use 1 to eliminate all periods(.), hypens(-), and space characters(\s, which includes
210             \s,\t,\n). This is the default behavior.
211             Use 2 to convert same characters as above into underscores(_).
212              
213             =item wordlen
214              
215             An integer that specifies the k-mer length to index for each record in the input data file.
216             Default is 7.
217              
218             =item minlen
219              
220             Specifies the minimum string length for a database sequence to be worthy of indexing.
221             Default is 50.
222              
223             =item valid_chars
224              
225             Specifies the characters to allow for formatting. Entire k-mer must be composed
226             of these characters otherwise it will be ignored. When not defined, all
227             characters are valid.
228             Default is undef (indicating all characters are valid)
229              
230             =item silent
231              
232             Specifies if caller wants progress messages printed to STDERR while formatting.
233             Default is false, meaning not silent.
234              
235             =back
236              
237             =cut
238              
239              
240             sub formatdb {
241             # Niels Larsen, November 2005.
242             # Todd Z. DeSantis, July 2010
243              
244 0     0 1   my ( $self, $cl_args) = @_;
245 0           $self->_check_arguments_for_formatdb($cl_args);
246              
247             # Returns an integer.
248              
249 0           my ( $data_fh, $bin_fh, $entry, @seq_oligos, $oligos, $oli2seqoffs,
250             $seqnum, $oligo, @sids, $sids, $bytstr, $begpos,
251             $seqtot, $bytlen, $olibeg,
252             $oli_counts, $id, $seq, $count, $all_begpos, $all_bytlen,
253             $valid_chars, $lastnums, $lastnum, @seqoffs, $off, @seqnums,
254             $id_len);
255             ## I think I can exlude $lastnums, never written to disk.
256              
257 0           my $cl_wordlen = $self->{param}{wordlen};
258 0           my $cl_silent = $self->{param}{silent};
259 0           my $cl_binary = $self->{param}{binary};
260 0           my $cl_data = $self->{param}{data};
261 0 0         print "cl_data: $cl_data\n" unless ($cl_silent);
262 0 0         print "cl_binary: $cl_binary\n" unless ($cl_silent);
263            
264            
265             # Find the number of sequences and the longest length identifier,
266 0 0         print "Counting new fasta records and measuring size of identifiers... " if not $cl_silent;
267 0           $seqtot = _count_records($cl_data);
268 0 0         print "found $seqtot\n" if not $cl_silent;
269            
270 0           $seqnum = 0;
271 0           $oli_counts = "";
272 0           $valid_chars = $self->{param}{valid_chars};
273 0           $id_len = 0; # init
274              
275 0           $data_fh = new IO::File $self->{param}{data}, "r";
276            
277             {
278 0           local $/ = '>';
  0            
279 0 0         print "Mapping sub-sequences ... \n" unless ($cl_silent);
280 0           while ( <$data_fh> ) {
281 0 0         next if ($_ eq '>'); # toss the first empty record
282 0           chomp $_;
283            
284 0           my ($header, @seq_lines) = split /\n/, $_;
285 0           my ($id) = split /\s/, $header;
286 0           my $sequence = '';
287 0 0         if ($self->{param}{pre_subst} == 1) {
    0          
288 0           $sequence = join '', @seq_lines;
289 0           $sequence =~ s/[\.\-\s]//g;
290             } elsif ($self->{param}{pre_subst} == 2) {
291 0           $sequence = join '_', @seq_lines;
292 0           $sequence =~ s/[\.\-\s]/_/g;
293             }
294 0 0 0       next unless ($id && $sequence);
295            
296 0           push @sids, $id;
297             # format to a uniform character length AFTER longest is found
298 0 0         $id_len = length($id) if (length($id) > $id_len);
299              
300 0           @seq_oligos = _create_oligos( $sequence, $cl_wordlen, $valid_chars);
301            
302 0           foreach $oligo ( @seq_oligos ) {
303 0 0         if ( exists $oli2seqoffs->{ $oligo } ) {
304 0           $lastnum = $lastnums->{ $oligo }; ## I think I can exlude $lastnums, never written to disk.
305 0           $oli2seqoffs->{ $oligo } .= ",". ($seqnum - $lastnum);
306             # Keep track of the hops needed to find this oligo in the seqnums
307             } else {
308 0           $oli2seqoffs->{ $oligo } = $seqnum;
309             }
310              
311 0           $lastnums->{ $oligo } = $seqnum;
312             # keep track of the last $seqnum in which this $oligo was found.
313             ## I think I can exlude $lastnums, never written to disk.
314              
315             }
316              
317 0           $oli_counts .= ",". ( scalar @seq_oligos );
318             # string-encoded ordered list of count of unique oligos in each seqnum
319              
320 0           $seqnum++;
321 0 0 0       if ( !$cl_silent && $seqnum % 5000 == 0 ) {
322 0           print "$seqnum done of $seqtot ," .
323 0           scalar(keys %{ $oli2seqoffs }) . " unique kmers found ...\n";
324             # this message is accurate since we just incremented
325             # $seqnum. So if we just finshed index 4 (which is the
326             # fifth sequence, then $seqnum would have a value of 5"
327             }
328             }
329             }
330            
331 0           $seqtot = $seqnum;
332             # this is true since $seqnum was incremented after last sequence
333              
334 0 0         if ( not $cl_silent )
335             {
336 0           print "Total sequence count: $seqnum of $seqtot ," .
337 0           scalar(keys %{ $oli2seqoffs }) . " unique kmers found.\n";
338             }
339 0           $data_fh->close;
340            
341            
342 0           $self->{db_string_count} = $seqnum;
343 0           $self->{unique_kmer_count} = scalar(keys %{ $oli2seqoffs });
  0            
344              
345             # Write binary file. We write binary representations using syswrite
346             # and save the word length and number of sequences (the rest can be
347             # calculated)
348              
349 0           eval { $bin_fh = IO::File->new( $cl_binary, 'w' ) };
  0            
350 0 0         if ($@) {
351 0           print STDERR $@;
352             # such as: your vendor has not defined Fcntl macro O_LARGEFILE
353 0 0         if ( not $bin_fh = new IO::File $cl_binary, "w") {
354            
355 0           print STDERR "Could not syswrite-open file $cl_binary\n";
356 0           exit;
357             }
358             }
359             # print STDERR Dumper($bin_fh);
360 0 0         unless (defined $bin_fh) {
361 0           print STDERR "A filehandle to $cl_binary has not been defined\n";
362 0           exit;
363             }
364              
365 0 0         if ( not $cl_silent ) {
366 0           print "Total number of unique oligos: " . scalar(keys %{ $oli2seqoffs } ) . "\n";
  0            
367 0           print "Writing oligo map to file ...\n";
368             }
369              
370             # Id length, word_length, and total number of sequences,
371              
372 0           $bytstr = pack "A10", $id_len; # the max_length is encoded as text
373 0           syswrite $bin_fh, $bytstr;
374              
375 0           $bytstr = pack "A10", $cl_wordlen;
376 0           syswrite $bin_fh, $bytstr;
377              
378 0           $bytstr = pack "A10", $seqtot;
379 0           syswrite $bin_fh, $bytstr;
380              
381             # Short ids as a string of fixed-length character words,
382 0           $bytstr = pack "A$id_len" x $seqtot, @sids;
383 0           syswrite $bin_fh, $bytstr;
384              
385             # Write byte strings of matching sequence numbers, while keeping
386             # track of their byte position offsets,
387 0           $begpos = sysseek $bin_fh, 0, 1; # recommended "systell", perl book says
388 0 0         $begpos = 0 if ($begpos =~ /^0/); # sometimes $begpos gets set from sysseek as "0 but true"
389             # I think this has something to do with not using the Fcntl
390             # flags: O_WRONLY|O_CREAT|O_EXCL|O_LARGEFILE
391             # since this behavoir emerged when I stopped using them.
392             # BUT, I expect sysseek to return a positive int
393             # since many bytes have been already written to this filehandle.
394            
395 0           @seq_oligos = ();
396              
397 0           $all_begpos = "";
398 0           $all_bytlen = "";
399              
400 0           $count = 0;
401            
402 0           foreach $oligo ( sort(keys %{ $oli2seqoffs }) ) # "sort" added by tzd Apr-11-2008
  0            
403             {
404            
405 0           @seqoffs = eval $oli2seqoffs->{ $oligo };
406             # print STDERR '@seqoffs:' . join(',', @seqoffs) . "\n";
407              
408 0           @seqnums = shift @seqoffs;
409            
410            
411 0           foreach $off ( @seqoffs )
412             {
413 0           push @seqnums, $seqnums[-1] + $off;
414             # -1 looks-up the value of the last element in the @seqnums
415            
416             }
417             # print STDERR '@seqnums:' . join(',', @seqnums) . "\n";
418            
419 0           $bytstr = pack "I*", @seqnums; # unsigned integers
420             # the rest of this module
421             # assumes that integers will
422             # be 4-byte which is a fairly
423             # safe assumption
424 0           syswrite $bin_fh, $bytstr;
425            
426 0           $bytlen = length $bytstr;
427              
428 0           $all_begpos .= ",$begpos";
429 0           $all_bytlen .= ",$bytlen";
430            
431 0           $begpos += $bytlen;
432              
433 0           delete $oli2seqoffs->{ $oligo };
434              
435 0           push @seq_oligos, $oligo;
436              
437 0           $count += 1;
438             }
439              
440 0           $oli2seqoffs = {};
441              
442 0           $olibeg = $begpos;
443              
444             # Write the oligos found,
445              
446 0           $bytstr = pack "A$cl_wordlen" x ( scalar @seq_oligos ), @seq_oligos;
447 0           syswrite $bin_fh, $bytstr;
448              
449             # The begin positions of the corresponding run of sequence indices,
450             # and their lengths,
451            
452 0           $all_begpos =~ s/^,//; # get rid of initial comma
453 0           $bytstr = pack "I*", eval $all_begpos;
454 0           syswrite $bin_fh, $bytstr;
455              
456 0           $all_bytlen =~ s/^,//;
457 0           $bytstr = pack "I*", eval $all_bytlen;
458 0           syswrite $bin_fh, $bytstr;
459              
460             # Store the number of unique oligos for every sequence,
461              
462 0           $oli_counts =~ s/^,//;
463 0           $bytstr = pack "I*", eval $oli_counts;
464 0           syswrite $bin_fh, $bytstr;
465            
466             # Finally, we need to store the number of oligos encountered and the
467             # byte position of where they were stored,
468              
469 0           $bytstr = pack "A10", scalar @seq_oligos;
470 0           syswrite $bin_fh, $bytstr;
471            
472 0           $bytstr = pack "A10", $olibeg;
473 0           syswrite $bin_fh, $bytstr;
474            
475 0           $bin_fh->close;
476              
477            
478 0 0         print "format binary file complete\n" unless ($cl_silent);
479 0           return $seqnum;
480             }
481              
482             =head2 _check_arguments_for_formatdb
483              
484             It checks that word length and minimum sequence length is reasonable.
485             Sets silent to true by default.
486             Sets pre_subst to 1 by default.
487             Defines binary file name.
488             If errors are found, program exits.
489             Returns a hash or hash ref
490             depending on the context of the method call.
491              
492             =cut
493              
494              
495             sub _check_arguments_for_formatdb {
496             # Niels Larsen, May 2004.
497             # Todd Z. DeSantis, March 2008.
498 0     0     my ( $self,
499             $cl_args, # Command Line ARGument hash
500             # not really reqired to come from the command line
501             ) = @_;
502            
503 0           my ( @errors, $error);
504            
505             ## see if caller wants to constrain the alphabet
506 0 0         if ( exists $cl_args->{valid_chars} ) {
507 0           $self->{param}{valid_chars} = $cl_args->{valid_chars};
508             } else {
509 0           $self->{param}{valid_chars} = undef;
510             }
511              
512             ## see if caller wants to make some pre-substitutions
513 0 0         if ( exists $cl_args->{pre_subst} ) {
514 0           $self->{param}{pre_subst} = $cl_args->{pre_subst};
515             } else {
516 0           $self->{param}{pre_subst} = 1;
517             }
518              
519             # Ensure mininum sequence length is 1 or more and fill in 50
520             # if user didnt specify it
521 0 0         if ( $cl_args->{"minlen"} )
522             {
523 0 0         if ( $cl_args->{"minlen"} < 1 ) {
524 0           push @errors, qq (Minimum sequence length should be 1 or more);
525             }
526             }
527             else {
528 0           $cl_args->{"minlen"} = 50;
529             }
530 0           $self->{param}{minlen} = $cl_args->{"minlen"};
531              
532             # Ensure word length is over 0 and less than
533             # or equal to minlen and fill in 7 if the
534             # user didnt specify it,
535 0 0         if ( $cl_args->{"wordlen"} ) {
536 0 0         if ( $cl_args->{"wordlen"} < 1 ) {
537 0           push @errors, qq (Word length should be at least 1);
538             }
539 0 0         if ( $cl_args->{"wordlen"} > $cl_args->{"minlen"} ) {
540 0           push @errors, qq (Word length no greater than the minlen);
541             }
542             } else {
543 0           $cl_args->{"wordlen"} = 7;
544             }
545 0           $self->{param}{wordlen} = $cl_args->{"wordlen"};
546              
547 0 0         if ( not defined $cl_args->{"silent"} ) {
548 0           $cl_args->{"silent"} = 0
549             }
550 0           $self->{param}{silent} = $cl_args->{"silent"};
551            
552             # Print errors if any and exit,
553 0 0         if ( @errors )
554             {
555 0           foreach $error ( @errors )
556             {
557 0           print STDERR "$error\n";
558             }
559            
560 0           exit;
561             }
562             else {
563 0 0         wantarray ? return %{ $cl_args } : return $cl_args;
  0            
564             }
565              
566             }
567              
568             =head2 match_oligos({query => $path})
569              
570             $sr->match_oligos({query => 'query.fasta'});
571             $sr->match_oligos({ query => 'query.fasta',
572             outlen => 10,
573             minpct => 95,
574             reverse => 1,
575             outfile => '/home/donny/sr_results.txt',
576             });
577            
578             my $matches = $sr->match_oligos({query => 'query.fasta'});
579             print Dumper($matches);
580             foreach my $k (keys %{$matches} ) {
581             print "matches for $k :\n";
582             foreach my $hit ( @{ $matches->{$k} } ) {
583             print "hit id:" . $hit->[0] . " perc:" . $hit->[1] . "\n";
584             }
585             }
586            
587             This routine quickly estimates the overall similarity between
588             a given set of DNA or RNA sequence(s) and a background set
589             of database sequences (usually homologues).
590             It returns a sorted list of similarities as a
591             table. The similarity between sequences A and B are the number
592             of unique k-words (short subsequence) that they share, divided
593             by the smallest total unique k-word count in either A or B. The result
594             are scores that do not depend on sequence lengths. When
595             called in void context, the routine prints to the given output
596             file or STDOUT; otherwise a hash_ref is returned.
597              
598             Parameters:
599              
600             =over 4
601              
602             =item query
603              
604             Sets the path where the query fasta file will be found.
605             Required. In future versions, query could be a data structure instead.
606             Need to build an abstract iterator for this feature to be enabled.
607              
608             =item minpct
609              
610             A real number indicating the the minimum percent match that should
611             be output/returned.
612             Default = 50.
613              
614             =item outlen
615              
616             An integer indicating the maximum number of ranked db matches that
617             should be output/returned.
618             Default = 100.
619              
620             =item valid_chars
621              
622             Specifies the characters to allow for kmer-searching. Entire n-mer must be composed
623             of these characters otherwise it will be ignored. When not defined, all
624             characters are valid.
625             Default is undef (indicating all characters are valid)
626              
627             =item reverse
628              
629             A boolean value. If true, reverses input sequence.
630             Default = false.
631              
632             =item noids
633              
634             A boolean value. If true, prints database index numbers instead of sequence ids.
635             Default = false.
636              
637             =item outfile
638              
639             Sets the path of the output file (instead of sending to STDOUT).
640             Default = false. Meaning output is sent to STDOUT.
641              
642             =item silent
643              
644             Specifies if caller wants progress messages printed to STDERR while matching.
645             Default is false, meaning not silent.
646              
647             =back
648              
649             =cut
650              
651              
652             sub match_oligos {
653             # Niels Larsen, May 2004.
654             # Todd Z. DeSantis, March 2008.
655              
656 0     0 1   my ( $self,
657             $cl_args, # Arguments hash
658             ) = @_;
659 0           $self->_check_arguments_for_match_oligos($cl_args);
660             # Returns a list of matches if not called in void context.
661              
662 0           my ( $q_seqs, $count, $silent, @sids, $wordlen, $seqnum, $bin_fh,
663             $query_fh, $bytstr, $olinum, $length, @oligos, $oligo, @scores,
664             $oli2seqnums, $vector, @begpos, @endpos, $oli2pos, $i, @seqnums,
665             $j, $seqtot, $score, @lengths, @temp, $valid_chars, $minscore,
666             $olibeg, $olitot, $begpos, $endpos, $pack_bits, @scovec, $scovec,
667             $query_sid, $out_fh, $outdir, $outline, $matches, @oli_counts,
668             $oli_count, $scovec_null, $index, $outlen, $pos, $id, $seq,
669             $id_len);
670              
671 0           my $void_context = 1;
672             # Recall 'wantarray' returns
673             # True if the context is looking for a list value,
674             # False if the context is looking for a scalar,
675             # Undef if the context is looking for no value (void context).
676 0 0         $void_context = 0 if (defined wantarray); # caller wants data back
677              
678 0           $silent = $self->{param}{silent};
679              
680             # >>>>>>>>>>>>>>>>>>>>>> FILE MAP SECTION <<<<<<<<<<<<<<<<<<<<<<<<
681              
682             # This section creates $oli2pos, where each key is a subsequence
683             # and values are [ file byte position, byte length to read ]. This
684             # map is used below to sum up similarities.
685              
686 0           eval { $bin_fh = IO::File->new( $self->{param}{binary}, O_RDONLY|O_LARGEFILE ) };
  0            
687 0 0         if ($@) {
688             # such as: your vendor has not defined Fcntl macro O_LARGEFILE
689 0 0         if ( not $bin_fh = new IO::File $self->{param}{binary}, "r") {
690            
691 0           print STDERR "Could not open file $self->{param}{binary} for reading\n";
692 0           exit;
693             }
694             }
695              
696             # Word length and total number of sequences,
697             # reminder: sysread FILEHANDLE,SCALAR,LENGTH
698             # Attempts to read LENGTH bytes of data into variable SCALAR
699             # from the specified FILEHANDLE,
700 0           sysread $bin_fh, $bytstr, 10;
701 0           $id_len = $bytstr * 1;
702            
703 0           sysread $bin_fh, $bytstr, 10;
704 0           $wordlen = $bytstr * 1;
705              
706 0           sysread $bin_fh, $bytstr, 10;
707 0           $seqtot = $bytstr * 1;
708              
709             # String ids as a string of fixed-length character words,
710 0           sysread $bin_fh, $bytstr, $seqtot * $id_len;
711 0           @sids = unpack "A$id_len" x $seqtot, $bytstr;
712            
713             # Get the oligos and the begin and end positions of where the sequence
714             # indices start and their lengths,
715              
716 0           sysseek $bin_fh, -10, 2; # goto 10 bytes before EOF
717 0           sysread $bin_fh, $bytstr, 10;
718 0           $olibeg = $bytstr * 1;
719            
720 0           sysseek $bin_fh, -20, 2; # goto 20 bytes before EOF
721 0           sysread $bin_fh, $bytstr, 10;
722 0           $olitot = $bytstr * 1;
723              
724 0           sysseek $bin_fh, $olibeg, 0;
725 0           sysread $bin_fh, $bytstr, $olitot * $wordlen;
726 0           @oligos = unpack "A$wordlen" x $olitot, $bytstr;
727            
728 0           sysread $bin_fh, $bytstr, 4 * $olitot;
729 0           @begpos = unpack "I*", $bytstr;
730              
731 0           sysread $bin_fh, $bytstr, 4 * $olitot;
732 0           @lengths = unpack "I*", $bytstr;
733              
734 0           sysread $bin_fh, $bytstr, 4 * $seqtot;
735 0           @oli_counts = unpack "I*", $bytstr;
736              
737             # Create a hash that returns the file positions of where to get the
738             # matching sequence indices,
739              
740 0           for ( $i = 0; $i < scalar @oligos; $i++ ) {
741 0           $oli2pos->{ $oligos[$i] } = [ $begpos[$i], $lengths[$i] ];
742             }
743              
744             # >>>>>>>>>>>>>>>>>>>> PROCESS QUERY ENTRIES <<<<<<<<<<<<<<<<<<<<<<<
745              
746 0 0 0       if ( $void_context == 1 && $self->{param}{outfile} ) {
747             # call was made in void context
748 0           $out_fh = new IO::File $self->{param}{outfile}, "w";
749 0 0         print "Outfile opened.\n" unless ($self->{param}{silent});
750             }
751              
752 0           $query_fh = new IO::File $self->{param}{"query"}, "r";
753 0           $scovec_null = pack "I*", (0) x $seqtot;
754            
755 0   0       $minscore = ( $self->{param}{"minpct"} || 0 ) / 100;
756              
757             {
758 0           local $/ = '>';
  0            
759 0           while ( <$query_fh>) {
760 0 0         next if ($_ eq '>'); # toss the first empty record
761 0           chomp $_;
762 0           my ($header, @seq_lines) = split /\n/, $_;
763 0           my ($query_sid) = split /\s/, $header;
764 0           my $sequence = '';
765 0 0         if ($self->{param}{pre_subst} == 1) {
    0          
766 0           $sequence = join '', @seq_lines;
767 0           $sequence =~ s/[\.\-\s]//g;
768             } elsif ($self->{param}{pre_subst} == 2) {
769 0           $sequence = join '_', @seq_lines;
770 0           $sequence =~ s/[\.\-\s]/_/g;
771             }
772            
773 0 0 0       next unless ($query_sid && $sequence);
774            
775 0 0         if ( $self->{param}{"reverse"} ) {
776 0           $sequence = reverse $sequence;
777             }
778              
779 0 0         print "Processing $query_sid ... \n" unless ( $silent );
780              
781 0           $scovec = $scovec_null;
782              
783             # >>>>>>>>>>>>>>>>>>>>>> COUNTING SECTION <<<<<<<<<<<<<<<<<<<<<<<<
784              
785             # Look up the sequences that contain each of the oligos found
786             # in the sequence. The sort makes the disk jump around less,
787             # since the oligos were written in sorted order to the bin file
788            
789 0           @oligos = _create_oligos( $sequence, $wordlen, $self->{param}{valid_chars} );
790              
791 0           foreach $oligo ( sort @oligos ) { # "sort" added by tzd Apr-11-2008
792            
793 0 0         if ( defined ( $pos = $oli2pos->{ $oligo }->[0] ) ) {
794 0           sysseek $bin_fh, $pos, 0;
795            
796 0           $length = $oli2pos->{ $oligo }->[1];
797 0           sysread $bin_fh, $bytstr, $length;
798              
799             # $bytstr is a perl string. Every 4-byte block
800             # in this string encodes a 4 byte integer
801             # so >4 billion integers are available.
802             # but when passed to C
803             # it will be an integer array.
804              
805             # This is a C function for speed, the code is at the end of this
806             # file. The third argument is the maximum index of $bytstr array,
807            
808             ####### testing only
809             # print STDERR join (',', (unpack "I*", $scovec)) . "\n";
810             # print STDERR join (',', (unpack "I*", $bytstr)) . "\n";
811             #######
812              
813 0           &update_scores_C( \$bytstr, \$scovec, ( $length - 1 ) / 4 );
814            
815             ####### testing only
816             # print STDERR '$query_sid:' . $query_sid .
817             # ' $oligo:' . $oligo . "\n" .
818             # Dumper(unpack "I*", $scovec) . "\n";
819             #######
820             }
821             }
822            
823 0           @scovec = unpack "I*", $scovec;
824              
825             # >>>>>>>>>>>>>>>>>>>>>> EXTRACT OUTPUT DATA <<<<<<<<<<<<<<<<<<<<<<
826              
827             # The score vector, @scovec, now contains integer scores in some
828             # slots, zeros in others. To get an ordered list of just the positive
829             # scores we could grep and sort, but that would be slow because the
830             # list is long. Instead, keep a list of scores that is only as long
831             # as the output length; insert values that are higher than the last
832             # element and ignore the rest,
833              
834 0           $oli_count = scalar @oligos;
835 0           $outlen = $self->{param}{"outlen"};
836              
837 0           @scores = ();
838              
839 0           for ( $i = 0; $i < scalar @scovec; $i++ )
840             {
841 0 0         if ( $scovec[$i] ) {
842            
843             #### use for testing
844             # print STDERR '$oli_count:' . $oli_count . ' $oli_counts[$i]:' . $oli_counts[$i] . ' $scovec[$i]:' . $scovec[$i] . "\n";
845             #####
846            
847 0 0         if ( $oli_count < $oli_counts[$i] ) {
848 0           $score = $scovec[$i] / $oli_count;
849             } else {
850 0           $score = $scovec[$i] / $oli_counts[$i];
851             }
852              
853 0 0         if ( $score >= $minscore )
854             {
855 0 0         if ( scalar @scores < $outlen )
    0          
856             {
857 0           $index = _nearest_index( \@scores, $score );
858 0           splice @scores, $index, 0, [ $i, $score ];
859             }
860             elsif ( $score > $scores[0]->[1] )
861             {
862 0           $index = _nearest_index( \@scores, $score );
863 0           splice @scores, $index, 0, [ $i, $score ];
864 0           shift @scores;
865             }
866             }
867             }
868             }
869              
870 0 0         if ( $cl_args->{"noids"} ) {
871 0           foreach $score ( @scores ) {
872 0           $score->[1] = sprintf "%.2f", 100 * $score->[1];
873             }
874             } else {
875 0           foreach $score ( @scores ) {
876 0           $score->[0] = $sids[ $score->[0] ];
877 0           $score->[1] = sprintf "%.2f", 100 * $score->[1];
878             # Can we add a percision parameter here ?
879             # may be able to save disk space in the output file by getting
880             # just the 1/10 place for instance
881             }
882             }
883              
884 0           @scores = reverse @scores;
885              
886             # >>>>>>>>>>>>>>>>>>>>>> OUTPUT SECTION <<<<<<<<<<<<<<<<<<<<<<<<<
887              
888             # If called in non-void context, generate a data structure. If
889             # in void context, print tabular data to stdout or an output file
890             # if specified,
891              
892 0 0         if ( $void_context == 0 ) {
893             # means caller wants a scalar back
894             # which will be a hash ref
895 0           $matches->{ $query_sid } = &Storable::dclone( \@scores );
896             } else {
897 0           $outline = "$query_sid\t" . join "\t", map { $_->[0] .":". $_->[1] } @scores;
  0            
898            
899 0 0         if ( defined $out_fh ) {
900 0           print $out_fh "$outline\n";
901             } else {
902 0           print "$outline\n";
903             }
904             }
905              
906 0 0         unless ( $silent ) {
907 0           print "$query_sid done\n";
908             # &Common::Messages::echo_green( qq (done\n) );
909             }
910             } # end while query_fh
911             } # end local block
912 0           $query_fh->close;
913 0           $bin_fh->close;
914            
915             # Return data structure if non-void context, otherwise nothing,
916 0 0         if ( $void_context == 0 ) {
917             # means caller wants a scalar back
918 0           return $matches;
919             } else {
920 0           return;
921             }
922             }
923              
924             =head2 _check_arguments_for_match_oligos
925              
926             Internally used function.
927             Validates the arguments for outlen, minpct, reverse,
928             query, noids, silent, pre_subst and output.
929             If errors are found, program exits.
930             Returns a hash or hash ref
931             depending on the context of the method call.
932              
933             =cut
934              
935             sub _check_arguments_for_match_oligos {
936             # Niels Larsen, May 2004.
937             # Todd Z. DeSantis, March 2008.
938 0     0     my ( $self,
939             $cl_args, # Command Line ARGument hash
940             # not really reqired to come from the command line
941             ) = @_;
942            
943 0           my ( @errors, $error, $outdir );
944            
945             ## see if caller wants to constrain the alphabet
946 0 0         if ( exists $cl_args->{valid_chars} ) {
947 0           $self->{param}{valid_chars} = $cl_args->{valid_chars};
948             } else {
949 0           $self->{param}{valid_chars} = undef;
950             }
951              
952             ## see if caller wants to make some pre-substitutions
953 0 0         if ( exists $cl_args->{pre_subst} ) {
954 0           $self->{param}{pre_subst} = $cl_args->{pre_subst};
955             } else {
956 0           $self->{param}{pre_subst} = 1;
957             }
958              
959             # Ensure output list has at least one similarity in it and set
960             # it to 100 if the user didnt specify it,
961            
962 0 0         if ( $cl_args->{"outlen"} )
963             {
964 0 0         if ( $cl_args->{"outlen"} < 1 ) {
965 0           push @errors, qq (Output list length should be 1 or more);
966             }
967             }
968             else {
969 0           $cl_args->{"outlen"} = 100;
970             }
971 0           $self->{param}{outlen} = $cl_args->{"outlen"};
972            
973 0 0         if ($cl_args->{"minpct"} ) {
974 0           $self->{param}{minpct} = $cl_args->{"minpct"};
975             } else {
976 0           $self->{param}{minpct} = 0;
977             }
978            
979 0 0         if ($cl_args->{reverse}) {
980 0           $self->{param}{"reverse"} = $cl_args->{reverse};
981             } else {
982 0           $self->{param}{"reverse"} = 0;
983             }
984             # The mandatory query:
985             # must be a file path
986             # in the future, allow a hash ref of the structure
987             # $hr->{$id} = $sequence_string
988            
989             # if file given,
990             # check that its readable. If not given, error,
991 0 0         if ( $cl_args->{"query"} ) {
992             # $cl_args->{"query"} = &Common::File::full_file_path( $cl_args->{"query"} );
993            
994 0 0         if ( not -r $cl_args->{"query"} ) {
995 0           push @errors, qq (Query file not found or not readable -> "$cl_args->{'query'}");
996             }
997             }
998             else {
999 0           push @errors, qq (Query sequence file must be specified);
1000             }
1001 0           $self->{param}{query} = $cl_args->{"query"};
1002            
1003             # If output file given, check if it exists and if its directory
1004             # does not exist
1005 0 0         if ( $cl_args->{"outfile"} ) {
1006            
1007 0 0         if ( -e $cl_args->{"outfile"} ) {
1008 0           push @errors, qq (Output file exists -> "$cl_args->{'outfile'}");
1009             }
1010              
1011 0           $outdir = File::Basename::dirname( $cl_args->{"outfile"} );
1012              
1013 0 0         if ( not -d $outdir ) {
    0          
1014 0           push @errors, qq (Output directory does not exist -> "$outdir");
1015             } elsif ( not -w $outdir ) {
1016 0           push @errors, qq (Output directory is not writable -> "$outdir");
1017             }
1018             } else {
1019 0           $cl_args->{"outfile"} = "";
1020             }
1021 0           $self->{param}{outfile} = $cl_args->{"outfile"};
1022             # print STDERR 'outfile:' . $self->{param}{outfile} . "\n";
1023              
1024 0 0         if ( defined $cl_args->{"silent"} ) {
1025 0           $self->{param}{silent} = $cl_args->{"silent"};
1026             }
1027 0 0         if ( not defined $self->{param}{silent}) {
1028 0           $self->{param}{silent} = 1;
1029             }
1030            
1031             # Print errors if any and exit,
1032              
1033 0 0         if ( @errors )
1034             {
1035 0           foreach $error ( @errors )
1036             {
1037 0           print STDERR "$error\n";
1038             # &Common::Messages::echo_red( "ERROR: ");
1039             # &Common::Messages::echo( "$error\n" );
1040             }
1041            
1042 0           exit;
1043             }
1044             else {
1045 0 0         wantarray ? return %{ $cl_args } : return $cl_args;
  0            
1046             }
1047              
1048             }
1049              
1050             =head2 _create_oligos
1051              
1052             Internal method.
1053             Creates a list of unique words of a given length from a given sequence.
1054             Enforces k-mers composed of purely valid_char if requested.
1055             Converts characters to upper case where possible. This may become an option
1056             in the future.
1057              
1058             =cut
1059              
1060              
1061             sub _create_oligos {
1062             # Niels Larsen, November 2005.
1063             # Todd Z. DeSantis, March 2008
1064              
1065             my (
1066 0     0     $str, # Sequence string
1067             $word_len, # Word length
1068             $valid_chars
1069             ) = @_;
1070            
1071            
1072 0           my (@oligos, %oligos, $i, $len);
1073 0           $str = uc $str;
1074              
1075 0           my @good_spans = ();
1076 0 0         if ($valid_chars) {
1077 0           @good_spans = split (/[^$valid_chars]/, $str); # pattern match done once
1078             # instead of for every k-mer
1079             } else {
1080             # put whole string as first element
1081 0           $good_spans[0] = $str;
1082             }
1083 0           foreach my $good_span (@good_spans) {
1084             # print STDERR "good_span: $good_span\n";
1085 0           my $span_len = length($good_span);
1086 0 0         next if ($span_len < $word_len);
1087 0           my $i;
1088 0           for ( $i = 0; $i <= $span_len - $word_len; $i++ ) {
1089 0           $oligos{ substr $good_span, $i, $word_len } = undef;
1090             }
1091             }
1092            
1093 0           @oligos = keys %oligos;
1094            
1095             ## use for testing:
1096            
1097             # foreach my $o (@oligos) {
1098             # if ($o =~ /[^ACGT]/) {
1099             # print STDERR $o . "\n";
1100             # }
1101             # }
1102            
1103              
1104              
1105 0 0         wantarray ? return @oligos : \@oligos;
1106             }
1107              
1108             =head2 _nearest_index
1109              
1110             Finds the index of a given sorted array where a given number
1111             would fit in the sorted order. The returned index can then
1112             be used to insert the number, for example. The array may
1113             contain integers and/or floats, same for the number.
1114              
1115             =cut
1116              
1117             sub _nearest_index
1118             {
1119             # Niels Larsen, May 2004.
1120 0     0     my ( $array, # Array of numbers
1121             $value, # Lookup value
1122             ) = @_;
1123              
1124             # Returns an integer.
1125              
1126 0           my ( $low, $high ) = ( 0, scalar @{ $array } );
  0            
1127 0           my ( $cur );
1128              
1129 0           while ( $low < $high )
1130             {
1131 0           $cur = int ( ( $low + $high ) / 2 );
1132              
1133 0 0         if ( $array->[$cur]->[1] < $value ) {
1134 0           $low = $cur + 1;
1135             } else {
1136 0           $high = $cur;
1137             }
1138             }
1139              
1140 0           return $low;
1141             }
1142              
1143             =head2 _count_records
1144              
1145             Counts the number of records in a fasta formated file.
1146              
1147             =cut
1148              
1149             sub _count_records
1150             {
1151             # Todd Z. DeSantis, July 2010.
1152 0     0     my ( $fasta_file ) = @_;
1153              
1154             # Returns an integer.
1155 0           my $c = 0;
1156 0           my $ffh = new IO::File $fasta_file, "r";
1157 0           while (<$ffh>) {
1158 0 0         $c++ if ($_ =~ /^\>/);
1159             }
1160 0           $ffh->close;
1161 0           return $c;
1162             }
1163              
1164              
1165             1;
1166              
1167             =head2 _update_scores_C
1168              
1169             Receives a list of indicies, score vector, and the final index
1170             Updates the score vector by incrementing the oligo
1171             match count for the correct sequences.
1172              
1173             =cut
1174              
1175             __DATA__