File Coverage

blib/lib/String/Simrank.pm

Criterion	Covered	Total	%
statement	24	375	6.4
branch	0	154	0.0
condition	0	17	0.0
subroutine	8	17	47.0
pod	3	3	100.0
total	35	566	6.1

line	stmt	bran	cond	sub	pod	time	code
1							package String::Simrank;
2
3							# Note to developers:
4							# enable the VERSIONs (here and in the "use Inline C ..."
5							# statement) to match each other when ready to install in sitelib or
6							# when ready to distribute. Otherwise, disable them
7							# with commenting symbol for testing.
8
9							# use base 'Exporter';
10							# @EXPORT_OK = qw(add subtract);
11	1			1		31740	use strict;
	1					2
	1					41
12	1			1		5	use File::Basename;
	1					3
	1					128
13	1			1		1163	use IO::File;
	1					12744
	1					166
14	1			1		10	use Fcntl;
	1					3
	1					499
15	1			1		11025	use Data::Dumper;
	1					35031
	1					90
16	1			1		1139	use Storable;
	1					4409
	1					90
17	1			1		14	use vars qw($VERSION $NAME);
	1					2
	1					82
18
19
20							$VERSION = '0.079';
21							$NAME = 'String::Simrank';
22
23							# Must specify a NAME and VERSION parameter. The NAME must match your module's package
24							# name. The VERSION parameter must match the module's $VERSION
25							# variable and they must both be of the form /^\d\.\d\d$/.
26							# DATA refers to the __DATA__ section near bottom of this file.
27	1			1		1223	use Inline C => 'DATA';
	1					26131
	1					9
28
29							=head1 NAME
30
31							String::Simrank - Rapid comparisons of many strings for k-mer similarity.
32							=cut
33
34							=head1 SYNOPSIS
35
36							use String::Simrank;
37							my $sr = new String::Simrank(
38							{ data => 'db.fasta'});
39							$sr->formatdb();
40							$sr->match_oligos({ query => 'query.fasta' });
41
42							You may utilize k-mers of various sizes, e.g.:
43							$sr->formatdb( { wordlen => 6 } );
44							=cut
45
46							=head1 DESCRIPTION
47
48							The String::Simrank module allows rapid searches for similarity between query
49							strings and a database of strings. This module is maintained by molecular
50							biologists who use it for searching for similarities among strings representing
51							contiguous DNA or RNA sequences. This program does not
52							construct an alignment but rather finds the ratio
53							of nmers (A.K.A. ngrams) that are shared between a query and database
54							records.
55							The input file should be fasta formatted (either aligned or unaligned) multiple
56							sequence file.
57							The memory consumption is moderate and grows linearly with the number of sequences
58							and depends on the nmer size defined by the user. Using 7-mers, ~20,000 strings
59							each ~1,500 characters in length requires ~50 Mb.
60
61							The module can be used from the command line through the script
62							C provided in the examples folder.
63
64							By default the output is written to STDOUT and
65							represents the similarity of each query string to the top hits in the
66							the database. The format is query_id, tab, best match database_id, colon, percent similarity, space
67							second best match database_id, colon, percent similarity.
68							query1 EscCol36:100.00 EscCol36:100.00 EscCol43:99.59 EscCol29:99.24 EscCol33:99.17 EscCol10:99.02
69
70							=cut
71
72							=head1 METHODS
73
74							=cut
75
76							=head2 new( { data_path => $path })
77
78							my $sr = new String::Simrank( { data => 'many_seqs.fasta' });
79
80							Instantiates a new String::Simrank object.
81							Each simrank database should live within its own String::Simrank object.
82							In other words, each unique combination of a fasta database and n-mer
83							length will have its own instance.
84							If the database has already been formated then the wordlength will be
85							read from the formatted database's .bin file.
86							Parameters:
87
88							=over 4
89
90							=item data
91
92							Sets the path of where the data fasta file is (Database sequences, fasta format).
93							The same terminal directory is where the formated database will reside with the same
94							base name but the .fasta will be substituted with .bin.
95							Sequence names(identifiers) will be first 10 characters matched between the '>' and the first space character.
96							Be sure the sequences names are unique within this string.
97
98							=back
99
100							=cut
101
102							sub new {
103							# Todd Z. DeSantis, March 2008
104
105	0			0	1		my ($class, $cl_args) = @_;
106							# Parameter hr is called $cl_args since in
107							# many instances these will come directly from the command line.
108	0						my $self = bless {}, $class;
109	0						$self->_check_arguments_for_new($cl_args);
110							# print STDERR "you are using String::Simrank version $VERSION \n";
111	0						return $self;
112							}
113
114							=head2 _check_arguments_for_new
115
116							Internally used function.
117							Validates the 'data' argument needed so this instance knows what data
118							file will be linked. Method checks that fasta file exists and is readable
119							and creates the name which corresponds to the .bin file.
120							If errors are found, program exits.
121							Returns a hash or hash ref
122							depending on the context of the method call.
123
124							=cut
125
126
127							sub _check_arguments_for_new {
128							# Niels Larsen, May 2004.
129							# Todd Z. DeSantis, March 2008.
130
131	0			0			my ($self,
132							$cl_args, # Command Line ARGument hash
133							# not really required to come from the command line
134							) = @_;
135
136	0						my ( @errors, $error);
137
138	0	0					if ( $cl_args->{"data"} ) {
139							# $cl_args->{"data"} = &Common::File::full_file_path( $cl_args->{"data"} );
140
141	0	0					if ( -r $cl_args->{"data"}) {
142	0						$cl_args->{"binary"} = $cl_args->{"data"};
143	0						$cl_args->{"binary"} =~ s/\.[^\.]+$//;
144	0						$cl_args->{"binary"} .= ".bin";
145
146	0						$self->{param}{data} = $cl_args->{"data"};
147	0						$self->{param}{binary} = $cl_args->{"binary"};
148	0	0	0				if (-e $cl_args->{"binary"} && -r $cl_args->{"binary"}) {
149	0						print $cl_args->{"binary"} . " has been formatted\n";
150	0						$self->{binary_ready} = 1;
151							} else {
152	0						$self->{binary_ready} = 0;
153							}
154
155							} else {
156	0						push @errors, qq (Simrank.pm: Input fasta data file not found or not readable -> "$cl_args->{'data'}");
157							}
158
159							} else {
160	0						push @errors, qq (Fasta sequence file must be specified with data argument);
161							}
162
163							# Print errors if any and exit,
164
165	0	0					if ( @errors )
166							{
167	0						foreach $error ( @errors )
168							{
169	0						print STDERR "$error\n";
170							# &Common::Messages::echo_red( "ERROR: ");
171							# &Common::Messages::echo( "$error\n" );
172							}
173
174	0						exit;
175							}
176							else {
177
178	0	0					wantarray ? return %{ $cl_args } : return $cl_args;
	0
179							}
180
181							}
182
183							=head2 formatdb({ wordlen => $int, minlen => $int, silent = $bool })
184
185							$sr->formatdb({wordlen => 8, minlen => 500, silent => 0, valid_chars => 'ACGT'});
186
187							From an input collection of strings (data), generates a binary file
188							optimized for retrieval of k-mer similarities. The file will contain
189							a pre-computed map of which sequences - given by their index
190							number in the input data file - contain a given k-mer. All the
191							valid k-mers of the given length are mapped. This means each
192							k-mer from a query string can be used to look up the database strings
193							it occurs in, very quickly. The memory consumption is moderate
194							and grows linearly with the number of sequences; 20,000 takes
195							50-55 mb.
196							The first characters between the '>' character and the first space
197							is recognized as the string (sequence) identifier and should be unique for each record.
198							The number of sequences found in the input collection
199							is returned and the file xyz.bin is written to disk.
200
201							Parameters:
202
203							=over 4
204
205							=item pre_subst
206
207							An integer that determines how to interpret some special characters in the strings.
208							The substitution is done before k-mers are extracted.
209							Use 1 to eliminate all periods(.), hypens(-), and space characters(\s, which includes
210							\s,\t,\n). This is the default behavior.
211							Use 2 to convert same characters as above into underscores(_).
212
213							=item wordlen
214
215							An integer that specifies the k-mer length to index for each record in the input data file.
216							Default is 7.
217
218							=item minlen
219
220							Specifies the minimum string length for a database sequence to be worthy of indexing.
221							Default is 50.
222
223							=item valid_chars
224
225							Specifies the characters to allow for formatting. Entire k-mer must be composed
226							of these characters otherwise it will be ignored. When not defined, all
227							characters are valid.
228							Default is undef (indicating all characters are valid)
229
230							=item silent
231
232							Specifies if caller wants progress messages printed to STDERR while formatting.
233							Default is false, meaning not silent.
234
235							=back
236
237							=cut
238
239
240							sub formatdb {
241							# Niels Larsen, November 2005.
242							# Todd Z. DeSantis, July 2010
243
244	0			0	1		my ( $self, $cl_args) = @_;
245	0						$self->_check_arguments_for_formatdb($cl_args);
246
247							# Returns an integer.
248
249	0						my ( $data_fh, $bin_fh, $entry, @seq_oligos, $oligos, $oli2seqoffs,
250							$seqnum, $oligo, @sids, $sids, $bytstr, $begpos,
251							$seqtot, $bytlen, $olibeg,
252							$oli_counts, $id, $seq, $count, $all_begpos, $all_bytlen,
253							$valid_chars, $lastnums, $lastnum, @seqoffs, $off, @seqnums,
254							$id_len);
255							## I think I can exlude $lastnums, never written to disk.
256
257	0						my $cl_wordlen = $self->{param}{wordlen};
258	0						my $cl_silent = $self->{param}{silent};
259	0						my $cl_binary = $self->{param}{binary};
260	0						my $cl_data = $self->{param}{data};
261	0	0					print "cl_data: $cl_data\n" unless ($cl_silent);
262	0	0					print "cl_binary: $cl_binary\n" unless ($cl_silent);
263
264
265							# Find the number of sequences and the longest length identifier,
266	0	0					print "Counting new fasta records and measuring size of identifiers... " if not $cl_silent;
267	0						$seqtot = _count_records($cl_data);
268	0	0					print "found $seqtot\n" if not $cl_silent;
269
270	0						$seqnum = 0;
271	0						$oli_counts = "";
272	0						$valid_chars = $self->{param}{valid_chars};
273	0						$id_len = 0; # init
274
275	0						$data_fh = new IO::File $self->{param}{data}, "r";
276
277							{
278	0						local $/ = '>';
	0
279	0	0					print "Mapping sub-sequences ... \n" unless ($cl_silent);
280	0						while ( <$data_fh> ) {
281	0	0					next if ($_ eq '>'); # toss the first empty record
282	0						chomp $_;
283
284	0						my ($header, @seq_lines) = split /\n/, $_;
285	0						my ($id) = split /\s/, $header;
286	0						my $sequence = '';
287	0	0					if ($self->{param}{pre_subst} == 1) {
		0
288	0						$sequence = join '', @seq_lines;
289	0						$sequence =~ s/[\.\-\s]//g;
290							} elsif ($self->{param}{pre_subst} == 2) {
291	0						$sequence = join '_', @seq_lines;
292	0						$sequence =~ s/[\.\-\s]/_/g;
293							}
294	0	0	0				next unless ($id && $sequence);
295
296	0						push @sids, $id;
297							# format to a uniform character length AFTER longest is found
298	0	0					$id_len = length($id) if (length($id) > $id_len);
299
300	0						@seq_oligos = _create_oligos( $sequence, $cl_wordlen, $valid_chars);
301
302	0						foreach $oligo ( @seq_oligos ) {
303	0	0					if ( exists $oli2seqoffs->{ $oligo } ) {
304	0						$lastnum = $lastnums->{ $oligo }; ## I think I can exlude $lastnums, never written to disk.
305	0						$oli2seqoffs->{ $oligo } .= ",". ($seqnum - $lastnum);
306							# Keep track of the hops needed to find this oligo in the seqnums
307							} else {
308	0						$oli2seqoffs->{ $oligo } = $seqnum;
309							}
310
311	0						$lastnums->{ $oligo } = $seqnum;
312							# keep track of the last $seqnum in which this $oligo was found.
313							## I think I can exlude $lastnums, never written to disk.
314
315							}
316
317	0						$oli_counts .= ",". ( scalar @seq_oligos );
318							# string-encoded ordered list of count of unique oligos in each seqnum
319
320	0						$seqnum++;
321	0	0	0				if ( !$cl_silent && $seqnum % 5000 == 0 ) {
322	0						print "$seqnum done of $seqtot ," .
323	0						scalar(keys %{ $oli2seqoffs }) . " unique kmers found ...\n";
324							# this message is accurate since we just incremented
325							# $seqnum. So if we just finshed index 4 (which is the
326							# fifth sequence, then $seqnum would have a value of 5"
327							}
328							}
329							}
330
331	0						$seqtot = $seqnum;
332							# this is true since $seqnum was incremented after last sequence
333
334	0	0					if ( not $cl_silent )
335							{
336	0						print "Total sequence count: $seqnum of $seqtot ," .
337	0						scalar(keys %{ $oli2seqoffs }) . " unique kmers found.\n";
338							}
339	0						$data_fh->close;
340
341
342	0						$self->{db_string_count} = $seqnum;
343	0						$self->{unique_kmer_count} = scalar(keys %{ $oli2seqoffs });
	0
344
345							# Write binary file. We write binary representations using syswrite
346							# and save the word length and number of sequences (the rest can be
347							# calculated)
348
349	0						eval { $bin_fh = IO::File->new( $cl_binary, 'w' ) };
	0
350	0	0					if ($@) {
351	0						print STDERR $@;
352							# such as: your vendor has not defined Fcntl macro O_LARGEFILE
353	0	0					if ( not $bin_fh = new IO::File $cl_binary, "w") {
354
355	0						print STDERR "Could not syswrite-open file $cl_binary\n";
356	0						exit;
357							}
358							}
359							# print STDERR Dumper($bin_fh);
360	0	0					unless (defined $bin_fh) {
361	0						print STDERR "A filehandle to $cl_binary has not been defined\n";
362	0						exit;
363							}
364
365	0	0					if ( not $cl_silent ) {
366	0						print "Total number of unique oligos: " . scalar(keys %{ $oli2seqoffs } ) . "\n";
	0
367	0						print "Writing oligo map to file ...\n";
368							}
369
370							# Id length, word_length, and total number of sequences,
371
372	0						$bytstr = pack "A10", $id_len; # the max_length is encoded as text
373	0						syswrite $bin_fh, $bytstr;
374
375	0						$bytstr = pack "A10", $cl_wordlen;
376	0						syswrite $bin_fh, $bytstr;
377
378	0						$bytstr = pack "A10", $seqtot;
379	0						syswrite $bin_fh, $bytstr;
380
381							# Short ids as a string of fixed-length character words,
382	0						$bytstr = pack "A$id_len" x $seqtot, @sids;
383	0						syswrite $bin_fh, $bytstr;
384
385							# Write byte strings of matching sequence numbers, while keeping
386							# track of their byte position offsets,
387	0						$begpos = sysseek $bin_fh, 0, 1; # recommended "systell", perl book says
388	0	0					$begpos = 0 if ($begpos =~ /^0/); # sometimes $begpos gets set from sysseek as "0 but true"
389							# I think this has something to do with not using the Fcntl
390							# flags: O_WRONLY\|O_CREAT\|O_EXCL\|O_LARGEFILE
391							# since this behavoir emerged when I stopped using them.
392							# BUT, I expect sysseek to return a positive int
393							# since many bytes have been already written to this filehandle.
394
395	0						@seq_oligos = ();
396
397	0						$all_begpos = "";
398	0						$all_bytlen = "";
399
400	0						$count = 0;
401
402	0						foreach $oligo ( sort(keys %{ $oli2seqoffs }) ) # "sort" added by tzd Apr-11-2008
	0
403							{
404
405	0						@seqoffs = eval $oli2seqoffs->{ $oligo };
406							# print STDERR '@seqoffs:' . join(',', @seqoffs) . "\n";
407
408	0						@seqnums = shift @seqoffs;
409
410
411	0						foreach $off ( @seqoffs )
412							{
413	0						push @seqnums, $seqnums[-1] + $off;
414							# -1 looks-up the value of the last element in the @seqnums
415
416							}
417							# print STDERR '@seqnums:' . join(',', @seqnums) . "\n";
418
419	0						$bytstr = pack "I*", @seqnums; # unsigned integers
420							# the rest of this module
421							# assumes that integers will
422							# be 4-byte which is a fairly
423							# safe assumption
424	0						syswrite $bin_fh, $bytstr;
425
426	0						$bytlen = length $bytstr;
427
428	0						$all_begpos .= ",$begpos";
429	0						$all_bytlen .= ",$bytlen";
430
431	0						$begpos += $bytlen;
432
433	0						delete $oli2seqoffs->{ $oligo };
434
435	0						push @seq_oligos, $oligo;
436
437	0						$count += 1;
438							}
439
440	0						$oli2seqoffs = {};
441
442	0						$olibeg = $begpos;
443
444							# Write the oligos found,
445
446	0						$bytstr = pack "A$cl_wordlen" x ( scalar @seq_oligos ), @seq_oligos;
447	0						syswrite $bin_fh, $bytstr;
448
449							# The begin positions of the corresponding run of sequence indices,
450							# and their lengths,
451
452	0						$all_begpos =~ s/^,//; # get rid of initial comma
453	0						$bytstr = pack "I*", eval $all_begpos;
454	0						syswrite $bin_fh, $bytstr;
455
456	0						$all_bytlen =~ s/^,//;
457	0						$bytstr = pack "I*", eval $all_bytlen;
458	0						syswrite $bin_fh, $bytstr;
459
460							# Store the number of unique oligos for every sequence,
461
462	0						$oli_counts =~ s/^,//;
463	0						$bytstr = pack "I*", eval $oli_counts;
464	0						syswrite $bin_fh, $bytstr;
465
466							# Finally, we need to store the number of oligos encountered and the
467							# byte position of where they were stored,
468
469	0						$bytstr = pack "A10", scalar @seq_oligos;
470	0						syswrite $bin_fh, $bytstr;
471
472	0						$bytstr = pack "A10", $olibeg;
473	0						syswrite $bin_fh, $bytstr;
474
475	0						$bin_fh->close;
476
477
478	0	0					print "format binary file complete\n" unless ($cl_silent);
479	0						return $seqnum;
480							}
481
482							=head2 _check_arguments_for_formatdb
483
484							It checks that word length and minimum sequence length is reasonable.
485							Sets silent to true by default.
486							Sets pre_subst to 1 by default.
487							Defines binary file name.
488							If errors are found, program exits.
489							Returns a hash or hash ref
490							depending on the context of the method call.
491
492							=cut
493
494
495							sub _check_arguments_for_formatdb {
496							# Niels Larsen, May 2004.
497							# Todd Z. DeSantis, March 2008.
498	0			0			my ( $self,
499							$cl_args, # Command Line ARGument hash
500							# not really reqired to come from the command line
501							) = @_;
502
503	0						my ( @errors, $error);
504
505							## see if caller wants to constrain the alphabet
506	0	0					if ( exists $cl_args->{valid_chars} ) {
507	0						$self->{param}{valid_chars} = $cl_args->{valid_chars};
508							} else {
509	0						$self->{param}{valid_chars} = undef;
510							}
511
512							## see if caller wants to make some pre-substitutions
513	0	0					if ( exists $cl_args->{pre_subst} ) {
514	0						$self->{param}{pre_subst} = $cl_args->{pre_subst};
515							} else {
516	0						$self->{param}{pre_subst} = 1;
517							}
518
519							# Ensure mininum sequence length is 1 or more and fill in 50
520							# if user didnt specify it
521	0	0					if ( $cl_args->{"minlen"} )
522							{
523	0	0					if ( $cl_args->{"minlen"} < 1 ) {
524	0						push @errors, qq (Minimum sequence length should be 1 or more);
525							}
526							}
527							else {
528	0						$cl_args->{"minlen"} = 50;
529							}
530	0						$self->{param}{minlen} = $cl_args->{"minlen"};
531
532							# Ensure word length is over 0 and less than
533							# or equal to minlen and fill in 7 if the
534							# user didnt specify it,
535	0	0					if ( $cl_args->{"wordlen"} ) {
536	0	0					if ( $cl_args->{"wordlen"} < 1 ) {
537	0						push @errors, qq (Word length should be at least 1);
538							}
539	0	0					if ( $cl_args->{"wordlen"} > $cl_args->{"minlen"} ) {
540	0						push @errors, qq (Word length no greater than the minlen);
541							}
542							} else {
543	0						$cl_args->{"wordlen"} = 7;
544							}
545	0						$self->{param}{wordlen} = $cl_args->{"wordlen"};
546
547	0	0					if ( not defined $cl_args->{"silent"} ) {
548	0						$cl_args->{"silent"} = 0
549							}
550	0						$self->{param}{silent} = $cl_args->{"silent"};
551
552							# Print errors if any and exit,
553	0	0					if ( @errors )
554							{
555	0						foreach $error ( @errors )
556							{
557	0						print STDERR "$error\n";
558							}
559
560	0						exit;
561							}
562							else {
563	0	0					wantarray ? return %{ $cl_args } : return $cl_args;
	0
564							}
565
566							}
567
568							=head2 match_oligos({query => $path})
569
570							$sr->match_oligos({query => 'query.fasta'});
571							$sr->match_oligos({ query => 'query.fasta',
572							outlen => 10,
573							minpct => 95,
574							reverse => 1,
575							outfile => '/home/donny/sr_results.txt',
576							});
577
578							my $matches = $sr->match_oligos({query => 'query.fasta'});
579							print Dumper($matches);
580							foreach my $k (keys %{$matches} ) {
581							print "matches for $k :\n";
582							foreach my $hit ( @{ $matches->{$k} } ) {
583							print "hit id:" . $hit->[0] . " perc:" . $hit->[1] . "\n";
584							}
585							}
586
587							This routine quickly estimates the overall similarity between
588							a given set of DNA or RNA sequence(s) and a background set
589							of database sequences (usually homologues).
590							It returns a sorted list of similarities as a
591							table. The similarity between sequences A and B are the number
592							of unique k-words (short subsequence) that they share, divided
593							by the smallest total unique k-word count in either A or B. The result
594							are scores that do not depend on sequence lengths. When
595							called in void context, the routine prints to the given output
596							file or STDOUT; otherwise a hash_ref is returned.
597
598							Parameters:
599
600							=over 4
601
602							=item query
603
604							Sets the path where the query fasta file will be found.
605							Required. In future versions, query could be a data structure instead.
606							Need to build an abstract iterator for this feature to be enabled.
607
608							=item minpct
609
610							A real number indicating the the minimum percent match that should
611							be output/returned.
612							Default = 50.
613
614							=item outlen
615
616							An integer indicating the maximum number of ranked db matches that
617							should be output/returned.
618							Default = 100.
619
620							=item valid_chars
621
622							Specifies the characters to allow for kmer-searching. Entire n-mer must be composed
623							of these characters otherwise it will be ignored. When not defined, all
624							characters are valid.
625							Default is undef (indicating all characters are valid)
626
627							=item reverse
628
629							A boolean value. If true, reverses input sequence.
630							Default = false.
631
632							=item noids
633
634							A boolean value. If true, prints database index numbers instead of sequence ids.
635							Default = false.
636
637							=item outfile
638
639							Sets the path of the output file (instead of sending to STDOUT).
640							Default = false. Meaning output is sent to STDOUT.
641
642							=item silent
643
644							Specifies if caller wants progress messages printed to STDERR while matching.
645							Default is false, meaning not silent.
646
647							=back
648
649							=cut
650
651
652							sub match_oligos {
653							# Niels Larsen, May 2004.
654							# Todd Z. DeSantis, March 2008.
655
656	0			0	1		my ( $self,
657							$cl_args, # Arguments hash
658							) = @_;
659	0						$self->_check_arguments_for_match_oligos($cl_args);
660							# Returns a list of matches if not called in void context.
661
662	0						my ( $q_seqs, $count, $silent, @sids, $wordlen, $seqnum, $bin_fh,
663							$query_fh, $bytstr, $olinum, $length, @oligos, $oligo, @scores,
664							$oli2seqnums, $vector, @begpos, @endpos, $oli2pos, $i, @seqnums,
665							$j, $seqtot, $score, @lengths, @temp, $valid_chars, $minscore,
666							$olibeg, $olitot, $begpos, $endpos, $pack_bits, @scovec, $scovec,
667							$query_sid, $out_fh, $outdir, $outline, $matches, @oli_counts,
668							$oli_count, $scovec_null, $index, $outlen, $pos, $id, $seq,
669							$id_len);
670
671	0						my $void_context = 1;
672							# Recall 'wantarray' returns
673							# True if the context is looking for a list value,
674							# False if the context is looking for a scalar,
675							# Undef if the context is looking for no value (void context).
676	0	0					$void_context = 0 if (defined wantarray); # caller wants data back
677
678	0						$silent = $self->{param}{silent};
679
680							# >>>>>>>>>>>>>>>>>>>>>> FILE MAP SECTION <<<<<<<<<<<<<<<<<<<<<<<<
681
682							# This section creates $oli2pos, where each key is a subsequence
683							# and values are [ file byte position, byte length to read ]. This
684							# map is used below to sum up similarities.
685
686	0						eval { $bin_fh = IO::File->new( $self->{param}{binary}, O_RDONLY\|O_LARGEFILE ) };
	0
687	0	0					if ($@) {
688							# such as: your vendor has not defined Fcntl macro O_LARGEFILE
689	0	0					if ( not $bin_fh = new IO::File $self->{param}{binary}, "r") {
690
691	0						print STDERR "Could not open file $self->{param}{binary} for reading\n";
692	0						exit;
693							}
694							}
695
696							# Word length and total number of sequences,
697							# reminder: sysread FILEHANDLE,SCALAR,LENGTH
698							# Attempts to read LENGTH bytes of data into variable SCALAR
699							# from the specified FILEHANDLE,
700	0						sysread $bin_fh, $bytstr, 10;
701	0						$id_len = $bytstr * 1;
702
703	0						sysread $bin_fh, $bytstr, 10;
704	0						$wordlen = $bytstr * 1;
705
706	0						sysread $bin_fh, $bytstr, 10;
707	0						$seqtot = $bytstr * 1;
708
709							# String ids as a string of fixed-length character words,
710	0						sysread $bin_fh, $bytstr, $seqtot * $id_len;
711	0						@sids = unpack "A$id_len" x $seqtot, $bytstr;
712
713							# Get the oligos and the begin and end positions of where the sequence
714							# indices start and their lengths,
715
716	0						sysseek $bin_fh, -10, 2; # goto 10 bytes before EOF
717	0						sysread $bin_fh, $bytstr, 10;
718	0						$olibeg = $bytstr * 1;
719
720	0						sysseek $bin_fh, -20, 2; # goto 20 bytes before EOF
721	0						sysread $bin_fh, $bytstr, 10;
722	0						$olitot = $bytstr * 1;
723
724	0						sysseek $bin_fh, $olibeg, 0;
725	0						sysread $bin_fh, $bytstr, $olitot * $wordlen;
726	0						@oligos = unpack "A$wordlen" x $olitot, $bytstr;
727
728	0						sysread $bin_fh, $bytstr, 4 * $olitot;
729	0						@begpos = unpack "I*", $bytstr;
730
731	0						sysread $bin_fh, $bytstr, 4 * $olitot;
732	0						@lengths = unpack "I*", $bytstr;
733
734	0						sysread $bin_fh, $bytstr, 4 * $seqtot;
735	0						@oli_counts = unpack "I*", $bytstr;
736
737							# Create a hash that returns the file positions of where to get the
738							# matching sequence indices,
739
740	0						for ( $i = 0; $i < scalar @oligos; $i++ ) {
741	0						$oli2pos->{ $oligos[$i] } = [ $begpos[$i], $lengths[$i] ];
742							}
743
744							# >>>>>>>>>>>>>>>>>>>> PROCESS QUERY ENTRIES <<<<<<<<<<<<<<<<<<<<<<<
745
746	0	0	0				if ( $void_context == 1 && $self->{param}{outfile} ) {
747							# call was made in void context
748	0						$out_fh = new IO::File $self->{param}{outfile}, "w";
749	0	0					print "Outfile opened.\n" unless ($self->{param}{silent});
750							}
751
752	0						$query_fh = new IO::File $self->{param}{"query"}, "r";
753	0						$scovec_null = pack "I*", (0) x $seqtot;
754
755	0		0				$minscore = ( $self->{param}{"minpct"} \|\| 0 ) / 100;
756
757							{
758	0						local $/ = '>';
	0
759	0						while ( <$query_fh>) {
760	0	0					next if ($_ eq '>'); # toss the first empty record
761	0						chomp $_;
762	0						my ($header, @seq_lines) = split /\n/, $_;
763	0						my ($query_sid) = split /\s/, $header;
764	0						my $sequence = '';
765	0	0					if ($self->{param}{pre_subst} == 1) {
		0
766	0						$sequence = join '', @seq_lines;
767	0						$sequence =~ s/[\.\-\s]//g;
768							} elsif ($self->{param}{pre_subst} == 2) {
769	0						$sequence = join '_', @seq_lines;
770	0						$sequence =~ s/[\.\-\s]/_/g;
771							}
772
773	0	0	0				next unless ($query_sid && $sequence);
774
775	0	0					if ( $self->{param}{"reverse"} ) {
776	0						$sequence = reverse $sequence;
777							}
778
779	0	0					print "Processing $query_sid ... \n" unless ( $silent );
780
781	0						$scovec = $scovec_null;
782
783							# >>>>>>>>>>>>>>>>>>>>>> COUNTING SECTION <<<<<<<<<<<<<<<<<<<<<<<<
784
785							# Look up the sequences that contain each of the oligos found
786							# in the sequence. The sort makes the disk jump around less,
787							# since the oligos were written in sorted order to the bin file
788
789	0						@oligos = _create_oligos( $sequence, $wordlen, $self->{param}{valid_chars} );
790
791	0						foreach $oligo ( sort @oligos ) { # "sort" added by tzd Apr-11-2008
792
793	0	0					if ( defined ( $pos = $oli2pos->{ $oligo }->[0] ) ) {
794	0						sysseek $bin_fh, $pos, 0;
795
796	0						$length = $oli2pos->{ $oligo }->[1];
797	0						sysread $bin_fh, $bytstr, $length;
798
799							# $bytstr is a perl string. Every 4-byte block
800							# in this string encodes a 4 byte integer
801							# so >4 billion integers are available.
802							# but when passed to C
803							# it will be an integer array.
804
805							# This is a C function for speed, the code is at the end of this
806							# file. The third argument is the maximum index of $bytstr array,
807
808							####### testing only
809							# print STDERR join (',', (unpack "I*", $scovec)) . "\n";
810							# print STDERR join (',', (unpack "I*", $bytstr)) . "\n";
811							#######
812
813	0						&update_scores_C( \$bytstr, \$scovec, ( $length - 1 ) / 4 );
814
815							####### testing only
816							# print STDERR '$query_sid:' . $query_sid .
817							# ' $oligo:' . $oligo . "\n" .
818							# Dumper(unpack "I*", $scovec) . "\n";
819							#######
820							}
821							}
822
823	0						@scovec = unpack "I*", $scovec;
824
825							# >>>>>>>>>>>>>>>>>>>>>> EXTRACT OUTPUT DATA <<<<<<<<<<<<<<<<<<<<<<
826
827							# The score vector, @scovec, now contains integer scores in some
828							# slots, zeros in others. To get an ordered list of just the positive
829							# scores we could grep and sort, but that would be slow because the
830							# list is long. Instead, keep a list of scores that is only as long
831							# as the output length; insert values that are higher than the last
832							# element and ignore the rest,
833
834	0						$oli_count = scalar @oligos;
835	0						$outlen = $self->{param}{"outlen"};
836
837	0						@scores = ();
838
839	0						for ( $i = 0; $i < scalar @scovec; $i++ )
840							{
841	0	0					if ( $scovec[$i] ) {
842
843							#### use for testing
844							# print STDERR '$oli_count:' . $oli_count . ' $oli_counts[$i]:' . $oli_counts[$i] . ' $scovec[$i]:' . $scovec[$i] . "\n";
845							#####
846
847	0	0					if ( $oli_count < $oli_counts[$i] ) {
848	0						$score = $scovec[$i] / $oli_count;
849							} else {
850	0						$score = $scovec[$i] / $oli_counts[$i];
851							}
852
853	0	0					if ( $score >= $minscore )
854							{
855	0	0					if ( scalar @scores < $outlen )
		0
856							{
857	0						$index = _nearest_index( \@scores, $score );
858	0						splice @scores, $index, 0, [ $i, $score ];
859							}
860							elsif ( $score > $scores[0]->[1] )
861							{
862	0						$index = _nearest_index( \@scores, $score );
863	0						splice @scores, $index, 0, [ $i, $score ];
864	0						shift @scores;
865							}
866							}
867							}
868							}
869
870	0	0					if ( $cl_args->{"noids"} ) {
871	0						foreach $score ( @scores ) {
872	0						$score->[1] = sprintf "%.2f", 100 * $score->[1];
873							}
874							} else {
875	0						foreach $score ( @scores ) {
876	0						$score->[0] = $sids[ $score->[0] ];
877	0						$score->[1] = sprintf "%.2f", 100 * $score->[1];
878							# Can we add a percision parameter here ?
879							# may be able to save disk space in the output file by getting
880							# just the 1/10 place for instance
881							}
882							}
883
884	0						@scores = reverse @scores;
885
886							# >>>>>>>>>>>>>>>>>>>>>> OUTPUT SECTION <<<<<<<<<<<<<<<<<<<<<<<<<
887
888							# If called in non-void context, generate a data structure. If
889							# in void context, print tabular data to stdout or an output file
890							# if specified,
891
892	0	0					if ( $void_context == 0 ) {
893							# means caller wants a scalar back
894							# which will be a hash ref
895	0						$matches->{ $query_sid } = &Storable::dclone( \@scores );
896							} else {
897	0						$outline = "$query_sid\t" . join "\t", map { $_->[0] .":". $_->[1] } @scores;
	0
898
899	0	0					if ( defined $out_fh ) {
900	0						print $out_fh "$outline\n";
901							} else {
902	0						print "$outline\n";
903							}
904							}
905
906	0	0					unless ( $silent ) {
907	0						print "$query_sid done\n";
908							# &Common::Messages::echo_green( qq (done\n) );
909							}
910							} # end while query_fh
911							} # end local block
912	0						$query_fh->close;
913	0						$bin_fh->close;
914
915							# Return data structure if non-void context, otherwise nothing,
916	0	0					if ( $void_context == 0 ) {
917							# means caller wants a scalar back
918	0						return $matches;
919							} else {
920	0						return;
921							}
922							}
923
924							=head2 _check_arguments_for_match_oligos
925
926							Internally used function.
927							Validates the arguments for outlen, minpct, reverse,
928							query, noids, silent, pre_subst and output.
929							If errors are found, program exits.
930							Returns a hash or hash ref
931							depending on the context of the method call.
932
933							=cut
934
935							sub _check_arguments_for_match_oligos {
936							# Niels Larsen, May 2004.
937							# Todd Z. DeSantis, March 2008.
938	0			0			my ( $self,
939							$cl_args, # Command Line ARGument hash
940							# not really reqired to come from the command line
941							) = @_;
942
943	0						my ( @errors, $error, $outdir );
944
945							## see if caller wants to constrain the alphabet
946	0	0					if ( exists $cl_args->{valid_chars} ) {
947	0						$self->{param}{valid_chars} = $cl_args->{valid_chars};
948							} else {
949	0						$self->{param}{valid_chars} = undef;
950							}
951
952							## see if caller wants to make some pre-substitutions
953	0	0					if ( exists $cl_args->{pre_subst} ) {
954	0						$self->{param}{pre_subst} = $cl_args->{pre_subst};
955							} else {
956	0						$self->{param}{pre_subst} = 1;
957							}
958
959							# Ensure output list has at least one similarity in it and set
960							# it to 100 if the user didnt specify it,
961
962	0	0					if ( $cl_args->{"outlen"} )
963							{
964	0	0					if ( $cl_args->{"outlen"} < 1 ) {
965	0						push @errors, qq (Output list length should be 1 or more);
966							}
967							}
968							else {
969	0						$cl_args->{"outlen"} = 100;
970							}
971	0						$self->{param}{outlen} = $cl_args->{"outlen"};
972
973	0	0					if ($cl_args->{"minpct"} ) {
974	0						$self->{param}{minpct} = $cl_args->{"minpct"};
975							} else {
976	0						$self->{param}{minpct} = 0;
977							}
978
979	0	0					if ($cl_args->{reverse}) {
980	0						$self->{param}{"reverse"} = $cl_args->{reverse};
981							} else {
982	0						$self->{param}{"reverse"} = 0;
983							}
984							# The mandatory query:
985							# must be a file path
986							# in the future, allow a hash ref of the structure
987							# $hr->{$id} = $sequence_string
988
989							# if file given,
990							# check that its readable. If not given, error,
991	0	0					if ( $cl_args->{"query"} ) {
992							# $cl_args->{"query"} = &Common::File::full_file_path( $cl_args->{"query"} );
993
994	0	0					if ( not -r $cl_args->{"query"} ) {
995	0						push @errors, qq (Query file not found or not readable -> "$cl_args->{'query'}");
996							}
997							}
998							else {
999	0						push @errors, qq (Query sequence file must be specified);
1000							}
1001	0						$self->{param}{query} = $cl_args->{"query"};
1002
1003							# If output file given, check if it exists and if its directory
1004							# does not exist
1005	0	0					if ( $cl_args->{"outfile"} ) {
1006
1007	0	0					if ( -e $cl_args->{"outfile"} ) {
1008	0						push @errors, qq (Output file exists -> "$cl_args->{'outfile'}");
1009							}
1010
1011	0						$outdir = File::Basename::dirname( $cl_args->{"outfile"} );
1012
1013	0	0					if ( not -d $outdir ) {
		0
1014	0						push @errors, qq (Output directory does not exist -> "$outdir");
1015							} elsif ( not -w $outdir ) {
1016	0						push @errors, qq (Output directory is not writable -> "$outdir");
1017							}
1018							} else {
1019	0						$cl_args->{"outfile"} = "";
1020							}
1021	0						$self->{param}{outfile} = $cl_args->{"outfile"};
1022							# print STDERR 'outfile:' . $self->{param}{outfile} . "\n";
1023
1024	0	0					if ( defined $cl_args->{"silent"} ) {
1025	0						$self->{param}{silent} = $cl_args->{"silent"};
1026							}
1027	0	0					if ( not defined $self->{param}{silent}) {
1028	0						$self->{param}{silent} = 1;
1029							}
1030
1031							# Print errors if any and exit,
1032
1033	0	0					if ( @errors )
1034							{
1035	0						foreach $error ( @errors )
1036							{
1037	0						print STDERR "$error\n";
1038							# &Common::Messages::echo_red( "ERROR: ");
1039							# &Common::Messages::echo( "$error\n" );
1040							}
1041
1042	0						exit;
1043							}
1044							else {
1045	0	0					wantarray ? return %{ $cl_args } : return $cl_args;
	0
1046							}
1047
1048							}
1049
1050							=head2 _create_oligos
1051
1052							Internal method.
1053							Creates a list of unique words of a given length from a given sequence.
1054							Enforces k-mers composed of purely valid_char if requested.
1055							Converts characters to upper case where possible. This may become an option
1056							in the future.
1057
1058							=cut
1059
1060
1061							sub _create_oligos {
1062							# Niels Larsen, November 2005.
1063							# Todd Z. DeSantis, March 2008
1064
1065							my (
1066	0			0			$str, # Sequence string
1067							$word_len, # Word length
1068							$valid_chars
1069							) = @_;
1070
1071
1072	0						my (@oligos, %oligos, $i, $len);
1073	0						$str = uc $str;
1074
1075	0						my @good_spans = ();
1076	0	0					if ($valid_chars) {
1077	0						@good_spans = split (/[^$valid_chars]/, $str); # pattern match done once
1078							# instead of for every k-mer
1079							} else {
1080							# put whole string as first element
1081	0						$good_spans[0] = $str;
1082							}
1083	0						foreach my $good_span (@good_spans) {
1084							# print STDERR "good_span: $good_span\n";
1085	0						my $span_len = length($good_span);
1086	0	0					next if ($span_len < $word_len);
1087	0						my $i;
1088	0						for ( $i = 0; $i <= $span_len - $word_len; $i++ ) {
1089	0						$oligos{ substr $good_span, $i, $word_len } = undef;
1090							}
1091							}
1092
1093	0						@oligos = keys %oligos;
1094
1095							## use for testing:
1096
1097							# foreach my $o (@oligos) {
1098							# if ($o =~ /[^ACGT]/) {
1099							# print STDERR $o . "\n";
1100							# }
1101							# }
1102
1103
1104
1105	0	0					wantarray ? return @oligos : \@oligos;
1106							}
1107
1108							=head2 _nearest_index
1109
1110							Finds the index of a given sorted array where a given number
1111							would fit in the sorted order. The returned index can then
1112							be used to insert the number, for example. The array may
1113							contain integers and/or floats, same for the number.
1114
1115							=cut
1116
1117							sub _nearest_index
1118							{
1119							# Niels Larsen, May 2004.
1120	0			0			my ( $array, # Array of numbers
1121							$value, # Lookup value
1122							) = @_;
1123
1124							# Returns an integer.
1125
1126	0						my ( $low, $high ) = ( 0, scalar @{ $array } );
	0
1127	0						my ( $cur );
1128
1129	0						while ( $low < $high )
1130							{
1131	0						$cur = int ( ( $low + $high ) / 2 );
1132
1133	0	0					if ( $array->[$cur]->[1] < $value ) {
1134	0						$low = $cur + 1;
1135							} else {
1136	0						$high = $cur;
1137							}
1138							}
1139
1140	0						return $low;
1141							}
1142
1143							=head2 _count_records
1144
1145							Counts the number of records in a fasta formated file.
1146
1147							=cut
1148
1149							sub _count_records
1150							{
1151							# Todd Z. DeSantis, July 2010.
1152	0			0			my ( $fasta_file ) = @_;
1153
1154							# Returns an integer.
1155	0						my $c = 0;
1156	0						my $ffh = new IO::File $fasta_file, "r";
1157	0						while (<$ffh>) {
1158	0	0					$c++ if ($_ =~ /^\>/);
1159							}
1160	0						$ffh->close;
1161	0						return $c;
1162							}
1163
1164
1165							1;
1166
1167							=head2 _update_scores_C
1168
1169							Receives a list of indicies, score vector, and the final index
1170							Updates the score vector by incrementing the oligo
1171							match count for the correct sequences.
1172
1173							=cut
1174
1175							__DATA__