File Coverage

lib/Bio/Kmer.pm
Criterion Covered Total %
statement 43 305 14.1
branch 1 100 1.0
condition 0 25 0.0
subroutine 14 35 40.0
pod 11 18 61.1
total 69 483 14.2


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             # Kmer.pm: a kmer counting module
4             # Author: Lee Katz
5              
6             package Bio::Kmer;
7             require 5.10.0;
8             our $VERSION=0.41;
9              
10 4     4   22662 use strict;
  4         8  
  4         130  
11 4     4   21 use warnings;
  4         7  
  4         130  
12              
13 4     4   22 use List::Util qw/max/;
  4         6  
  4         288  
14 4     4   25 use File::Basename qw/basename fileparse dirname/;
  4         22  
  4         279  
15 4     4   817 use File::Temp qw/tempdir tempfile/;
  4         17327  
  4         234  
16 4     4   25 use File::Path qw/remove_tree/;
  4         8  
  4         206  
17 4     4   24 use Data::Dumper qw/Dumper/;
  4         8  
  4         230  
18 4     4   594 use IO::Uncompress::Gunzip;
  4         38204  
  4         311  
19 4     4   1535 use File::Which qw/which/;
  4         3171  
  4         230  
20 4     4   28 use Carp qw/croak carp confess/;
  4         9  
  4         433  
21              
22             our $iThreads; # boolean for whether threads are loaded
23             BEGIN{
24 4     4   15 eval{
25 4         2336 require threads;
26 0         0 require threads::shared;
27 0         0 $iThreads = 1;
28             };
29 4 50       4311 if($@){
30 4         114 $iThreads = 0;
31             }
32             }
33              
34 4     4   23 use Exporter qw/import/;
  4         9  
  4         555  
35             our @EXPORT_OK = qw(
36             );
37              
38             our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
39             our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fa);
40             our @bamExt=qw(.sorted.bam .bam);
41             our @vcfExt=qw(.vcf.gz .vcf);
42             our @richseqExt=qw(.gbk .gbf .gb .embl);
43             our @sffExt=qw(.sff);
44             our @samExt=qw(.sam .bam);
45              
46 4     4   71 our $fhStick :shared; # Helps us open only one file at a time
  4         10  
  4         27  
47 4     4   315 our $enqueueStick :shared; # Helps control access to the kmer queue
  4         7  
  4         15  
48              
49              
50             # TODO if 'die' is imported by a script, redefine
51             # sig die in that script as this function.
52             local $SIG{'__DIE__'} = sub { my $e = $_[0]; $e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/; die("$0: ".(caller(1))[3].": ".$e); };
53              
54             my $startTime = time();
55             sub logmsg{
56 0     0 0   local $0 = basename $0;
57 0           my $tid = 0;
58 0 0         if($iThreads){
59 0           $tid = threads->tid;
60             }
61 0           my $elapsedTime = time() - $startTime;
62 0           print STDERR "$0.$tid $elapsedTime @_\n";
63             }
64              
65             =pod
66              
67             =head1 NAME
68              
69             Bio::Kmer - Helper module for Kmer Analysis.
70              
71             =head1 SYNOPSIS
72              
73             A module for helping with kmer analysis.
74              
75             use strict;
76             use warnings;
77             use Bio::Kmer;
78            
79             my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});
80             my $kmerHash=$kmer->kmers();
81             my $countOfCounts=$kmer->histogram();
82              
83             The BioPerl way
84              
85             use strict;
86             use warnings;
87             use Bio::SeqIO;
88             use Bio::Kmer;
89              
90             # Load up any Bio::SeqIO object. Quality values will be
91             # faked internally to help with compatibility even if
92             # a fastq file is given.
93             my $seqin = Bio::SeqIO->new(-file=>"input.fasta");
94             my $kmer=Bio::Kmer->new($seqin);
95             my $kmerHash=$kmer->kmers();
96             my $countOfCounts=$kmer->histogram();
97              
98             =head1 DESCRIPTION
99              
100             A module for helping with kmer analysis. The basic methods help count kmers and can produce a count of counts. Currently this module only supports fastq format. Although this module can count kmers with pure perl, it is recommended to give the option for a different kmer counter such as Jellyfish.
101              
102             =pod
103              
104             =head1 VARIABLES
105              
106             =over
107              
108             =item $Bio::Kmer::iThreads
109              
110             Boolean describing whether the module instance is using threads
111              
112             =back
113              
114             =head1 METHODS
115              
116             =over
117              
118             =item Bio::Kmer->new($filename, \%options)
119              
120             Create a new instance of the kmer counter. One object per file.
121              
122             Filename can be either a file path or a Bio::SeqIO object.
123              
124             Applicable arguments for \%options:
125             Argument Default Description
126             kmercounter perl What kmer counter software to use.
127             Choices: Perl, Jellyfish.
128             kmerlength|k 21 Kmer length
129             numcpus 1 This module uses perl
130             multithreading with pure perl or
131             can supply this option to other
132             software like jellyfish.
133             gt 1 If the count of kmers is fewer
134             than this, ignore the kmer. This
135             might help speed analysis if you
136             do not care about low-count kmers.
137             sample 1 Retain only a percentage of kmers.
138             1 is 100%; 0 is 0%
139             Only works with the perl kmer counter.
140             verbose 0 Print more messages.
141              
142             Examples:
143             my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});
144              
145             =back
146              
147             =cut
148              
149             sub new{
150 0     0 1   my($class,$seqfile,$settings)=@_;
151              
152 0 0         croak "ERROR: need a sequence file or a Bio::SeqIO object" if(!$seqfile);
153              
154             # Set optional parameter defaults
155 0   0       $$settings{kmerlength} = $$settings{kmerlength} || $$settings{k} || 21;
156 0   0       $$settings{numcpus} ||= 1;
157 0   0       $$settings{gt} ||= 1;
158 0   0       $$settings{kmercounter} ||= "perl";
159 0   0       $$settings{tempdir} ||= tempdir("Kmer.pm.XXXXXX",TMPDIR=>1,CLEANUP=>1);
160 0 0         $$settings{sample} = 1 if(!defined($$settings{sample}));
161 0   0       $$settings{verbose} ||= 0;
162              
163             # If the first parameter $seqfile is a Bio::SeqIO object,
164             # then send it to a file to dovetail with the rest of
165             # this module.
166 0 0 0       if(ref($seqfile) && $seqfile->isa("Bio::SeqIO")){
167             # BioPerl isn't required at compile tile but is
168             # required if the user starts with a BioPerl object.
169 0           eval {
170 0           require Bio::SeqIO;
171 0           require Bio::Seq::Quality;
172             };
173 0 0         if($@){
174 0           croak "ERROR: cannot load Bio::SeqIO and Bio::Seq::Quality, but I need to because you supplied a Bio::SeqIO object";
175             }
176 0           my $tempfile="$$settings{tempdir}/bioperl_input.fastq";
177 0           my $out=Bio::SeqIO->new(-file=>">".$tempfile);
178 0           while(my $seq=$seqfile->next_seq){
179 0   0       my $qual = $seq->qual || "I" x $seq->length;
180 0           my $seqWithQual = Bio::Seq::Quality->new(
181             # TODO preserve qual values if they exist, but
182             # for now, it doesn't really matter.
183             -qual=> $qual,
184             -seq => $seq->seq,
185             -id => $seq->id,
186             -verbose => -1,
187             );
188 0           $out->write_seq($seqWithQual);
189             }
190 0           $out->close; # make sure output is flushed
191              
192             # now redefine the seqfile as the new file on disk.
193 0           $seqfile=$tempfile;
194             }
195              
196             # Check if we have a valid sequence file path or at
197             # the very least, double check that the file path we just
198             # made from the Bio::SeqIO object is valid.
199 0 0         croak "ERROR: could not locate the sequence file $seqfile" if(!-e $seqfile);
200              
201             # Initialize the object and then bless it
202             my $self={
203             seqfile =>$seqfile,
204             kmerlength =>$$settings{kmerlength},
205             numcpus =>$$settings{numcpus},
206             tempdir =>$$settings{tempdir},
207             gt =>$$settings{gt},
208             kmercounter=>$$settings{kmercounter},
209             sample =>$$settings{sample},
210             verbose =>$$settings{verbose},
211              
212             # Jellyfish-specific
213 0           jellyfish =>scalar(which("jellyfish")),
214              
215             # Values that will be filled in after analysis
216             _kmers =>{}, # hash of kmer=>count
217             _hist =>[], # histogram. 0th element is counts of kmers occuring no times, 1st is elements occuring exactly once, etc
218             _ntcount =>0, # Total number of nucleotides
219             };
220             # Add in some other temporary files
221             #($$self{kmerfileFh},$$self{kmerfile}) = tempfile("KMER.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
222 0           ($$self{histfileFh},$$self{histfile}) = tempfile("HIST.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
223 0           ($$self{jellyfishdbFh},$$self{jellyfishdb})= tempfile("JF.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".jf");
224              
225             # Make some values lc
226 0           $$self{$_}=lc($$self{$_}) for(qw(kmercounter));
227              
228             # Set an exact parameter for the kmer counter
229 0 0         if($$self{kmercounter}=~ /(pure)?.*perl/){
    0          
230 0           $$self{kmercounter}="perl";
231             } elsif($self->{kmercounter} =~ /jellyfish/){
232 0           $$self{kmercounter}="jellyfish";
233             }
234              
235 0           bless($self);
236              
237 0           $self->count; # start off the kmer counting ASAP
238              
239 0           return $self;
240             }
241              
242             =pod
243              
244             =over
245              
246             =item $kmer->ntcount()
247              
248             Returns the number of base pairs counted.
249             In some cases such as when counting with Jellyfish,
250             that number is not calculated; instead the length
251             is calculated by the total length of kmers.
252             Internally, this number is stored as $kmer->{_ntcount}.
253              
254             Note: internally runs $kmer->histogram() if
255             $kmer->{_ntcount} is not initially found.
256              
257             Arguments: None
258             Returns: integer
259              
260             =back
261              
262             =cut
263              
264             sub ntcount{
265 0     0 1   my($self) = @_;
266              
267             # Get the histogram just in case it's needed.
268             # Do not wait until later to get the histogram
269             # because I want to make sure this sub is
270             # streamlined despite which underlying mechanism
271             # is used.
272 0           my $hist = $self->histogram();
273              
274             # If the length was already calculated, return it
275 0 0         if($$self{_ntcount}){
276 0           return $$self{_ntcount};
277             }
278              
279             # Approximate the length by just counting
280             # how many unique kmers there are.
281 0           my $length = 0;
282 0           $length += $_ for(@$hist);
283 0           $$self{_ntcount} = $length;
284 0 0         if($length < 1){
285 0           croak "ERROR: could not calculate total length for some reason (internal error)";
286             }
287 0           return $length;
288             }
289              
290             =pod
291              
292             =over
293              
294             =item $kmer->count()
295              
296             Count kmers. This method is called as soon as new() is called
297             and so you should never have to run this method.
298             Internally caches the kmer counts to ram.
299              
300             Arguments: None
301             Returns: None
302              
303             =back
304              
305             =cut
306              
307             sub count{
308 0     0 1   my($self)=@_;
309              
310 0           my $seqfile=$self->{seqfile};
311 0           my $kmerlength=$self->{kmerlength};
312              
313 0           my $kmerHash={};
314              
315 0 0         if($self->{kmercounter} eq "perl"){
    0          
316 0 0         if(!$iThreads){
317 0           croak("Warning: perl iThreads are not enabled in this version of perl, but you requested iThreads when running ". ref($self));
318             }
319 0           $self->countKmersPurePerl($seqfile,$kmerlength);
320             } elsif($self->{kmercounter} eq "jellyfish"){
321             # TODO make JF DB file $self->{jellyfishDb} and do not return
322             # a kmer count
323 0           $self->countKmersJellyfish($seqfile,$kmerlength);
324             } else {
325 0           croak "ERROR: I do not understand the kmer counter $self->{kmercounter}";
326             }
327              
328 0 0         if($$self{sample} < 1){
329              
330 0           my $sample = $$self{sample};
331 0           my $kmers = $self->kmers();
332            
333 0           my %subsampledKmers;
334 0           for my $kmer(keys(%$kmers)){
335 0           my $count = $$kmers{$kmer};
336 0           for(my $i=0;$i<$count;$i++){
337 0 0         if($sample >= rand(1)){
338 0           $subsampledKmers{$kmer}++;
339             }
340             }
341             }
342            
343 0           $self->clearCache();
344 0           $$self{_kmers} = \%subsampledKmers;
345             }
346             }
347              
348             =pod
349              
350             =over
351              
352             =item $kmer->clearCache
353              
354             Clears kmer counts and histogram counts. You should probably never use
355             this method.
356              
357             Arguments: None
358             Returns: None
359              
360             =back
361              
362             =cut
363              
364             sub clearCache{
365 0     0 1   my($self) = @_;
366              
367 0           $$self{_kmers} = {};
368 0           $$self{_hist} = [];
369             }
370              
371             =pod
372              
373             =over
374              
375             =item $kmer->query($queryString)
376              
377             Query the set of kmers with your own query
378              
379             Arguments: query (string)
380             Returns: Count of kmers.
381             0 indicates that the kmer was not found.
382             -1 indicates an invalid kmer (e.g., invalid length)
383              
384             =back
385              
386             =cut
387              
388             sub query{
389 0     0 1   my($self,$query)=@_;
390              
391 0 0         if(length($query) != $self->{kmerlength}){
392 0           return -1;
393             }
394              
395 0           my $count=0;
396 0 0         if($self->{kmercounter} eq "perl"){
    0          
397 0           my $kmers=$self->kmers();
398 0   0       $count=$$kmers{uc($query)} || 0;
399             } elsif($self->{kmercounter} eq "jellyfish"){
400 0 0         open(my $queryFh, "$$self{jellyfish} query ".$self->{jellyfishdb}." |") or croak "ERROR: could not run jellyfish query: $!";
401 0           my $db=$self->{jellyfishdb};
402 0           my $tmp=`jellyfish query $db $query`;
403 0 0         croak "ERROR: could not run jellyfish query" if $?;
404 0           chomp($tmp);
405 0           (undef,$count)=split(/\s+/,$tmp);
406             }
407            
408 0           return $count;
409             }
410              
411             =pod
412              
413             =over
414              
415             =item $kmer->histogram()
416              
417             Count the frequency of kmers. Internally caches the histogram to ram.
418              
419             Arguments: none
420             Returns: Reference to an array of counts. The index of
421             the array is the frequency.
422              
423             =back
424              
425             =cut
426              
427             sub histogram{
428 0     0 1   my($self)=@_;
429              
430 0 0         if(@{ $$self{_hist} } > 0){
  0            
431 0           return $$self{_hist};
432             }
433              
434 0 0         if($self->{kmercounter} eq "jellyfish"){
435 0           return $self->histogramJellyfish();
436             } else {
437 0           return $self->histogramPerl();
438             }
439             }
440              
441             sub histogramJellyfish{
442 0     0 0   my($self)=@_;
443            
444 0           close $self->{histfileFh};
445              
446             # Run jellyfish histo
447 0           my $jellyfishXopts = join(" ","-t", $self->{numcpus}, "-o", $self->{histfile}, $self->{jellyfishdb});
448 0           system("$$self{jellyfish} histo $jellyfishXopts");
449 0 0         croak "ERROR with jellyfish histo" if $?;
450            
451             # Read the output file
452 0           my @hist=(0);
453 0 0         open(my $fh, $self->{histfile}) or croak "ERROR: reading $self->{histfile}: $!";
454 0           while(<$fh>){
455 0           s/^\s+|\s+$//g;
456 0           my($count, $countOfCounts)=split /\s+/;
457 0           $hist[$count]=$countOfCounts;
458             }
459 0           close $fh;
460              
461             # Fill in gaps in the histogram
462 0           for(@hist){
463 0   0       $_||=0;
464             }
465              
466 0           $self->{_hist} = \@hist;
467 0           return \@hist;
468             }
469              
470             sub histogramPerl{
471 0     0 0   my($self)=@_;
472 0           my %hist=();
473              
474 0           my @hist=(0); # initialize the histogram with a count of zero kmers happening zero times
475             #$hist[0]=4**$self->{kmerlength}; # or maybe it should be initialized to the total number of possibilities.
476              
477 0           for my $kmercount(values(%{ $self->kmers() } )){
  0            
478 0           $hist{$kmercount}++;
479             }
480              
481             # Turn this hash into an array
482 0           for(1..max(keys(%hist))){
483 0   0       $hist[$_] = $hist{$_} || 0;
484             #$hist[0]=$hist[0] - $hist[$_]; # subtract off from the total space of possible kmers
485             }
486              
487 0           $self->{_hist}=\@hist;
488 0           return \@hist;
489             }
490              
491             # Pure perl to make this standalone... the only reason
492             # we are counting kmers in Perl instead of C.
493             sub countKmersPurePerl{
494 0     0 0   my($self,$seqfile,$kmerlength)=@_;
495              
496 0           my @allSeqs;
497              
498             # Record the total number of nucleotides
499 0           my $ntCount = 0;
500              
501             # Save all seqs to an array, for passing out to individual threads.
502 0           my $fastqFh=$self->openFastq($seqfile);
503 0           my $i=0;
504 0           my @buffer=();
505 0           while(<$fastqFh>){ # burn the read ID line
506 0           $i++;
507 0           my $seq=<$fastqFh>;
508 0           chomp($seq);
509 0           push(@allSeqs, uc($seq));
510 0           $ntCount += length($seq);
511             # Burn the quality score lines
512 0           <$fastqFh>;
513 0           <$fastqFh>;
514             }
515 0           close $fastqFh;
516              
517 0           $$self{_ntcount}=$ntCount;
518              
519             # The number of sequences per thread is divided evenly but cautions
520             # toward having one extra sequence per thread in the first threads
521             # rather than accidentally leaving some off at the end.
522 0           my $numSeqsPerThread = int(scalar(@allSeqs)/$self->{numcpus}) + 1;
523              
524             # Multithreading
525 0           my @thr;
526 0           for(0..$self->{numcpus}-1){
527             # Get a set of sequences to kmerize
528 0           my @threadSeqs = splice(@allSeqs, 0, $numSeqsPerThread);
529             # Set up a place for kmers to land
530 0           $thr[$_]=threads->new(\&_countKmersPurePerlWorker,$kmerlength,\@threadSeqs,$self->{sample});
531             #logmsg "Kicking off thread ".$thr[$_]->tid." with ".scalar(@threadSeqs)." sequences";
532             }
533            
534             # Join the threads and put everything into a large kmer hash
535 0           my %kmer;
536 0           for(@thr){
537 0           my $kmerArr = $_->join;
538 0           for my $kmer(@$kmerArr){
539 0           $kmer{$kmer}++;
540             }
541             }
542            
543 0           my($fh, $filename) = tempfile("KMER.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
544 0           ($$self{kmerfileFh},$$self{kmerfile}) = ($fh, $filename);
545              
546             # Write everything to file. The FH should still be open.
547             # Do not return the kmer.
548             # Make a new method that returns the kmer hash
549             # Do the same for jellyfish
550 0           while(my($kmer,$count)=each(%kmer)){
551             # Filtering step
552 0 0         if($count < $self->{gt}){
553             #delete($kmer{$kmer});
554 0           next;
555             }
556            
557 0           print $fh "$kmer\t$count\n";
558             }
559 0           close $fh;
560              
561 0           return 1;
562             }
563              
564             =pod
565              
566             =over
567              
568             =item $kmer->kmers
569              
570             Return actual kmers
571              
572             Arguments: None
573             Returns: Reference to a hash of kmers and their counts
574              
575             =back
576              
577             =cut
578              
579              
580             sub kmers{
581 0     0 1   my($self)=@_;
582              
583             # Look for the cached results before trying to read the file.
584 0 0         return $self->{_kmers} if(keys(%{ $self->{_kmers} }) > 0);
  0            
585              
586             # Dump the kmers to a tab delimited file if we are using
587             # jellyfish and the user invokes this method.
588 0 0         if($self->{kmercounter} eq "jellyfish"){
589 0           $self->_dumpKmersJellyfish();
590             }
591              
592 0           my %kmer;
593 0 0         open(my $fh, $self->{kmerfile}) or croak "ERROR: could not read the kmer file: $!";
594 0           while(<$fh>){
595 0           chomp;
596 0           my($kmer,$count)=split /\t/;
597 0           $kmer{$kmer}=$count;
598             }
599 0           close $fh;
600              
601 0           $self->{_kmers}=\%kmer;
602              
603 0           return \%kmer;
604             }
605              
606             =pod
607              
608             =over
609              
610             =item $kmer->union($kmer2)
611              
612             Finds the union between two sets of kmers
613              
614             Arguments: Another Bio::Kmer object
615             Returns: List of kmers
616              
617             =back
618              
619             =cut
620              
621             sub union{
622 0     0 1   my($self,$other)=@_;
623            
624 0 0         if(!$self->_checkCompatibility($other)){
625 0           croak "ERROR: two objects are not compatible in a union: ".ref($self)." / ".ref($other);
626             }
627              
628             # See what kmers are in common using hashes
629 0           my %union;
630 0           my $kmer1 = $self->kmers;
631 0           my $kmer2 = $other->kmers;
632 0           for my $kmer(keys(%{ $self->kmers }), keys(%{ $other->kmers })){
  0            
  0            
633 0           $union{$kmer}=1;
634             }
635              
636 0           return [keys(%union)];
637             }
638              
639             =pod
640              
641             =over
642              
643             =item $kmer->intersection($kmer2)
644              
645             Finds the intersection between two sets of kmers
646              
647             Arguments: Another Bio::Kmer object
648             Returns: List of kmers
649              
650             =back
651              
652             =cut
653              
654             sub intersection{
655 0     0 1   my($self,$other)=@_;
656              
657 0 0         if(!$self->_checkCompatibility($other)){
658 0           croak "ERROR: two objects are not compatible in an intersection: ".ref($self)." / ".ref($other);
659             }
660              
661 0           my @intersection;
662 0           my $kmer2 = $other->kmers;
663 0           for my $kmer(keys(%{ $self->kmers })){
  0            
664 0 0         if($$kmer2{$kmer}){
665 0           push(@intersection, $kmer);
666             }
667             }
668              
669 0           return \@intersection;
670             }
671              
672             =pod
673              
674             =over
675              
676             =item $kmer->subtract($kmer2)
677              
678             Finds the set of kmers unique to this Bio::Kmer object.
679              
680             Arguments: Another Bio::Kmer object
681             Returns: List of kmers
682              
683             =back
684              
685             =cut
686              
687             sub subtract{
688 0     0 1   my($self,$other)=@_;
689              
690 0 0         if(!$self->_checkCompatibility($other)){
691 0           croak "ERROR: trying to subtract two incompatible ".ref($self)." objects.";
692             }
693              
694 0           my %subtractKmers = %{ $self->kmers };
  0            
695 0           for my $kmer(keys(%{ $other->kmers })){
  0            
696 0           delete($subtractKmers{$kmer});
697             }
698            
699 0           return [keys(%subtractKmers)];
700             }
701            
702            
703              
704             # See if another Bio::Kmer is the same kind as this one.
705             # Return Boolean
706             sub _checkCompatibility{
707 0     0     my($self,$other)=@_;
708              
709 0 0         if($self->{kmerlength} != $other->{kmerlength}){
710 0 0         carp "WARNING: kmer lengths do not match\n" if($self->{verbose});
711 0           return 0;
712             }
713              
714 0           return 1;
715             }
716              
717             sub _countKmersPurePerlWorker{
718 0     0     my($kmerlength,$seqArr,$sample)=@_;
719              
720 0           my $outKmerArray=[];
721 0           for my $seq(@$seqArr){
722              
723 0           my $numKmersInRead=length($seq)-$kmerlength;
724              
725             # Count kmers in a sliding window.
726             # We must keep this loop optimized for speed.
727 0           for(my $j=0;$j<$numKmersInRead;$j++){
728             #next if($sample < rand(1)); # subsample
729             #$kmer{substr($seq,$j,$kmerlength)}++;
730 0           push(@$outKmerArray, substr($seq,$j,$kmerlength));
731             }
732              
733             }
734 0           return $outKmerArray;
735             }
736              
737              
738             sub countKmersJellyfish{
739 0     0 0   my($self,$seqfile,$kmerlength)=@_;
740 0           my $basename=basename($seqfile);
741              
742             # Version checking
743 0           my $jfVersion=`jellyfish --version`; chomp($jfVersion);
  0            
744             # e.g., jellyfish 2.2.6
745 0 0         if($jfVersion =~ /(jellyfish\s+)?(\d+)?/){
746 0           my $majorVersion=$2;
747 0 0         if($majorVersion < 2){
748 0           croak "ERROR: Jellyfish v2 or greater is required for ".ref($self);
749             }
750             }
751            
752 0           my $outfile=$self->{jellyfishdb};
753              
754             # Counting
755 0           my $jellyfishCountOptions="-s 10000000 -m $kmerlength -o $outfile -t $self->{numcpus}";
756 0           my $uncompressedFastq="$self->{tempdir}/$basename.fastq";
757 0           my $zcat = which("zcat");
758 0 0         if($seqfile=~/\.gz$/i){
759 0 0         if(!-e $zcat){
760 0           croak "ERROR: could not find zcat in PATH for uncompressing $seqfile";
761             }
762 0 0         system("$zcat \Q$seqfile\E > $uncompressedFastq"); croak "ERROR uncompressing $seqfile" if $?;
  0            
763 0           system("$$self{jellyfish} count $jellyfishCountOptions \Q$uncompressedFastq\E");
764             } else {
765 0           system("$$self{jellyfish} count $jellyfishCountOptions \Q$seqfile\E");
766             }
767 0           close $self->{jellyfishdbFh};
768 0 0         croak "Error: problem with jellyfish" if $?;
769             }
770              
771             sub _dumpKmersJellyfish{
772 0     0     my($self)=@_;
773 0           my $kmerTsv=$self->{kmerfile};
774 0           my $jfDb=$self->{jellyfishdb};
775              
776 0 0         return if(-s $kmerTsv > 0);
777              
778             # Dump the kmers to a tab-delimited file if it doesn't
779             # already have contents
780 0 0         if(-s $kmerTsv < 1){
781 0           my $lowerCount=$self->{gt}-1;
782 0           system("$$self{jellyfish} dump --lower-count=$lowerCount --column --tab -o \Q$kmerTsv\E \Q$jfDb\E");
783 0 0         croak "ERROR dumping kmers from jellyfish database $jfDb" if $?;
784             }
785             }
786              
787             # Opens a fastq file in a thread-safe way.
788             # Returns a file handle.
789             sub openFastq{
790 0     0 0   my($self,$fastq)=@_;
791              
792 0           my $fh;
793              
794 0           my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
795              
796 0 0         if(!grep(/$ext/,@fastqExt)){
797 0           croak "ERROR: could not read $fastq as a fastq file";
798             }
799              
800             # Open the file in different ways, depending on if it
801             # is gzipped or if the user has gzip installed.
802 0           lock($fhStick);
803 0 0         if($ext =~/\.gz$/){
804             # use binary gzip if we can... why not take advantage
805             # of the compiled binary's speedup?
806 0           my $gzip = which('gzip');
807 0 0         if(-e $gzip){
808 0 0         open($fh,"$gzip -cd \Q$fastq\E | ") or croak "ERROR: could not open $fastq for reading!: $!";
809             }else{
810 0 0         $fh=new IO::Uncompress::Gunzip($fastq) or croak "ERROR: could not read $fastq using native perl module IO::Uncompress::Gunzip: $!";
811             }
812             } else {
813 0 0         open($fh,"<",$fastq) or croak "ERROR: could not open $fastq for reading!: $!";
814             }
815              
816 0           return $fh;
817             }
818             # In case I accidentally do $Kmer->closeFastq without thinking
819             # how ridiculous that is, let's just avoid that problem with
820             # this subroutine.
821             sub closeFastq{
822 0     0 0   my($self,$fastq)=@_;
823 0           close $fastq;
824 0           return 1;
825             }
826              
827             =pod
828              
829             =over
830              
831             =item $kmer->close()
832              
833             Cleans the temporary directory and removes this object from
834             RAM. Good for when you might be counting kmers for many
835             things but want to keep your overhead low.
836              
837             Arguments: None
838             Returns: 1
839              
840             =back
841              
842             =cut
843              
844             sub close{
845 0     0 1   my($self)=@_;
846              
847 0           remove_tree($self->{tempdir});
848 0           undef($self);
849              
850 0           return 1;
851             }
852              
853             =pod
854              
855             =head1 COPYRIGHT AND LICENSE
856              
857             MIT license. Go nuts.
858              
859             =head1 AUTHOR
860              
861             Author: Lee Katz
862              
863             For additional help, go to https://github.com/lskatz/Bio--Kmer
864              
865             CPAN module at http://search.cpan.org/~lskatz/Bio-Kmer/
866              
867             =for html
868              
869             =cut
870              
871             1;