File Coverage

lib/Bio/Kmer.pm
Criterion Covered Total %
statement 43 291 14.7
branch 1 96 1.0
condition 0 25 0.0
subroutine 14 34 41.1
pod 10 17 58.8
total 68 463 14.6


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             # Kmer.pm: a kmer counting module
4             # Author: Lee Katz <lkatz@cdc.gov>
5              
6             package Bio::Kmer;
7             require 5.10.0;
8             our $VERSION=0.26;
9              
10 4     4   3353 use strict;
  4         9  
  4         118  
11 4     4   27 use warnings;
  4         8  
  4         113  
12              
13 4     4   21 use List::Util qw/max/;
  4         9  
  4         256  
14 4     4   26 use File::Basename qw/basename fileparse dirname/;
  4         7  
  4         238  
15 4     4   708 use File::Temp qw/tempdir tempfile/;
  4         16294  
  4         248  
16 4     4   28 use File::Path qw/remove_tree/;
  4         9  
  4         185  
17 4     4   24 use Data::Dumper qw/Dumper/;
  4         7  
  4         190  
18 4     4   594 use IO::Uncompress::Gunzip;
  4         36904  
  4         237  
19 4     4   1457 use File::Which qw/which/;
  4         3542  
  4         213  
20 4     4   30 use Carp qw/croak carp confess/;
  4         8  
  4         410  
21              
22             our $iThreads; # boolean for whether threads are loaded
23             BEGIN{
24 4     4   15 eval{
25 4         2257 require threads;
26 0         0 require threads::shared;
27 0         0 $iThreads = 1;
28             };
29 4 50       4258 if($@){
30 4         106 $iThreads = 0;
31             }
32             }
33              
34 4     4   24 use Exporter qw/import/;
  4         9  
  4         528  
35             our @EXPORT_OK = qw(
36             );
37              
38             our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
39             our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fa);
40             our @bamExt=qw(.sorted.bam .bam);
41             our @vcfExt=qw(.vcf.gz .vcf);
42             our @richseqExt=qw(.gbk .gbf .gb .embl);
43             our @sffExt=qw(.sff);
44             our @samExt=qw(.sam .bam);
45              
46 4     4   50 our $fhStick :shared; # Helps us open only one file at a time
  4         8  
  4         27  
47 4     4   311 our $enqueueStick :shared; # Helps control access to the kmer queue
  4         8  
  4         19  
48              
49              
50             # TODO if 'die' is imported by a script, redefine
51             # sig die in that script as this function.
52             local $SIG{'__DIE__'} = sub { my $e = $_[0]; $e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/; die("$0: ".(caller(1))[3].": ".$e); };
53              
54             my $startTime = time();
55             sub logmsg{
56 0     0 0   local $0 = basename $0;
57 0           my $tid = 0;
58 0 0         if($iThreads){
59 0           $tid = threads->tid;
60             }
61 0           my $elapsedTime = time() - $startTime;
62 0           print STDERR "$0.$tid $elapsedTime @_\n";
63             }
64              
65             =pod
66              
67             =head1 NAME
68              
69             Bio::Kmer - Helper module for Kmer Analysis.
70              
71             =head1 SYNOPSIS
72              
73             A module for helping with kmer analysis.
74              
75             use strict;
76             use warnings;
77             use Bio::Kmer;
78            
79             my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});
80             my $kmerHash=$kmer->kmers();
81             my $countOfCounts=$kmer->histogram();
82              
83             The BioPerl way
84              
85             use strict;
86             use warnings;
87             use Bio::SeqIO;
88             use Bio::Kmer;
89              
90             # Load up any Bio::SeqIO object. Quality values will be
91             # faked internally to help with compatibility even if
92             # a fastq file is given.
93             my $seqin = Bio::SeqIO->new(-file=>"input.fasta");
94             my $kmer=Bio::Kmer->new($seqin);
95             my $kmerHash=$kmer->kmers();
96             my $countOfCounts=$kmer->histogram();
97              
98             =head1 DESCRIPTION
99              
100             A module for helping with kmer analysis. The basic methods help count kmers and can produce a count of counts. Currently this module only supports fastq format. Although this module can count kmers with pure perl, it is recommended to give the option for a different kmer counter such as Jellyfish.
101              
102             =pod
103              
104             =head1 VARIABLES
105              
106             =over
107              
108             =item $Bio::Kmer::iThreads
109              
110             Boolean describing whether the module instance is using threads
111              
112             =back
113              
114             =head1 METHODS
115              
116             =over
117              
118             =item Bio::Kmer->new($filename, \%options)
119              
120             Create a new instance of the kmer counter. One object per file.
121              
122             Filename can be either a file path or a Bio::SeqIO object.
123              
124             Applicable arguments for \%options:
125             Argument Default Description
126             kmercounter perl What kmer counter software to use.
127             Choices: Perl, Jellyfish.
128             kmerlength|k 21 Kmer length
129             numcpus 1 This module uses perl
130             multithreading with pure perl or
131             can supply this option to other
132             software like jellyfish.
133             gt 1 If the count of kmers is fewer
134             than this, ignore the kmer. This
135             might help speed analysis if you
136             do not care about low-count kmers.
137             sample 1 Retain only a percentage of kmers.
138             1 is 100%; 0 is 0%
139             Only works with the perl kmer counter.
140             verbose 0 Print more messages.
141              
142             Examples:
143             my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});
144              
145             =back
146              
147             =cut
148              
149             sub new{
150 0     0 1   my($class,$seqfile,$settings)=@_;
151              
152 0 0         croak "ERROR: need a sequence file or a Bio::SeqIO object" if(!$seqfile);
153              
154             # Set optional parameter defaults
155 0   0       $$settings{kmerlength} = $$settings{kmerlength} || $$settings{k} || 21;
156 0   0       $$settings{numcpus} ||= 1;
157 0   0       $$settings{gt} ||= 1;
158 0   0       $$settings{kmercounter} ||= "perl";
159 0   0       $$settings{tempdir} ||= tempdir("Kmer.pm.XXXXXX",TMPDIR=>1,CLEANUP=>1);
160 0 0         $$settings{sample} = 1 if(!defined($$settings{sample}));
161 0   0       $$settings{verbose} ||= 0;
162              
163             # If the first parameter $seqfile is a Bio::SeqIO object,
164             # then send it to a file to dovetail with the rest of
165             # this module.
166 0 0 0       if(ref($seqfile) && $seqfile->isa("Bio::SeqIO")){
167             # BioPerl isn't required at compile tile but is
168             # required if the user starts with a BioPerl object.
169 0           eval {
170 0           require Bio::SeqIO;
171 0           require Bio::Seq::Quality;
172             };
173 0 0         if($@){
174 0           croak "ERROR: cannot load Bio::SeqIO and Bio::Seq::Quality, but I need to because you supplied a Bio::SeqIO object";
175             }
176 0           my $tempfile="$$settings{tempdir}/bioperl_input.fastq";
177 0           my $out=Bio::SeqIO->new(-file=>">".$tempfile);
178 0           while(my $seq=$seqfile->next_seq){
179 0   0       my $qual = $seq->qual || "I" x $seq->length;
180 0           my $seqWithQual = Bio::Seq::Quality->new(
181             # TODO preserve qual values if they exist, but
182             # for now, it doesn't really matter.
183             -qual=> $qual,
184             -seq => $seq->seq,
185             -id => $seq->id,
186             -verbose => -1,
187             );
188 0           $out->write_seq($seqWithQual);
189             }
190 0           $out->close; # make sure output is flushed
191              
192             # now redefine the seqfile as the new file on disk.
193 0           $seqfile=$tempfile;
194             }
195              
196             # Check if we have a valid sequence file path or at
197             # the very least, double check that the file path we just
198             # made from the Bio::SeqIO object is valid.
199 0 0         croak "ERROR: could not locate the sequence file $seqfile" if(!-e $seqfile);
200              
201             # Initialize the object and then bless it
202             my $self={
203             seqfile =>$seqfile,
204             kmerlength =>$$settings{kmerlength},
205             numcpus =>$$settings{numcpus},
206             tempdir =>$$settings{tempdir},
207             gt =>$$settings{gt},
208             kmercounter=>$$settings{kmercounter},
209             sample =>$$settings{sample},
210             verbose =>$$settings{verbose},
211              
212             # Jellyfish-specific
213 0           jellyfish =>scalar(which("jellyfish")),
214              
215             # Values that will be filled in after analysis
216             _kmers =>{}, # hash of kmer=>count
217             _hist =>[], # histogram. 0th element is counts of kmers occuring no times, 1st is elements occuring exactly once, etc
218             };
219             # Add in some other temporary files
220             #($$self{kmerfileFh},$$self{kmerfile}) = tempfile("KMER.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
221 0           ($$self{histfileFh},$$self{histfile}) = tempfile("HIST.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
222 0           ($$self{jellyfishdbFh},$$self{jellyfishdb})= tempfile("JF.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".jf");
223              
224             # Make some values lc
225 0           $$self{$_}=lc($$self{$_}) for(qw(kmercounter));
226              
227             # Set an exact parameter for the kmer counter
228 0 0         if($$self{kmercounter}=~ /(pure)?.*perl/){
    0          
229 0           $$self{kmercounter}="perl";
230             } elsif($self->{kmercounter} =~ /jellyfish/){
231 0           $$self{kmercounter}="jellyfish";
232             }
233              
234 0           bless($self);
235              
236 0           $self->count; # start off the kmer counting ASAP
237              
238 0           return $self;
239             }
240              
241             =pod
242              
243             =over
244              
245             =item $kmer->count()
246              
247             Count kmers. This method is called as soon as new() is called
248             and so you should never have to run this method.
249             Internally caches the kmer counts to ram.
250              
251             Arguments: None
252             Returns: None
253              
254             =back
255              
256             =cut
257              
258             sub count{
259 0     0 1   my($self)=@_;
260              
261 0           my $seqfile=$self->{seqfile};
262 0           my $kmerlength=$self->{kmerlength};
263              
264 0           my $kmerHash={};
265              
266 0 0         if($self->{kmercounter} eq "perl"){
    0          
267 0 0         if(!$iThreads){
268 0           croak("Warning: perl iThreads are not enabled in this version of perl, but you requested iThreads when running ". ref($self));
269             }
270 0           $self->countKmersPurePerl($seqfile,$kmerlength);
271             } elsif($self->{kmercounter} eq "jellyfish"){
272             # TODO make JF DB file $self->{jellyfishDb} and do not return
273             # a kmer count
274 0           $self->countKmersJellyfish($seqfile,$kmerlength);
275             } else {
276 0           croak "ERROR: I do not understand the kmer counter $self->{kmercounter}";
277             }
278              
279 0 0         if($$self{sample} < 1){
280              
281 0           my $sample = $$self{sample};
282 0           my $kmers = $self->kmers();
283            
284 0           my %subsampledKmers;
285 0           for my $kmer(keys(%$kmers)){
286 0           my $count = $$kmers{$kmer};
287 0           for(my $i=0;$i<$count;$i++){
288 0 0         if($sample >= rand(1)){
289 0           $subsampledKmers{$kmer}++;
290             }
291             }
292             }
293            
294 0           $self->clearCache();
295 0           $$self{_kmers} = \%subsampledKmers;
296             }
297             }
298              
299             =pod
300              
301             =over
302              
303             =item $kmer->clearCache
304              
305             Clears kmer counts and histogram counts. You should probably never use
306             this method.
307              
308             Arguments: None
309             Returns: None
310              
311             =back
312              
313             =cut
314              
315             sub clearCache{
316 0     0 1   my($self) = @_;
317              
318 0           $$self{_kmers} = {};
319 0           $$self{_hist} = [];
320             }
321              
322             =pod
323              
324             =over
325              
326             =item $kmer->query($queryString)
327              
328             Query the set of kmers with your own query
329              
330             Arguments: query (string)
331             Returns: Count of kmers.
332             0 indicates that the kmer was not found.
333             -1 indicates an invalid kmer (e.g., invalid length)
334              
335             =back
336              
337             =cut
338              
339             sub query{
340 0     0 1   my($self,$query)=@_;
341              
342 0 0         if(length($query) != $self->{kmerlength}){
343 0           return -1;
344             }
345              
346 0           my $count=0;
347 0 0         if($self->{kmercounter} eq "perl"){
    0          
348 0           my $kmers=$self->kmers();
349 0   0       $count=$$kmers{uc($query)} || 0;
350             } elsif($self->{kmercounter} eq "jellyfish"){
351 0 0         open(my $queryFh, "$$self{jellyfish} query ".$self->{jellyfishdb}." |") or croak "ERROR: could not run jellyfish query: $!";
352 0           my $db=$self->{jellyfishdb};
353 0           my $tmp=`jellyfish query $db $query`;
354 0 0         croak "ERROR: could not run jellyfish query" if $?;
355 0           chomp($tmp);
356 0           (undef,$count)=split(/\s+/,$tmp);
357             }
358            
359 0           return $count;
360             }
361              
362             =pod
363              
364             =over
365              
366             =item $kmer->histogram()
367              
368             Count the frequency of kmers. Internally caches the histogram to ram.
369              
370             Arguments: none
371             Returns: Reference to an array of counts. The index of
372             the array is the frequency.
373              
374             =back
375              
376             =cut
377              
378             sub histogram{
379 0     0 1   my($self)=@_;
380              
381 0 0         if(@{ $$self{_hist} } > 0){
  0            
382 0           return $$self{_hist};
383             }
384              
385 0 0         if($self->{kmercounter} eq "jellyfish"){
386 0           return $self->histogramJellyfish();
387             } else {
388 0           return $self->histogramPerl();
389             }
390             }
391              
392             sub histogramJellyfish{
393 0     0 0   my($self)=@_;
394            
395 0           close $self->{histfileFh};
396              
397             # Run jellyfish histo
398 0           my $jellyfishXopts = join(" ","-t", $self->{numcpus}, "-o", $self->{histfile}, $self->{jellyfishdb});
399 0           system("$$self{jellyfish} histo $jellyfishXopts");
400 0 0         croak "ERROR with jellyfish histo" if $?;
401            
402             # Read the output file
403 0           my @hist=(0);
404 0 0         open(my $fh, $self->{histfile}) or croak "ERROR: reading $self->{histfile}: $!";
405 0           while(<$fh>){
406 0           s/^\s+|\s+$//g;
407 0           my($count, $countOfCounts)=split /\s+/;
408 0           $hist[$count]=$countOfCounts;
409             }
410 0           close $fh;
411              
412             # Fill in gaps in the histogram
413 0           for(@hist){
414 0   0       $_||=0;
415             }
416              
417 0           $self->{_hist} = \@hist;
418 0           return \@hist;
419             }
420              
421             sub histogramPerl{
422 0     0 0   my($self)=@_;
423 0           my %hist=();
424              
425 0           my @hist=(0); # initialize the histogram with a count of zero kmers happening zero times
426             #$hist[0]=4**$self->{kmerlength}; # or maybe it should be initialized to the total number of possibilities.
427              
428 0           for my $kmercount(values(%{ $self->kmers() } )){
  0            
429 0           $hist{$kmercount}++;
430             }
431              
432             # Turn this hash into an array
433 0           for(1..max(keys(%hist))){
434 0   0       $hist[$_] = $hist{$_} || 0;
435             #$hist[0]=$hist[0] - $hist[$_]; # subtract off from the total space of possible kmers
436             }
437              
438 0           $self->{_hist}=\@hist;
439 0           return \@hist;
440             }
441              
442             # Pure perl to make this standalone... the only reason
443             # we are counting kmers in Perl instead of C.
444             sub countKmersPurePerl{
445 0     0 0   my($self,$seqfile,$kmerlength)=@_;
446              
447 0           my @allSeqs;
448              
449             # Save all seqs to an array, for passing out to individual threads.
450 0           my $fastqFh=$self->openFastq($seqfile);
451 0           my $i=0;
452 0           my @buffer=();
453 0           while(<$fastqFh>){ # burn the read ID line
454 0           $i++;
455 0           my $seq=<$fastqFh>;
456 0           push(@allSeqs, uc($seq));
457             # Burn the quality score lines
458 0           <$fastqFh>;
459 0           <$fastqFh>;
460             }
461 0           close $fastqFh;
462              
463             # The number of sequences per thread is divided evenly but cautions
464             # toward having one extra sequence per thread in the first threads
465             # rather than accidentally leaving some off at the end.
466 0           my $numSeqsPerThread = int(scalar(@allSeqs)/$self->{numcpus}) + 1;
467              
468             # Multithreading
469 0           my @thr;
470 0           for(0..$self->{numcpus}-1){
471             # Get a set of sequences to kmerize
472 0           my @threadSeqs = splice(@allSeqs, 0, $numSeqsPerThread);
473             # Set up a place for kmers to land
474 0           $thr[$_]=threads->new(\&_countKmersPurePerlWorker,$kmerlength,\@threadSeqs,$self->{sample});
475             #logmsg "Kicking off thread ".$thr[$_]->tid." with ".scalar(@threadSeqs)." sequences";
476             }
477            
478             # Join the threads and put everything into a large kmer hash
479 0           my %kmer;
480 0           for(@thr){
481 0           my $kmerArr = $_->join;
482 0           for my $kmer(@$kmerArr){
483 0           $kmer{$kmer}++;
484             }
485             }
486            
487 0           my($fh, $filename) = tempfile("KMER.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
488 0           ($$self{kmerfileFh},$$self{kmerfile}) = ($fh, $filename);
489              
490             # Write everything to file. The FH should still be open.
491             # Do not return the kmer.
492             # Make a new method that returns the kmer hash
493             # Do the same for jellyfish
494 0           while(my($kmer,$count)=each(%kmer)){
495             # Filtering step
496 0 0         if($count < $self->{gt}){
497             #delete($kmer{$kmer});
498 0           next;
499             }
500            
501 0           print $fh "$kmer\t$count\n";
502             }
503 0           close $fh;
504              
505 0           return 1;
506             }
507              
508             =pod
509              
510             =over
511              
512             =item $kmer->kmers
513              
514             Return actual kmers
515              
516             Arguments: None
517             Returns: Reference to a hash of kmers and their counts
518              
519             =back
520              
521             =cut
522              
523              
524             sub kmers{
525 0     0 1   my($self)=@_;
526              
527             # Look for the cached results before trying to read the file.
528 0 0         return $self->{_kmers} if(keys(%{ $self->{_kmers} }) > 0);
  0            
529              
530             # Dump the kmers to a tab delimited file if we are using
531             # jellyfish and the user invokes this method.
532 0 0         if($self->{kmercounter} eq "jellyfish"){
533 0           $self->_dumpKmersJellyfish();
534             }
535              
536 0           my %kmer;
537 0 0         open(my $fh, $self->{kmerfile}) or croak "ERROR: could not read the kmer file: $!";
538 0           while(<$fh>){
539 0           chomp;
540 0           my($kmer,$count)=split /\t/;
541 0           $kmer{$kmer}=$count;
542             }
543 0           close $fh;
544              
545 0           $self->{_kmers}=\%kmer;
546              
547 0           return \%kmer;
548             }
549              
550             =pod
551              
552             =over
553              
554             =item $kmer->union($kmer2)
555              
556             Finds the union between two sets of kmers
557              
558             Arguments: Another Bio::Kmer object
559             Returns: List of kmers
560              
561             =back
562              
563             =cut
564              
565             sub union{
566 0     0 1   my($self,$other)=@_;
567            
568 0 0         if(!$self->_checkCompatibility($other)){
569 0           croak "ERROR: two objects are not compatible in a union: ".ref($self)." / ".ref($other);
570             }
571              
572             # See what kmers are in common using hashes
573 0           my %union;
574 0           my $kmer1 = $self->kmers;
575 0           my $kmer2 = $other->kmers;
576 0           for my $kmer(keys(%{ $self->kmers }), keys(%{ $other->kmers })){
  0            
  0            
577 0           $union{$kmer}=1;
578             }
579              
580 0           return [keys(%union)];
581             }
582              
583             =pod
584              
585             =over
586              
587             =item $kmer->intersection($kmer2)
588              
589             Finds the intersection between two sets of kmers
590              
591             Arguments: Another Bio::Kmer object
592             Returns: List of kmers
593              
594             =back
595              
596             =cut
597              
598             sub intersection{
599 0     0 1   my($self,$other)=@_;
600              
601 0 0         if(!$self->_checkCompatibility($other)){
602 0           croak "ERROR: two objects are not compatible in an intersection: ".ref($self)." / ".ref($other);
603             }
604              
605 0           my @intersection;
606 0           my $kmer2 = $other->kmers;
607 0           for my $kmer(keys(%{ $self->kmers })){
  0            
608 0 0         if($$kmer2{$kmer}){
609 0           push(@intersection, $kmer);
610             }
611             }
612              
613 0           return \@intersection;
614             }
615              
616             =pod
617              
618             =over
619              
620             =item $kmer->subtract($kmer2)
621              
622             Finds the set of kmers unique to this Bio::Kmer object.
623              
624             Arguments: Another Bio::Kmer object
625             Returns: List of kmers
626              
627             =back
628              
629             =cut
630              
631             sub subtract{
632 0     0 1   my($self,$other)=@_;
633              
634 0 0         if(!$self->_checkCompatibility($other)){
635 0           croak "ERROR: trying to subtract two incompatible ".ref($self)." objects.";
636             }
637              
638 0           my %subtractKmers = %{ $self->kmers };
  0            
639 0           for my $kmer(keys(%{ $other->kmers })){
  0            
640 0           delete($subtractKmers{$kmer});
641             }
642            
643 0           return [keys(%subtractKmers)];
644             }
645            
646            
647              
648             # See if another Bio::Kmer is the same kind as this one.
649             # Return Boolean
650             sub _checkCompatibility{
651 0     0     my($self,$other)=@_;
652              
653 0 0         if($self->{kmerlength} != $other->{kmerlength}){
654 0 0         carp "WARNING: kmer lengths do not match\n" if($self->{verbose});
655 0           return 0;
656             }
657              
658 0           return 1;
659             }
660              
661             sub _countKmersPurePerlWorker{
662 0     0     my($kmerlength,$seqArr,$sample)=@_;
663              
664 0           my $outKmerArray=[];
665 0           for my $seq(@$seqArr){
666              
667 0           my $numKmersInRead=length($seq)-$kmerlength;
668              
669             # Count kmers in a sliding window.
670             # We must keep this loop optimized for speed.
671 0           for(my $j=0;$j<$numKmersInRead;$j++){
672             #next if($sample < rand(1)); # subsample
673             #$kmer{substr($seq,$j,$kmerlength)}++;
674 0           push(@$outKmerArray, substr($seq,$j,$kmerlength));
675             }
676              
677             }
678 0           return $outKmerArray;
679             }
680              
681              
682             sub countKmersJellyfish{
683 0     0 0   my($self,$seqfile,$kmerlength)=@_;
684 0           my $basename=basename($seqfile);
685              
686             # Version checking
687 0           my $jfVersion=`jellyfish --version`; chomp($jfVersion);
  0            
688             # e.g., jellyfish 2.2.6
689 0 0         if($jfVersion =~ /(jellyfish\s+)?(\d+)?/){
690 0           my $majorVersion=$2;
691 0 0         if($majorVersion < 2){
692 0           croak "ERROR: Jellyfish v2 or greater is required for ".ref($self);
693             }
694             }
695            
696 0           my $outfile=$self->{jellyfishdb};
697              
698             # Counting
699 0           my $jellyfishCountOptions="-s 10000000 -m $kmerlength -o $outfile -t $self->{numcpus}";
700 0           my $uncompressedFastq="$self->{tempdir}/$basename.fastq";
701 0           my $zcat = which("zcat");
702 0 0         if($seqfile=~/\.gz$/i){
703 0 0         if(!-e $zcat){
704 0           croak "ERROR: could not find zcat in PATH for uncompressing $seqfile";
705             }
706 0 0         system("$zcat \Q$seqfile\E > $uncompressedFastq"); croak "ERROR uncompressing $seqfile" if $?;
  0            
707 0           system("$$self{jellyfish} count $jellyfishCountOptions \Q$uncompressedFastq\E");
708             } else {
709 0           system("$$self{jellyfish} count $jellyfishCountOptions \Q$seqfile\E");
710             }
711 0           close $self->{jellyfishdbFh};
712 0 0         croak "Error: problem with jellyfish" if $?;
713             }
714              
715             sub _dumpKmersJellyfish{
716 0     0     my($self)=@_;
717 0           my $kmerTsv=$self->{kmerfile};
718 0           my $jfDb=$self->{jellyfishdb};
719              
720 0 0         return if(-s $kmerTsv > 0);
721              
722             # Dump the kmers to a tab-delimited file if it doesn't
723             # already have contents
724 0 0         if(-s $kmerTsv < 1){
725 0           my $lowerCount=$self->{gt}-1;
726 0           system("$$self{jellyfish} dump --lower-count=$lowerCount --column --tab -o \Q$kmerTsv\E \Q$jfDb\E");
727 0 0         croak "ERROR dumping kmers from jellyfish database $jfDb" if $?;
728             }
729             }
730              
731             # Opens a fastq file in a thread-safe way.
732             # Returns a file handle.
733             sub openFastq{
734 0     0 0   my($self,$fastq)=@_;
735              
736 0           my $fh;
737              
738 0           my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
739              
740 0 0         if(!grep(/$ext/,@fastqExt)){
741 0           croak "ERROR: could not read $fastq as a fastq file";
742             }
743              
744             # Open the file in different ways, depending on if it
745             # is gzipped or if the user has gzip installed.
746 0           lock($fhStick);
747 0 0         if($ext =~/\.gz$/){
748             # use binary gzip if we can... why not take advantage
749             # of the compiled binary's speedup?
750 0           my $gzip = which('gzip');
751 0 0         if(-e $gzip){
752 0 0         open($fh,"$gzip -cd \Q$fastq\E | ") or croak "ERROR: could not open $fastq for reading!: $!";
753             }else{
754 0 0         $fh=new IO::Uncompress::Gunzip($fastq) or croak "ERROR: could not read $fastq using native perl module IO::Uncompress::Gunzip: $!";
755             }
756             } else {
757 0 0         open($fh,"<",$fastq) or croak "ERROR: could not open $fastq for reading!: $!";
758             }
759              
760 0           return $fh;
761             }
762             # In case I accidentally do $Kmer->closeFastq without thinking
763             # how ridiculous that is, let's just avoid that problem with
764             # this subroutine.
765             sub closeFastq{
766 0     0 0   my($self,$fastq)=@_;
767 0           close $fastq;
768 0           return 1;
769             }
770              
771             =pod
772              
773             =over
774              
775             =item $kmer->close()
776              
777             Cleans the temporary directory and removes this object from
778             RAM. Good for when you might be counting kmers for many
779             things but want to keep your overhead low.
780              
781             Arguments: None
782             Returns: 1
783              
784             =back
785              
786             =cut
787              
788             sub close{
789 0     0 1   my($self)=@_;
790              
791 0           remove_tree($self->{tempdir});
792 0           undef($self);
793              
794 0           return 1;
795             }
796              
797             =pod
798              
799             =head1 COPYRIGHT AND LICENSE
800              
801             MIT license. Go nuts.
802              
803             =head1 AUTHOR
804              
805             Author: Lee Katz <lkatz@cdc.gov>
806              
807             For additional help, go to https://github.com/lskatz/Bio--Kmer
808              
809             CPAN module at http://search.cpan.org/~lskatz/Bio-Kmer/
810              
811             =for html <a href="https://travis-ci.org/lskatz/Bio--Kmer"><img src="https://travis-ci.org/lskatz/Bio--Kmer.svg?branch=master"></a>
812              
813             =cut
814              
815             1;