File Coverage

lib/Bio/Kmer.pm

Criterion	Covered	Total	%
statement	43	291	14.7
branch	1	96	1.0
condition	0	25	0.0
subroutine	14	34	41.1
pod	10	17	58.8
total	68	463	14.6

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2
3							# Kmer.pm: a kmer counting module
4							# Author: Lee Katz <lkatz@cdc.gov>
5
6							package Bio::Kmer;
7							require 5.10.0;
8							our $VERSION=0.26;
9
10	4			4		3353	use strict;
	4					9
	4					118
11	4			4		27	use warnings;
	4					8
	4					113
12
13	4			4		21	use List::Util qw/max/;
	4					9
	4					256
14	4			4		26	use File::Basename qw/basename fileparse dirname/;
	4					7
	4					238
15	4			4		708	use File::Temp qw/tempdir tempfile/;
	4					16294
	4					248
16	4			4		28	use File::Path qw/remove_tree/;
	4					9
	4					185
17	4			4		24	use Data::Dumper qw/Dumper/;
	4					7
	4					190
18	4			4		594	use IO::Uncompress::Gunzip;
	4					36904
	4					237
19	4			4		1457	use File::Which qw/which/;
	4					3542
	4					213
20	4			4		30	use Carp qw/croak carp confess/;
	4					8
	4					410
21
22							our $iThreads; # boolean for whether threads are loaded
23							BEGIN{
24	4			4		15	eval{
25	4					2257	require threads;
26	0					0	require threads::shared;
27	0					0	$iThreads = 1;
28							};
29	4	50				4258	if($@){
30	4					106	$iThreads = 0;
31							}
32							}
33
34	4			4		24	use Exporter qw/import/;
	4					9
	4					528
35							our @EXPORT_OK = qw(
36							);
37
38							our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
39							our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fa);
40							our @bamExt=qw(.sorted.bam .bam);
41							our @vcfExt=qw(.vcf.gz .vcf);
42							our @richseqExt=qw(.gbk .gbf .gb .embl);
43							our @sffExt=qw(.sff);
44							our @samExt=qw(.sam .bam);
45
46	4			4		50	our $fhStick :shared; # Helps us open only one file at a time
	4					8
	4					27
47	4			4		311	our $enqueueStick :shared; # Helps control access to the kmer queue
	4					8
	4					19
48
49
50							# TODO if 'die' is imported by a script, redefine
51							# sig die in that script as this function.
52							local $SIG{'__DIE__'} = sub { my $e = $_[0]; $e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/; die("$0: ".(caller(1))[3].": ".$e); };
53
54							my $startTime = time();
55							sub logmsg{
56	0			0	0		local $0 = basename $0;
57	0						my $tid = 0;
58	0	0					if($iThreads){
59	0						$tid = threads->tid;
60							}
61	0						my $elapsedTime = time() - $startTime;
62	0						print STDERR "$0.$tid $elapsedTime @_\n";
63							}
64
65							=pod
66
67							=head1 NAME
68
69							Bio::Kmer - Helper module for Kmer Analysis.
70
71							=head1 SYNOPSIS
72
73							A module for helping with kmer analysis.
74
75							use strict;
76							use warnings;
77							use Bio::Kmer;
78
79							my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});
80							my $kmerHash=$kmer->kmers();
81							my $countOfCounts=$kmer->histogram();
82
83							The BioPerl way
84
85							use strict;
86							use warnings;
87							use Bio::SeqIO;
88							use Bio::Kmer;
89
90							# Load up any Bio::SeqIO object. Quality values will be
91							# faked internally to help with compatibility even if
92							# a fastq file is given.
93							my $seqin = Bio::SeqIO->new(-file=>"input.fasta");
94							my $kmer=Bio::Kmer->new($seqin);
95							my $kmerHash=$kmer->kmers();
96							my $countOfCounts=$kmer->histogram();
97
98							=head1 DESCRIPTION
99
100							A module for helping with kmer analysis. The basic methods help count kmers and can produce a count of counts. Currently this module only supports fastq format. Although this module can count kmers with pure perl, it is recommended to give the option for a different kmer counter such as Jellyfish.
101
102							=pod
103
104							=head1 VARIABLES
105
106							=over
107
108							=item $Bio::Kmer::iThreads
109
110							Boolean describing whether the module instance is using threads
111
112							=back
113
114							=head1 METHODS
115
116							=over
117
118							=item Bio::Kmer->new($filename, \%options)
119
120							Create a new instance of the kmer counter. One object per file.
121
122							Filename can be either a file path or a Bio::SeqIO object.
123
124							Applicable arguments for \%options:
125							Argument Default Description
126							kmercounter perl What kmer counter software to use.
127							Choices: Perl, Jellyfish.
128							kmerlength\|k 21 Kmer length
129							numcpus 1 This module uses perl
130							multithreading with pure perl or
131							can supply this option to other
132							software like jellyfish.
133							gt 1 If the count of kmers is fewer
134							than this, ignore the kmer. This
135							might help speed analysis if you
136							do not care about low-count kmers.
137							sample 1 Retain only a percentage of kmers.
138							1 is 100%; 0 is 0%
139							Only works with the perl kmer counter.
140							verbose 0 Print more messages.
141
142							Examples:
143							my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});
144
145							=back
146
147							=cut
148
149							sub new{
150	0			0	1		my($class,$seqfile,$settings)=@_;
151
152	0	0					croak "ERROR: need a sequence file or a Bio::SeqIO object" if(!$seqfile);
153
154							# Set optional parameter defaults
155	0		0				$$settings{kmerlength} = $$settings{kmerlength} \|\| $$settings{k} \|\| 21;
156	0		0				$$settings{numcpus} \|\|= 1;
157	0		0				$$settings{gt} \|\|= 1;
158	0		0				$$settings{kmercounter} \|\|= "perl";
159	0		0				$$settings{tempdir} \|\|= tempdir("Kmer.pm.XXXXXX",TMPDIR=>1,CLEANUP=>1);
160	0	0					$$settings{sample} = 1 if(!defined($$settings{sample}));
161	0		0				$$settings{verbose} \|\|= 0;
162
163							# If the first parameter $seqfile is a Bio::SeqIO object,
164							# then send it to a file to dovetail with the rest of
165							# this module.
166	0	0	0				if(ref($seqfile) && $seqfile->isa("Bio::SeqIO")){
167							# BioPerl isn't required at compile tile but is
168							# required if the user starts with a BioPerl object.
169	0						eval {
170	0						require Bio::SeqIO;
171	0						require Bio::Seq::Quality;
172							};
173	0	0					if($@){
174	0						croak "ERROR: cannot load Bio::SeqIO and Bio::Seq::Quality, but I need to because you supplied a Bio::SeqIO object";
175							}
176	0						my $tempfile="$$settings{tempdir}/bioperl_input.fastq";
177	0						my $out=Bio::SeqIO->new(-file=>">".$tempfile);
178	0						while(my $seq=$seqfile->next_seq){
179	0		0				my $qual = $seq->qual \|\| "I" x $seq->length;
180	0						my $seqWithQual = Bio::Seq::Quality->new(
181							# TODO preserve qual values if they exist, but
182							# for now, it doesn't really matter.
183							-qual=> $qual,
184							-seq => $seq->seq,
185							-id => $seq->id,
186							-verbose => -1,
187							);
188	0						$out->write_seq($seqWithQual);
189							}
190	0						$out->close; # make sure output is flushed
191
192							# now redefine the seqfile as the new file on disk.
193	0						$seqfile=$tempfile;
194							}
195
196							# Check if we have a valid sequence file path or at
197							# the very least, double check that the file path we just
198							# made from the Bio::SeqIO object is valid.
199	0	0					croak "ERROR: could not locate the sequence file $seqfile" if(!-e $seqfile);
200
201							# Initialize the object and then bless it
202							my $self={
203							seqfile =>$seqfile,
204							kmerlength =>$$settings{kmerlength},
205							numcpus =>$$settings{numcpus},
206							tempdir =>$$settings{tempdir},
207							gt =>$$settings{gt},
208							kmercounter=>$$settings{kmercounter},
209							sample =>$$settings{sample},
210							verbose =>$$settings{verbose},
211
212							# Jellyfish-specific
213	0						jellyfish =>scalar(which("jellyfish")),
214
215							# Values that will be filled in after analysis
216							_kmers =>{}, # hash of kmer=>count
217							_hist =>[], # histogram. 0th element is counts of kmers occuring no times, 1st is elements occuring exactly once, etc
218							};
219							# Add in some other temporary files
220							#($$self{kmerfileFh},$$self{kmerfile}) = tempfile("KMER.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
221	0						($$self{histfileFh},$$self{histfile}) = tempfile("HIST.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
222	0						($$self{jellyfishdbFh},$$self{jellyfishdb})= tempfile("JF.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".jf");
223
224							# Make some values lc
225	0						$$self{$_}=lc($$self{$_}) for(qw(kmercounter));
226
227							# Set an exact parameter for the kmer counter
228	0	0					if($$self{kmercounter}=~ /(pure)?.*perl/){
		0
229	0						$$self{kmercounter}="perl";
230							} elsif($self->{kmercounter} =~ /jellyfish/){
231	0						$$self{kmercounter}="jellyfish";
232							}
233
234	0						bless($self);
235
236	0						$self->count; # start off the kmer counting ASAP
237
238	0						return $self;
239							}
240
241							=pod
242
243							=over
244
245							=item $kmer->count()
246
247							Count kmers. This method is called as soon as new() is called
248							and so you should never have to run this method.
249							Internally caches the kmer counts to ram.
250
251							Arguments: None
252							Returns: None
253
254							=back
255
256							=cut
257
258							sub count{
259	0			0	1		my($self)=@_;
260
261	0						my $seqfile=$self->{seqfile};
262	0						my $kmerlength=$self->{kmerlength};
263
264	0						my $kmerHash={};
265
266	0	0					if($self->{kmercounter} eq "perl"){
		0
267	0	0					if(!$iThreads){
268	0						croak("Warning: perl iThreads are not enabled in this version of perl, but you requested iThreads when running ". ref($self));
269							}
270	0						$self->countKmersPurePerl($seqfile,$kmerlength);
271							} elsif($self->{kmercounter} eq "jellyfish"){
272							# TODO make JF DB file $self->{jellyfishDb} and do not return
273							# a kmer count
274	0						$self->countKmersJellyfish($seqfile,$kmerlength);
275							} else {
276	0						croak "ERROR: I do not understand the kmer counter $self->{kmercounter}";
277							}
278
279	0	0					if($$self{sample} < 1){
280
281	0						my $sample = $$self{sample};
282	0						my $kmers = $self->kmers();
283
284	0						my %subsampledKmers;
285	0						for my $kmer(keys(%$kmers)){
286	0						my $count = $$kmers{$kmer};
287	0						for(my $i=0;$i<$count;$i++){
288	0	0					if($sample >= rand(1)){
289	0						$subsampledKmers{$kmer}++;
290							}
291							}
292							}
293
294	0						$self->clearCache();
295	0						$$self{_kmers} = \%subsampledKmers;
296							}
297							}
298
299							=pod
300
301							=over
302
303							=item $kmer->clearCache
304
305							Clears kmer counts and histogram counts. You should probably never use
306							this method.
307
308							Arguments: None
309							Returns: None
310
311							=back
312
313							=cut
314
315							sub clearCache{
316	0			0	1		my($self) = @_;
317
318	0						$$self{_kmers} = {};
319	0						$$self{_hist} = [];
320							}
321
322							=pod
323
324							=over
325
326							=item $kmer->query($queryString)
327
328							Query the set of kmers with your own query
329
330							Arguments: query (string)
331							Returns: Count of kmers.
332							0 indicates that the kmer was not found.
333							-1 indicates an invalid kmer (e.g., invalid length)
334
335							=back
336
337							=cut
338
339							sub query{
340	0			0	1		my($self,$query)=@_;
341
342	0	0					if(length($query) != $self->{kmerlength}){
343	0						return -1;
344							}
345
346	0						my $count=0;
347	0	0					if($self->{kmercounter} eq "perl"){
		0
348	0						my $kmers=$self->kmers();
349	0		0				$count=$$kmers{uc($query)} \|\| 0;
350							} elsif($self->{kmercounter} eq "jellyfish"){
351	0	0					open(my $queryFh, "$$self{jellyfish} query ".$self->{jellyfishdb}." \|") or croak "ERROR: could not run jellyfish query: $!";
352	0						my $db=$self->{jellyfishdb};
353	0						my $tmp=`jellyfish query $db $query`;
354	0	0					croak "ERROR: could not run jellyfish query" if $?;
355	0						chomp($tmp);
356	0						(undef,$count)=split(/\s+/,$tmp);
357							}
358
359	0						return $count;
360							}
361
362							=pod
363
364							=over
365
366							=item $kmer->histogram()
367
368							Count the frequency of kmers. Internally caches the histogram to ram.
369
370							Arguments: none
371							Returns: Reference to an array of counts. The index of
372							the array is the frequency.
373
374							=back
375
376							=cut
377
378							sub histogram{
379	0			0	1		my($self)=@_;
380
381	0	0					if(@{ $$self{_hist} } > 0){
	0
382	0						return $$self{_hist};
383							}
384
385	0	0					if($self->{kmercounter} eq "jellyfish"){
386	0						return $self->histogramJellyfish();
387							} else {
388	0						return $self->histogramPerl();
389							}
390							}
391
392							sub histogramJellyfish{
393	0			0	0		my($self)=@_;
394
395	0						close $self->{histfileFh};
396
397							# Run jellyfish histo
398	0						my $jellyfishXopts = join(" ","-t", $self->{numcpus}, "-o", $self->{histfile}, $self->{jellyfishdb});
399	0						system("$$self{jellyfish} histo $jellyfishXopts");
400	0	0					croak "ERROR with jellyfish histo" if $?;
401
402							# Read the output file
403	0						my @hist=(0);
404	0	0					open(my $fh, $self->{histfile}) or croak "ERROR: reading $self->{histfile}: $!";
405	0						while(<$fh>){
406	0						s/^\s+\|\s+$//g;
407	0						my($count, $countOfCounts)=split /\s+/;
408	0						$hist[$count]=$countOfCounts;
409							}
410	0						close $fh;
411
412							# Fill in gaps in the histogram
413	0						for(@hist){
414	0		0				$_\|\|=0;
415							}
416
417	0						$self->{_hist} = \@hist;
418	0						return \@hist;
419							}
420
421							sub histogramPerl{
422	0			0	0		my($self)=@_;
423	0						my %hist=();
424
425	0						my @hist=(0); # initialize the histogram with a count of zero kmers happening zero times
426							#$hist[0]=4**$self->{kmerlength}; # or maybe it should be initialized to the total number of possibilities.
427
428	0						for my $kmercount(values(%{ $self->kmers() } )){
	0
429	0						$hist{$kmercount}++;
430							}
431
432							# Turn this hash into an array
433	0						for(1..max(keys(%hist))){
434	0		0				$hist[$_] = $hist{$_} \|\| 0;
435							#$hist[0]=$hist[0] - $hist[$_]; # subtract off from the total space of possible kmers
436							}
437
438	0						$self->{_hist}=\@hist;
439	0						return \@hist;
440							}
441
442							# Pure perl to make this standalone... the only reason
443							# we are counting kmers in Perl instead of C.
444							sub countKmersPurePerl{
445	0			0	0		my($self,$seqfile,$kmerlength)=@_;
446
447	0						my @allSeqs;
448
449							# Save all seqs to an array, for passing out to individual threads.
450	0						my $fastqFh=$self->openFastq($seqfile);
451	0						my $i=0;
452	0						my @buffer=();
453	0						while(<$fastqFh>){ # burn the read ID line
454	0						$i++;
455	0						my $seq=<$fastqFh>;
456	0						push(@allSeqs, uc($seq));
457							# Burn the quality score lines
458	0						<$fastqFh>;
459	0						<$fastqFh>;
460							}
461	0						close $fastqFh;
462
463							# The number of sequences per thread is divided evenly but cautions
464							# toward having one extra sequence per thread in the first threads
465							# rather than accidentally leaving some off at the end.
466	0						my $numSeqsPerThread = int(scalar(@allSeqs)/$self->{numcpus}) + 1;
467
468							# Multithreading
469	0						my @thr;
470	0						for(0..$self->{numcpus}-1){
471							# Get a set of sequences to kmerize
472	0						my @threadSeqs = splice(@allSeqs, 0, $numSeqsPerThread);
473							# Set up a place for kmers to land
474	0						$thr[$_]=threads->new(\&_countKmersPurePerlWorker,$kmerlength,\@threadSeqs,$self->{sample});
475							#logmsg "Kicking off thread ".$thr[$_]->tid." with ".scalar(@threadSeqs)." sequences";
476							}
477
478							# Join the threads and put everything into a large kmer hash
479	0						my %kmer;
480	0						for(@thr){
481	0						my $kmerArr = $_->join;
482	0						for my $kmer(@$kmerArr){
483	0						$kmer{$kmer}++;
484							}
485							}
486
487	0						my($fh, $filename) = tempfile("KMER.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
488	0						($$self{kmerfileFh},$$self{kmerfile}) = ($fh, $filename);
489
490							# Write everything to file. The FH should still be open.
491							# Do not return the kmer.
492							# Make a new method that returns the kmer hash
493							# Do the same for jellyfish
494	0						while(my($kmer,$count)=each(%kmer)){
495							# Filtering step
496	0	0					if($count < $self->{gt}){
497							#delete($kmer{$kmer});
498	0						next;
499							}
500
501	0						print $fh "$kmer\t$count\n";
502							}
503	0						close $fh;
504
505	0						return 1;
506							}
507
508							=pod
509
510							=over
511
512							=item $kmer->kmers
513
514							Return actual kmers
515
516							Arguments: None
517							Returns: Reference to a hash of kmers and their counts
518
519							=back
520
521							=cut
522
523
524							sub kmers{
525	0			0	1		my($self)=@_;
526
527							# Look for the cached results before trying to read the file.
528	0	0					return $self->{_kmers} if(keys(%{ $self->{_kmers} }) > 0);
	0
529
530							# Dump the kmers to a tab delimited file if we are using
531							# jellyfish and the user invokes this method.
532	0	0					if($self->{kmercounter} eq "jellyfish"){
533	0						$self->_dumpKmersJellyfish();
534							}
535
536	0						my %kmer;
537	0	0					open(my $fh, $self->{kmerfile}) or croak "ERROR: could not read the kmer file: $!";
538	0						while(<$fh>){
539	0						chomp;
540	0						my($kmer,$count)=split /\t/;
541	0						$kmer{$kmer}=$count;
542							}
543	0						close $fh;
544
545	0						$self->{_kmers}=\%kmer;
546
547	0						return \%kmer;
548							}
549
550							=pod
551
552							=over
553
554							=item $kmer->union($kmer2)
555
556							Finds the union between two sets of kmers
557
558							Arguments: Another Bio::Kmer object
559							Returns: List of kmers
560
561							=back
562
563							=cut
564
565							sub union{
566	0			0	1		my($self,$other)=@_;
567
568	0	0					if(!$self->_checkCompatibility($other)){
569	0						croak "ERROR: two objects are not compatible in a union: ".ref($self)." / ".ref($other);
570							}
571
572							# See what kmers are in common using hashes
573	0						my %union;
574	0						my $kmer1 = $self->kmers;
575	0						my $kmer2 = $other->kmers;
576	0						for my $kmer(keys(%{ $self->kmers }), keys(%{ $other->kmers })){
	0
	0
577	0						$union{$kmer}=1;
578							}
579
580	0						return [keys(%union)];
581							}
582
583							=pod
584
585							=over
586
587							=item $kmer->intersection($kmer2)
588
589							Finds the intersection between two sets of kmers
590
591							Arguments: Another Bio::Kmer object
592							Returns: List of kmers
593
594							=back
595
596							=cut
597
598							sub intersection{
599	0			0	1		my($self,$other)=@_;
600
601	0	0					if(!$self->_checkCompatibility($other)){
602	0						croak "ERROR: two objects are not compatible in an intersection: ".ref($self)." / ".ref($other);
603							}
604
605	0						my @intersection;
606	0						my $kmer2 = $other->kmers;
607	0						for my $kmer(keys(%{ $self->kmers })){
	0
608	0	0					if($$kmer2{$kmer}){
609	0						push(@intersection, $kmer);
610							}
611							}
612
613	0						return \@intersection;
614							}
615
616							=pod
617
618							=over
619
620							=item $kmer->subtract($kmer2)
621
622							Finds the set of kmers unique to this Bio::Kmer object.
623
624							Arguments: Another Bio::Kmer object
625							Returns: List of kmers
626
627							=back
628
629							=cut
630
631							sub subtract{
632	0			0	1		my($self,$other)=@_;
633
634	0	0					if(!$self->_checkCompatibility($other)){
635	0						croak "ERROR: trying to subtract two incompatible ".ref($self)." objects.";
636							}
637
638	0						my %subtractKmers = %{ $self->kmers };
	0
639	0						for my $kmer(keys(%{ $other->kmers })){
	0
640	0						delete($subtractKmers{$kmer});
641							}
642
643	0						return [keys(%subtractKmers)];
644							}
645
646
647
648							# See if another Bio::Kmer is the same kind as this one.
649							# Return Boolean
650							sub _checkCompatibility{
651	0			0			my($self,$other)=@_;
652
653	0	0					if($self->{kmerlength} != $other->{kmerlength}){
654	0	0					carp "WARNING: kmer lengths do not match\n" if($self->{verbose});
655	0						return 0;
656							}
657
658	0						return 1;
659							}
660
661							sub _countKmersPurePerlWorker{
662	0			0			my($kmerlength,$seqArr,$sample)=@_;
663
664	0						my $outKmerArray=[];
665	0						for my $seq(@$seqArr){
666
667	0						my $numKmersInRead=length($seq)-$kmerlength;
668
669							# Count kmers in a sliding window.
670							# We must keep this loop optimized for speed.
671	0						for(my $j=0;$j<$numKmersInRead;$j++){
672							#next if($sample < rand(1)); # subsample
673							#$kmer{substr($seq,$j,$kmerlength)}++;
674	0						push(@$outKmerArray, substr($seq,$j,$kmerlength));
675							}
676
677							}
678	0						return $outKmerArray;
679							}
680
681
682							sub countKmersJellyfish{
683	0			0	0		my($self,$seqfile,$kmerlength)=@_;
684	0						my $basename=basename($seqfile);
685
686							# Version checking
687	0						my $jfVersion=`jellyfish --version`; chomp($jfVersion);
	0
688							# e.g., jellyfish 2.2.6
689	0	0					if($jfVersion =~ /(jellyfish\s+)?(\d+)?/){
690	0						my $majorVersion=$2;
691	0	0					if($majorVersion < 2){
692	0						croak "ERROR: Jellyfish v2 or greater is required for ".ref($self);
693							}
694							}
695
696	0						my $outfile=$self->{jellyfishdb};
697
698							# Counting
699	0						my $jellyfishCountOptions="-s 10000000 -m $kmerlength -o $outfile -t $self->{numcpus}";
700	0						my $uncompressedFastq="$self->{tempdir}/$basename.fastq";
701	0						my $zcat = which("zcat");
702	0	0					if($seqfile=~/\.gz$/i){
703	0	0					if(!-e $zcat){
704	0						croak "ERROR: could not find zcat in PATH for uncompressing $seqfile";
705							}
706	0	0					system("$zcat \Q$seqfile\E > $uncompressedFastq"); croak "ERROR uncompressing $seqfile" if $?;
	0
707	0						system("$$self{jellyfish} count $jellyfishCountOptions \Q$uncompressedFastq\E");
708							} else {
709	0						system("$$self{jellyfish} count $jellyfishCountOptions \Q$seqfile\E");
710							}
711	0						close $self->{jellyfishdbFh};
712	0	0					croak "Error: problem with jellyfish" if $?;
713							}
714
715							sub _dumpKmersJellyfish{
716	0			0			my($self)=@_;
717	0						my $kmerTsv=$self->{kmerfile};
718	0						my $jfDb=$self->{jellyfishdb};
719
720	0	0					return if(-s $kmerTsv > 0);
721
722							# Dump the kmers to a tab-delimited file if it doesn't
723							# already have contents
724	0	0					if(-s $kmerTsv < 1){
725	0						my $lowerCount=$self->{gt}-1;
726	0						system("$$self{jellyfish} dump --lower-count=$lowerCount --column --tab -o \Q$kmerTsv\E \Q$jfDb\E");
727	0	0					croak "ERROR dumping kmers from jellyfish database $jfDb" if $?;
728							}
729							}
730
731							# Opens a fastq file in a thread-safe way.
732							# Returns a file handle.
733							sub openFastq{
734	0			0	0		my($self,$fastq)=@_;
735
736	0						my $fh;
737
738	0						my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
739
740	0	0					if(!grep(/$ext/,@fastqExt)){
741	0						croak "ERROR: could not read $fastq as a fastq file";
742							}
743
744							# Open the file in different ways, depending on if it
745							# is gzipped or if the user has gzip installed.
746	0						lock($fhStick);
747	0	0					if($ext =~/\.gz$/){
748							# use binary gzip if we can... why not take advantage
749							# of the compiled binary's speedup?
750	0						my $gzip = which('gzip');
751	0	0					if(-e $gzip){
752	0	0					open($fh,"$gzip -cd \Q$fastq\E \| ") or croak "ERROR: could not open $fastq for reading!: $!";
753							}else{
754	0	0					$fh=new IO::Uncompress::Gunzip($fastq) or croak "ERROR: could not read $fastq using native perl module IO::Uncompress::Gunzip: $!";
755							}
756							} else {
757	0	0					open($fh,"<",$fastq) or croak "ERROR: could not open $fastq for reading!: $!";
758							}
759
760	0						return $fh;
761							}
762							# In case I accidentally do $Kmer->closeFastq without thinking
763							# how ridiculous that is, let's just avoid that problem with
764							# this subroutine.
765							sub closeFastq{
766	0			0	0		my($self,$fastq)=@_;
767	0						close $fastq;
768	0						return 1;
769							}
770
771							=pod
772
773							=over
774
775							=item $kmer->close()
776
777							Cleans the temporary directory and removes this object from
778							RAM. Good for when you might be counting kmers for many
779							things but want to keep your overhead low.
780
781							Arguments: None
782							Returns: 1
783
784							=back
785
786							=cut
787
788							sub close{
789	0			0	1		my($self)=@_;
790
791	0						remove_tree($self->{tempdir});
792	0						undef($self);
793
794	0						return 1;
795							}
796
797							=pod
798
799							=head1 COPYRIGHT AND LICENSE
800
801							MIT license. Go nuts.
802
803							=head1 AUTHOR
804
805							Author: Lee Katz <lkatz@cdc.gov>
806
807							For additional help, go to https://github.com/lskatz/Bio--Kmer
808
809							CPAN module at http://search.cpan.org/~lskatz/Bio-Kmer/
810
811							=for html <a href="https://travis-ci.org/lskatz/Bio--Kmer"><img src="https://travis-ci.org/lskatz/Bio--Kmer.svg?branch=master"></a>
812
813							=cut
814
815							1;