File Coverage

lib/Bio/Kmer.pm

Criterion	Covered	Total	%
statement	25	27	92.5
branch			n/a
condition			n/a
subroutine	9	9	100.0
pod			n/a
total	34	36	94.4

line	stmt	sub	time	code
1				#!/usr/bin/env perl
2
3				# Kmer.pm: a kmer counting module
4				# Author: Lee Katz
5
6				package Bio::Kmer;
7				require 5.10.0;
8				our $VERSION=0.21;
9
10	2	2	1622	use strict;
	2		5
	2		64
11	2	2	12	use warnings;
	2		5
	2		78
12
13	2	2	12	use List::Util qw/max/;
	2		4
	2		212
14	2	2	13	use File::Basename qw/basename fileparse dirname/;
	2		4
	2		135
15	2	2	481	use File::Temp qw/tempdir tempfile/;
	2		17396
	2		166
16	2	2	19	use File::Path qw/remove_tree/;
	2		6
	2		129
17	2	2	427	use Data::Dumper qw/Dumper/;
	2		5628
	2		258
18	2	2	543	use IO::Uncompress::Gunzip;
	2		29381
	2		118
19
20	2	2	918	use threads;
	0
	0
21				use threads::shared;
22				use Thread::Queue;
23
24				use Exporter qw/import/;
25				our @EXPORT_OK = qw(
26				);
27
28				our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
29				our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fa);
30				our @bamExt=qw(.sorted.bam .bam);
31				our @vcfExt=qw(.vcf.gz .vcf);
32				our @richseqExt=qw(.gbk .gbf .gb .embl);
33				our @sffExt=qw(.sff);
34				our @samExt=qw(.sam .bam);
35
36				our $fhStick :shared; # Helps us open only one file at a time
37
38
39				# TODO if 'die' is imported by a script, redefine
40				# sig die in that script as this function.
41				local $SIG{'__DIE__'} = sub { my $e = $_[0]; $e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/; die("$0: ".(caller(1))[3].": ".$e); };
42
43				=pod
44
45				=head1 NAME
46
47				Bio::Kmer
48
49				=head1 SYNOPSIS
50
51				A module for helping with kmer analysis.
52
53				use strict;
54				use warnings;
55				use Bio::Kmer;
56
57				my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});
58				my $kmerHash=$kmer->kmers();
59				my $countOfCounts=$kmer->histogram();
60
61				The BioPerl way
62
63				use strict;
64				use warnings;
65				use Bio::SeqIO;
66				use Bio::Kmer;
67
68				# Load up any Bio::SeqIO object. Quality values will be
69				# faked internally to help with compatibility even if
70				# a fastq file is given.
71				my $seqin = Bio::SeqIO->new(-file=>"input.fasta");
72				my $kmer=Bio::Kmer->new($seqin);
73				my $kmerHash=$kmer->kmers();
74				my $countOfCounts=$kmer->histogram();
75
76				=head1 DESCRIPTION
77
78				A module for helping with kmer analysis. The basic methods help count kmers and can produce a count of counts. Currently this module only supports fastq format. Although this module can count kmers with pure perl, it is recommended to give the option for a different kmer counter such as Jellyfish.
79
80				=pod
81
82				=head1 METHODS
83
84				=over
85
86				=item Bio::Kmer->new($filename, \%options)
87
88				Create a new instance of the kmer counter. One object per file.
89
90				Applicable arguments:
91				Argument Default Description
92				kmercounter perl What kmer counter software to use.
93				Choices: Perl, Jellyfish.
94				kmerlength 21 Kmer length
95				numcpus 1 This module uses perl
96				multithreading with pure perl or
97				can supply this option to other
98				software like jellyfish.
99				gt 1 If the count of kmers is fewer
100				than this, ignore the kmer. This
101				might help speed analysis if you
102				do not care about low-count kmers.
103				sample 1 Retain only a percentage of kmers.
104				1 is 100%; 0 is 0%
105				Only works with the perl kmer counter.
106
107				Examples:
108				my $kmer=Bio::Kmer->new("file.fastq.gz",{kmercounter=>"jellyfish",numcpus=>4});
109
110				=back
111
112				=cut
113
114				sub new{
115				my($class,$seqfile,$settings)=@_;
116
117				die "ERROR: need a sequence file or a Bio::SeqIO object" if(!$seqfile);
118
119				# Set optional parameter defaults
120				$$settings{kmerlength} \|\|=21;
121				$$settings{numcpus} \|\|=1;
122				$$settings{gt} \|\|=1;
123				$$settings{kmercounter} \|\|="perl";
124				$$settings{tempdir} \|\|=tempdir("Kmer.pm.XXXXXX",TMPDIR=>1,CLEANUP=>1);
125				$$settings{sample} =1 if(!defined($$settings{sample}));
126
127				# If the first parameter $seqfile is a Bio::SeqIO object,
128				# then send it to a file to dovetail with the rest of
129				# this module.
130				if(ref($seqfile) && $seqfile->isa("Bio::SeqIO")){
131				# BioPerl isn't required at compile tile but is
132				# required if the user starts with a BioPerl object.
133				eval {
134				require Bio::SeqIO;
135				require Bio::Seq::Quality;
136				};
137				if($@){
138				die "ERROR: cannot load Bio::SeqIO and Bio::Seq::Quality, but you supplied a Bio::SeqIO object";
139				}
140				my $tempfile="$$settings{tempdir}/bioperl_input.fastq";
141				my $out=Bio::SeqIO->new(-file=>">".$tempfile);
142				while(my $seq=$seqfile->next_seq){
143				my $seqWithQual = Bio::Seq::Quality->new(
144				# TODO preserve qual values if they exist, but
145				# for now, it doesn't really matter.
146				-qual=> "I" x $seq->length,
147				-seq => $seq->seq,
148				-id => $seq->id,
149				-verbose => -1,
150				);
151				$out->write_seq($seqWithQual);
152				}
153				$out->close; # make sure output is flushed
154
155				# now redefine the seqfile as the new file on disk.
156				$seqfile=$tempfile;
157				}
158
159				# Check if we have a valid sequence file path or at
160				# the very least, double check that the file path we just
161				# made from the Bio::SeqIO object is valid.
162				die "ERROR: could not locate the sequence file $seqfile" if(!-e $seqfile);
163
164				# Initialize the object and then bless it
165				my $self={
166				seqfile =>$seqfile,
167				kmerlength =>$$settings{kmerlength},
168				numcpus =>$$settings{numcpus},
169				tempdir =>$$settings{tempdir},
170				gt =>$$settings{gt},
171				kmercounter=>$$settings{kmercounter},
172				sample =>$$settings{sample},
173
174				# Values that will be filled in after analysis
175				_kmers =>{},
176				};
177				# Add in some other temporary files
178				($$self{kmerfileFh},$$self{kmerfile}) = tempfile("KMER.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
179				($$self{histfileFh},$$self{histfile}) = tempfile("HIST.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".tsv");
180				($$self{jellyfishdbFh},$$self{jellyfishdb})= tempfile("JF.XXXXXX", DIR=>$$self{tempdir}, SUFFIX=>".jf");
181
182				# Make some values lc
183				$$self{$_}=lc($$self{$_}) for(qw(kmercounter));
184
185				# Set an exact parameter for the kmer counter
186				if($$self{kmercounter}=~ /(pure)?.*perl/){
187				$$self{kmercounter}="perl";
188				} elsif($self->{kmercounter} =~ /jellyfish/){
189				$$self{kmercounter}="jellyfish";
190				}
191
192				bless($self);
193
194				$self->count; # start off the kmer counting ASAP
195
196				return $self;
197				}
198
199				# Count kmers with faster programs in this order of
200				# priority: jellyfish (TODO: KAnalyze)
201				# and lastly, pure perl.
202				sub count{
203				my($self)=@_;
204
205				my $seqfile=$self->{seqfile};
206				my $kmerlength=$self->{kmerlength};
207
208				my $kmerHash={};
209
210				if($self->{kmercounter} eq "perl"){
211				$self->countKmersPurePerl($seqfile,$kmerlength);
212				} elsif($self->{kmercounter} eq "jellyfish"){
213				# TODO make JF DB file $self->{jellyfishDb} and do not return
214				# a kmer count
215				$self->countKmersJellyfish($seqfile,$kmerlength);
216				} else {
217				die "ERROR: I do not understand the kmer counter $self->{kmercounter}";
218				}
219				}
220
221				=pod
222
223				=over
224
225				=item $kmer->query($queryString)
226
227				Query the set of kmers with your own query
228
229				Arguments: query (string)
230				Returns: Count of kmers.
231				0 indicates that the kmer was not found.
232				-1 indicates an invalid kmer (e.g., invalid length)
233
234				=back
235
236				=cut
237
238				sub query{
239				my($self,$query)=@_;
240
241				if(length($query) != $self->{kmerlength}){
242				return -1;
243				}
244
245				my $count=0;
246				if($self->{kmercounter} eq "perl"){
247				my $kmers=$self->kmers();
248				$count=$$kmers{uc($query)} \|\| 0;
249				} elsif($self->{kmercounter} eq "jellyfish"){
250				open(my $queryFh, "jellyfish query ".$self->{jellyfishdb}." \|") or die "ERROR: could not run jellyfish query: $!";
251				my $db=$self->{jellyfishdb};
252				my $tmp=`jellyfish query $db $query`;
253				die "ERROR: could not run jellyfish query" if $?;
254				chomp($tmp);
255				(undef,$count)=split(/\s+/,$tmp);
256				}
257
258				return $count;
259				}
260
261				=pod
262
263				=over
264
265				=item $kmer->histogram()
266
267				Count the frequency of kmers.
268
269				Arguments: none
270				Returns: Reference to an array of counts. The index of
271				the array is the frequency.
272
273				=back
274
275				=cut
276
277				sub histogram{
278				my($self)=@_;
279
280				if($self->{kmercounter} eq "jellyfish"){
281				return $self->histogramJellyfish();
282				} else {
283				return $self->histogramPerl();
284				}
285				}
286
287				sub histogramJellyfish{
288				my($self)=@_;
289
290				close $self->{histfileFh};
291
292				# Run jellyfish histo
293				my $jellyfishXopts = join(" ","-t", $self->{numcpus}, "-o", $self->{histfile}, $self->{jellyfishdb});
294				system("jellyfish histo $jellyfishXopts");
295				die "ERROR with jellyfish histo" if $?;
296
297				# Read the output file
298				my @hist=(0);
299				open(my $fh, $self->{histfile}) or die "ERROR: reading $self->{histfile}: $!";
300				while(<$fh>){
301				s/^\s+\|\s+$//g;
302				my($count, $countOfCounts)=split /\s+/;
303				$hist[$count]=$countOfCounts;
304				}
305				close $fh;
306
307				# Fill in gaps in the histogram
308				for(@hist){
309				$_\|\|=0;
310				}
311
312				return \@hist;
313				}
314
315				sub histogramPerl{
316				my($self)=@_;
317				my %hist=();
318
319				my @hist=(0); # initialize the histogram with a count of zero kmers happening zero times
320				#$hist[0]=4**$self->{kmerlength}; # or maybe it should be initialized to the total number of possibilities.
321
322				for my $kmercount(values(%{ $self->kmers() } )){
323				$hist{$kmercount}++;
324				}
325
326				# Turn this hash into an array
327				for(1..max(keys(%hist))){
328				$hist[$_] = $hist{$_} \|\| 0;
329				#$hist[0]=$hist[0] - $hist[$_]; # subtract off from the total space of possible kmers
330				}
331
332				$self->{hist}=\@hist;
333				return \@hist;
334				}
335
336				sub countKmersPurePerl{
337				my($self,$seqfile,$kmerlength)=@_;
338
339				# Multithreading
340				my $seqQ=Thread::Queue->new;
341				my @thr;
342				for(0..$self->{numcpus}-1){
343				$thr[$_]=threads->new(\&_countKmersPurePerlWorker,$kmerlength,$seqQ,$self->{sample});
344				}
345
346				# Pure perl to make this standalone... the only reason
347				# we are counting kmers in Perl instead of C.
348				my $fastqFh=$self->openFastq($seqfile);
349				my $i=0;
350				my @buffer=();
351				while(<$fastqFh>){ # burn the read ID line
352				$i++;
353				my $seq=<$fastqFh>;
354				push(@buffer, uc($seq));
355
356				if($i % 1000000 == 0){
357				$seqQ->enqueue(@buffer);
358				@buffer=();
359				}
360
361				# Burn the quality score lines
362				<$fastqFh>;
363				<$fastqFh>;
364				}
365				close $fastqFh;
366
367				$seqQ->enqueue(@buffer);
368
369				# Send the termination signal
370				$seqQ->enqueue(undef) for(@thr);
371
372				while($seqQ->pending > @thr){
373				for(1..60){
374				last if($seqQ->pending <= @thr);
375				sleep 1;
376				}
377				}
378
379				# Join the threads and put everything into a large kmer hash
380				my %kmer=();
381				for(@thr){
382				my $threadKmer=$_->join;
383				for my $kmer(keys(%$threadKmer)){
384				$kmer{$kmer}+=$$threadKmer{$kmer};
385				}
386				}
387
388				# Write everything to file. The FH should still be open.
389				# Do not return the kmer.
390				# Make a new method that returns the kmer hash
391				# Do the same for jellyfish
392				my $fh=$self->{kmerfileFh};
393				while(my($kmer,$count)=each(%kmer)){
394				# Filtering step
395				if($count < $self->{gt}){
396				#delete($kmer{$kmer});
397				next;
398				}
399
400				print $fh "$kmer\t$count\n";
401				}
402				close $fh;
403
404				return 1;
405				}
406
407				=pod
408
409				=over
410
411				=item $kmer->kmers
412
413				Return actual kmers
414
415				Arguments: None
416				Returns: Reference to a hash of kmers and their counts
417
418				=back
419
420				=cut
421
422
423				sub kmers{
424				my($self)=@_;
425
426				return $self->{_kmers} if(keys(%{ $self->{_kmers} }) > 0);
427
428				# Dump the kmers to a tab delimited file if we are using
429				# jellyfish and the user invokes this method.
430				if($self->{kmercounter} eq "jellyfish"){
431				$self->_dumpKmersJellyfish();
432				}
433
434				my %kmer;
435				open(my $fh, $self->{kmerfile}) or die "ERROR: could not read the kmer file: $!";
436				while(<$fh>){
437				chomp;
438				my($kmer,$count)=split /\t/;
439				$kmer{$kmer}=$count;
440				}
441				close $fh;
442
443				$self->{_kmers}=\%kmer;
444
445				return \%kmer;
446				}
447
448				=pod
449
450				=over
451
452				=item $kmer->union($kmer2)
453
454				Finds the union between two sets of kmers
455
456				Arguments: Another Bio::Kmer object
457				Returns: List of kmers
458
459				=back
460
461				=cut
462
463				sub union{
464				my($self,$other)=@_;
465
466				if(!$self->_checkCompatibility($other,{verbose=>1})){
467				die;
468				}
469
470				# See what kmers are in common using hashes
471				my %union;
472				my $kmer1 = $self->kmers;
473				my $kmer2 = $other->kmers;
474				for my $kmer(keys(%{ $self->kmers }), keys(%{ $other->kmers })){
475				$union{$kmer}=1;
476				}
477
478				return [keys(%union)];
479				}
480
481				=pod
482
483				=over
484
485				=item $kmer->intersection($kmer2)
486
487				Finds the intersection between two sets of kmers
488
489				Arguments: Another Bio::Kmer object
490				Returns: List of kmers
491
492				=back
493
494				=cut
495
496				sub intersection{
497				my($self,$other)=@_;
498
499				if(!$self->_checkCompatibility($other,{verbose=>1})){
500				die;
501				}
502
503				my @intersection;
504				my $kmer2 = $other->kmers;
505				for my $kmer(keys(%{ $self->kmers })){
506				if($$kmer2{$kmer}){
507				push(@intersection, $kmer);
508				}
509				}
510
511				return \@intersection;
512				}
513
514				=pod
515
516				=over
517
518				=item $kmer->subtract($kmer2)
519
520				Finds the set of kmers unique to this Bio::Kmer object.
521
522				Arguments: Another Bio::Kmer object
523				Returns: List of kmers
524
525				=back
526
527				=cut
528
529				sub subtract{
530				my($self,$other)=@_;
531
532				if(!$self->_checkCompatibility($other,{verbose=>1})){
533				die;
534				}
535
536				my %subtractKmers = %{ $self->kmers };
537				for my $kmer(keys(%{ $other->kmers })){
538				delete($subtractKmers{$kmer});
539				}
540
541				return [keys(%subtractKmers)];
542				}
543
544
545
546				# See if another Bio::Kmer is the same kind as this one.
547				# Return Boolean
548				sub _checkCompatibility{
549				my($self,$other,$settings)=@_;
550
551				if($self->{kmerlength} != $other->{kmerlength}){
552				warn "WARNING: kmer lengths do not match\n" if($$settings{verbose});
553				return 0;
554				}
555
556				return 1;
557				}
558
559				sub _countKmersPurePerlWorker{
560				my($kmerlength,$seqQ,$sample)=@_;
561
562				my %kmer;
563				while(defined(my $seq=$seqQ->dequeue)){
564
565				my $numKmersInRead=length($seq)-$kmerlength;
566
567				# Count kmers in a sliding window.
568				# We must keep this loop optimized for speed.
569				for(my $j=0;$j<$numKmersInRead;$j++){
570				next if($sample < rand(1)); # subsample
571				$kmer{substr($seq,$j,$kmerlength)}++;
572				}
573
574				}
575
576				return \%kmer;
577				}
578
579
580				sub countKmersJellyfish{
581				my($self,$seqfile,$kmerlength)=@_;
582				my $basename=basename($seqfile);
583
584				# Version checking
585				my $jfVersion=`jellyfish --version`; chomp($jfVersion);
586				# e.g., jellyfish 2.2.6
587				if($jfVersion =~ /(jellyfish\s+)?(\d+)?/){
588				my $majorVersion=$2;
589				if($majorVersion < 2){
590				die "ERROR: Jellyfish v2 or greater is required";
591				}
592				}
593
594				my $outfile=$self->{jellyfishdb};
595
596				# Counting
597				my $jellyfishCountOptions="-s 10000000 -m $kmerlength -o $outfile -t $self->{numcpus}";
598				my $uncompressedFastq="$self->{tempdir}/$basename.fastq";
599				if($seqfile=~/\.gz$/i){
600				system("zcat $seqfile > $uncompressedFastq"); die if $?;
601				system("jellyfish count $jellyfishCountOptions $uncompressedFastq");
602				} else {
603				system("jellyfish count $jellyfishCountOptions $seqfile");
604				}
605				close $self->{jellyfishdbFh};
606				die "Error: problem with jellyfish" if $?;
607				}
608
609				sub _dumpKmersJellyfish{
610				my($self)=@_;
611				my $kmerTsv=$self->{kmerfile};
612				my $jfDb=$self->{jellyfishdb};
613
614				return if(-s $kmerTsv > 0);
615
616				# Dump the kmers to a tab-delimited file if it doesn't
617				# already have contents
618				if(-s $kmerTsv < 1){
619				my $lowerCount=$self->{gt}-1;
620				system("jellyfish dump --lower-count=$lowerCount --column --tab -o $kmerTsv $jfDb");
621				die if $?;
622				}
623				}
624
625				# http://www.perlmonks.org/?node_id=761662
626				sub which{
627				my($self,$exe)=@_;
628
629				my $tool_path="";
630				for my $path ( split /:/, $ENV{PATH} ) {
631				if ( -f "$path/$exe" && -x "$path/$exe" ) {
632				$tool_path = "$path/$exe";
633				last;
634				}
635				}
636
637				return $tool_path;
638				}
639
640				# Opens a fastq file in a thread-safe way.
641				# Returns a file handle.
642				sub openFastq{
643				my($self,$fastq)=@_;
644
645				my $fh;
646
647				my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
648
649				if(!grep(/$ext/,@fastqExt)){
650				die "ERROR: could not read $fastq as a fastq file";
651				}
652
653				# Open the file in different ways, depending on if it
654				# is gzipped or if the user has gzip installed.
655				lock($fhStick);
656				if($ext =~/\.gz$/){
657				# use binary gzip if we can... why not take advantage
658				# of the compiled binary's speedup?
659				if(-e "/usr/bin/gzip"){
660				open($fh,"gzip -cd $fastq \| ") or die "ERROR: could not open $fastq for reading!: $!";
661				}else{
662				$fh=new IO::Uncompress::Gunzip($fastq) or die "ERROR: could not read $fastq: $!";
663				}
664				} else {
665				open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!";
666				}
667
668				return $fh;
669				}
670				# In case I accidentally do $Kmer->closeFastq without thinking
671				# how ridiculous that is, let's just avoid that problem with
672				# this subroutine.
673				sub closeFastq{
674				my($self,$fastq)=@_;
675				close $fastq;
676				return 1;
677				}
678
679				=pod
680
681				=over
682
683				=item $kmer->close()
684
685				Cleans the temporary directory and removes this object from
686				RAM. Good for when you might be counting kmers for many
687				things but want to keep your overhead low.
688
689				Arguments: None
690				Returns: 1
691
692				=back
693
694				=cut
695
696				sub close{
697				my($self)=@_;
698
699				remove_tree($self->{tempdir});
700				undef($self);
701
702				return 1;
703				}
704
705				=pod
706
707				=head1 COPYRIGHT AND LICENSE
708
709				MIT license. Go nuts.
710
711				=head1 AUTHOR
712
713				Author: Lee Katz
714
715				For additional help, go to https://github.com/lskatz/Bio--Kmer
716
717				CPAN module at http://search.cpan.org/~lskatz/Bio-Kmer/
718
719				=for html
720
721				=cut
722
723				1;