File Coverage

blib/lib/Bio/Kmer.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				#!/usr/bin/env perl
2
3				# Kmer.pm: a kmer counting module
4				# Author: Lee Katz
5
6				=pod SCRIPT CATEGORIES
7
8				CPAN
9
10				=cut
11
12
13				package Bio::Kmer;
14				require 5.12.0;
15				our $VERSION="0.01";
16
17	1	1	467	use strict;
	1		1
	1		21
18	1	1	3	use warnings;
	1		1
	1		25
19
20	1	1	13	use List::Util qw/max/;
	1		1
	1		74
21	1	1	4	use File::Basename qw/basename fileparse dirname/;
	1		1
	1		48
22	1	1	619	use File::Temp ('tempdir');
	1		13596
	1		66
23	1	1	715	use Data::Dumper;
	1		5729
	1		54
24	1	1	503	use IO::Uncompress::Gunzip;
	1		28857
	1		44
25
26	1	1	520	use threads;
	0
	0
27				use threads::shared;
28				use Thread::Queue;
29
30				use Exporter qw/import/;
31				our @EXPORT_OK = qw(
32				);
33
34				our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
35				our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fa);
36				our @bamExt=qw(.sorted.bam .bam);
37				our @vcfExt=qw(.vcf.gz .vcf);
38				our @richseqExt=qw(.gbk .gbf .gb .embl);
39				our @sffExt=qw(.sff);
40				our @samExt=qw(.sam .bam);
41
42				our $fhStick :shared; # Helps us open only one file at a time
43
44
45				# TODO if 'die' is imported by a script, redefine
46				# sig die in that script as this function.
47				local $SIG{'__DIE__'} = sub { my $e = $_[0]; $e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/; die("$0: ".(caller(1))[3].": ".$e); };
48
49				=pod
50
51				=head1 NAME
52
53				Kmer
54
55				=head1 SYNOPSIS
56
57				A module for helping with kmer analysis.
58
59				use strict;
60				use warnings;
61				use Kmer;
62
63				my $kmer=Kmer->new("file.fastq.gz",{kmerCounter=>"jellyfish",numcpus=>4});
64				my $kmerHash=$kmer->count();
65				my $countOfCounts=$kmer->histogram();
66
67				=head1 DESCRIPTION
68
69				A module for helping with kmer analysis. The basic methods help count kmers and can produce a count of counts. Currently this module only supports fastq format. Although this module can count kmers with pure perl, it is recommended to give the option for a different kmer counter such as Jellyfish.
70
71				=head1 AUTHOR
72
73				Author: Lee Katz
74
75				=cut
76
77				=pod
78
79				=head1 METHODS
80
81				=item new
82
83				=over
84
85				Create a new instance of the kmer counter. One object per file.
86
87				Applicable arguments:
88				Argument Default Description
89				kmercounter perl What kmer counter software to use.
90				Choices: Perl, Jellyfish.
91				kmerlength 21 Kmer length
92				numcpus 1 This module uses perl
93				multithreading with pure perl or
94				can supply this option to other
95				software like jellyfish.
96				gt 1 If the count of kmers is fewer
97				than this, ignore the kmer. This
98				might help speed analysis if you
99				do not care about low-count kmers.
100
101				Examples:
102				my $kmer=Kmer->new("file.fastq.gz",{kmerCounter=>"jellyfish",numcpus=>4});
103
104				=back
105
106				=cut
107
108				sub new{
109				my($class,$seqfile,$settings)=@_;
110
111				die "ERROR: need a sequence file" if(!$seqfile);
112				die "ERROR: could not locate the sequence file" if(!-e $seqfile);
113
114				# Set optional parameter defaults
115				$$settings{kmerlength} \|\|=21;
116				$$settings{numcpus} \|\|=1;
117				$$settings{gt} \|\|=1;
118				$$settings{kmerCounter} \|\|="perl";
119
120				# Initialize the object and then bless it
121				my $self={
122				seqfile =>$seqfile,
123				kmerlength =>$$settings{kmerlength},
124				numcpus =>$$settings{numcpus},
125				tempdir =>tempdir("Kmer.pm.XXXXXX",TMPDIR=>1,CLEANUP=>1),
126				gt =>$$settings{gt},
127				kmerCounter=>$$settings{kmerCounter},
128
129				# Values that will be filled in after analysis
130				kmers =>{},
131				};
132				bless($self);
133
134				return $self;
135				}
136
137				=pod
138
139				=over
140
141				=item count
142
143				Count kmers. If Jellyfish is found, then it will be used. Otherwise, pure perl will be used which is slower.
144
145				Arguments: none
146				Returns: Reference to a hash of kmers where the key is
147				the kmer, and the value is count
148
149				=back
150
151				=cut
152
153				# Count kmers with faster programs in this order of
154				# priority: jellyfish (TODO: KAnalyze)
155				# and lastly, pure perl.
156				sub count{
157				my($self)=@_;
158
159				my $seqfile=$self->{seqfile};
160				my $kmerlength=$self->{kmerlength};
161
162				my $kmerHash={};
163
164				if($self->{kmerCounter}=~ /(pure)?.*perl/i){
165				$kmerHash=$self->countKmersPurePerl($seqfile,$kmerlength);
166				} elsif($self->{kmerCounter} =~ /jellyfish/i){
167				$kmerHash=$self->countKmersJellyfish($seqfile,$kmerlength);
168				} else {
169				die "ERROR: I do not understand the kmer counter $self->{kmerCounter}";
170				}
171
172				$self->{kmers}=$kmerHash;
173				return $kmerHash;
174				}
175
176				=pod
177
178				=over
179
180				=item histogram
181
182				Count kmers. If Jellyfish is found, then it will be used. Otherwise, pure perl will be used which is slower.
183
184				Arguments: none
185				Returns: Reference to an array of counts. The index of
186				the array is the frequency.
187
188				=back
189
190				=cut
191
192				sub histogram{
193				my($self)=@_;
194				my %hist=();
195				my @hist=(0); # initialize the histogram with a count of zero kmers happening zero times
196				#$hist[0]=4**$self->{kmerlength}; # or maybe it should be initialized to the total number of possibilities.
197
198				if(!values(%{ $self->{kmers} } )){
199				die "ERROR: kmers have not been counted yet. Run Kmer->count before running Kmer->histogram";
200				}
201
202				for my $kmercount(values(%{ $self->{kmers} } )){
203				$hist{$kmercount}++;
204				}
205
206				# Turn this hash into an array
207				for(1..max(keys(%hist))){
208				$hist[$_] = $hist{$_} \|\| 0;
209				#$hist[0]=$hist[0] - $hist[$_]; # subtract off from the total space of possible kmers
210				}
211
212				$self->{hist}=\@hist;
213				return \@hist;
214				}
215
216				sub countKmersPurePerl{
217				my($self,$seqfile,$kmerlength)=@_;
218
219				# Multithreading
220				my $seqQ=Thread::Queue->new;
221				my @thr;
222				for(0..$self->{numcpus}-1){
223				$thr[$_]=threads->new(\&_countKmersPurePerlWorker,$kmerlength,$seqQ);
224				}
225
226				# Pure perl to make this standalone... the only reason
227				# we are counting kmers in Perl instead of C.
228				my $fastqFh=$self->openFastq($seqfile);
229				my $i=0;
230				my @buffer=();
231				while(<$fastqFh>){ # burn the read ID line
232				$i++;
233				my $seq=<$fastqFh>;
234				push(@buffer, $seq);
235
236				if($i % 1000000 == 0){
237				$seqQ->enqueue(@buffer);
238				@buffer=();
239				}
240
241				# Burn the quality score lines
242				<$fastqFh>;
243				<$fastqFh>;
244				}
245				close $fastqFh;
246
247				$seqQ->enqueue(@buffer);
248
249				# Send the termination signal
250				$seqQ->enqueue(undef) for(@thr);
251
252				while($seqQ->pending > @thr){
253				for(1..60){
254				last if($seqQ->pending <= @thr);
255				sleep 1;
256				}
257				}
258
259				# Join the threads and put everything into a large kmer hash
260				my %kmer=();
261				for(@thr){
262				my $threadKmer=$_->join;
263				for my $kmer(keys(%$threadKmer)){
264				$kmer{$kmer}+=$$threadKmer{$kmer};
265				}
266				}
267
268				# Filtering step
269				while(my($kmer,$count)=each(%kmer)){
270				delete($kmer{$kmer}) if($count < $self->{gt});
271				}
272
273				return \%kmer;
274				}
275
276				sub _countKmersPurePerlWorker{
277				my($kmerlength,$seqQ)=@_;
278
279				my %kmer;
280				while(defined(my $seq=$seqQ->dequeue)){
281
282				my $numKmersInRead=length($seq)-$kmerlength;
283
284				# Count kmers in a sliding window.
285				# We must keep this loop optimized for speed.
286				for(my $j=0;$j<$numKmersInRead;$j++){
287				$kmer{substr($seq,$j,$kmerlength)}++;
288				}
289
290				}
291
292				return \%kmer;
293				}
294
295
296				sub countKmersJellyfish{
297				my($self,$seqfile,$kmerlength)=@_;
298				my $basename=basename($seqfile);
299				my %kmer=();
300
301				# Version checking
302				my $jfVersion=`jellyfish --version`; chomp($jfVersion);
303				# e.g., jellyfish 2.2.6
304				if($jfVersion =~ /(jellyfish\s+)?(\d+)?/){
305				my $majorVersion=$2;
306				if($majorVersion < 2){
307				die "ERROR: Jellyfish v2 or greater is required";
308				}
309				}
310
311				my $outprefix="$self->{tempdir}/$basename.mer_counts";
312				my $jfDb="$self->{tempdir}/$basename.merged.jf";
313				my $kmerTsv="$self->{tempdir}/$basename.jf.tsv";
314
315				# Counting
316				my $jellyfishCountOptions="-s 10000000 -m $kmerlength -o $outprefix -t $self->{numcpus}";
317				my $uncompressedFastq="$self->{tempdir}/$basename.fastq";
318				if($seqfile=~/\.gz$/i){
319				system("zcat $seqfile > $uncompressedFastq"); die if $?;
320				system("jellyfish count $jellyfishCountOptions $uncompressedFastq");
321				} else {
322				system("jellyfish count $jellyfishCountOptions $seqfile");
323				}
324				die "Error: problem with jellyfish" if $?;
325
326				my $lowerCount=$self->{gt}-1;
327				system("jellyfish dump --lower-count=$lowerCount --column --tab -o $kmerTsv $outprefix");
328				die if $?;
329
330				# Load kmers to memory
331				open(TSV,$kmerTsv) or die "ERROR: Could not open $kmerTsv: $!";
332				while(){
333				chomp;
334				my @F=split /\t/;
335				$kmer{$F[0]}=$F[1];
336				}
337				close TSV;
338
339				# cleanup
340				for($jfDb, $kmerTsv, $outprefix, $uncompressedFastq){
341				unlink $_ if($_ && -e $_);
342				}
343
344				return \%kmer;
345				}
346
347				# http://www.perlmonks.org/?node_id=761662
348				sub which{
349				my($self,$exe)=@_;
350
351				my $tool_path="";
352				for my $path ( split /:/, $ENV{PATH} ) {
353				if ( -f "$path/$exe" && -x "$path/$exe" ) {
354				$tool_path = "$path/$exe";
355				last;
356				}
357				}
358
359				return $tool_path;
360				}
361
362				# Opens a fastq file in a thread-safe way.
363				# Returns a file handle.
364				sub openFastq{
365				my($self,$fastq)=@_;
366
367				my $fh;
368
369				my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
370
371				if(!grep(/$ext/,@fastqExt)){
372				die "ERROR: could not read $fastq as a fastq file";
373				}
374
375				# Open the file in different ways, depending on if it
376				# is gzipped or if the user has gzip installed.
377				lock($fhStick);
378				if($ext =~/\.gz$/){
379				# use binary gzip if we can... why not take advantage
380				# of the compiled binary's speedup?
381				if(-e "/usr/bin/gzip"){
382				open($fh,"gzip -cd $fastq \| ") or die "ERROR: could not open $fastq for reading!: $!";
383				}else{
384				$fh=new IO::Uncompress::Gunzip($fastq) or die "ERROR: could not read $fastq: $!";
385				}
386				} else {
387				open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!";
388				}
389
390				return $fh;
391				}
392				# In case I accidentally do $Kmer->closeFastq without thinking
393				# how ridiculous that is, let's just avoid that problem with
394				# this subroutine.
395				sub closeFastq{
396				my($self,$fastq)=@_;
397				close $fastq;
398				return 1;
399				}
400
401				1;