File Coverage

lib/Mashtree.pm
Criterion Covered Total %
statement 19 21 90.4
branch n/a
condition n/a
subroutine 7 7 100.0
pod n/a
total 26 28 92.8


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             package Mashtree;
3 4     4   90935 use strict;
  4         16  
  4         103  
4 4     4   18 use warnings;
  4         4  
  4         98  
5 4     4   16 use Exporter qw(import);
  4         5  
  4         99  
6 4     4   17 use File::Basename qw/fileparse basename dirname/;
  4         6  
  4         172  
7 4     4   1054 use Data::Dumper;
  4         15541  
  4         296  
8 4     4   33 use List::Util qw/shuffle/;
  4         9  
  4         259  
9              
10 4     4   1513 use threads;
  0            
  0            
11             use threads::shared;
12              
13             use lib dirname($INC{"Mashtree.pm"});
14             use Bio::Matrix::IO;
15             use Bio::TreeIO;
16              
17             our @EXPORT_OK = qw(
18             logmsg openFastq _truncateFilename distancesToPhylip createTreeFromPhylip sortNames
19             @fastqExt @fastaExt @bamExt @vcfExt @richseqExt @mshExt
20             $MASHTREE_VERSION
21             );
22              
23             local $0=basename $0;
24              
25             ######
26             # CONSTANTS
27              
28             our $VERSION = "0.27";
29             our $MASHTREE_VERSION=$VERSION;
30             our @fastqExt=qw(.fastq.gz .fastq .fq .fq.gz);
31             our @fastaExt=qw(.fasta .fna .faa .mfa .fas .fsa .fa);
32             our @bamExt=qw(.sorted.bam .bam);
33             our @vcfExt=qw(.vcf.gz .vcf);
34             our @mshExt=qw(.msh);
35             # Richseq extensions were obtained mostly from bioperl under
36             # the genbank, embl, and swissprot entries, under
37             # the source for Bio::SeqIO
38             our @richseqExt=qw(.gb .gbank .genbank .gbk .gbs .gbf .embl .ebl .emb .dat .swiss .sp);
39              
40             # Helpful things
41             my $fhStick :shared; # A thread can only open a fastq file if it has the talking stick.
42              
43             #################################################
44             ### COMMON SUBS/TOOLS (not object subroutines) ##
45             #################################################
46             # Redefine how scripts die
47             $SIG{'__DIE__'} = sub {
48             local $0=basename($0);
49             my $e = $_[0] || "";
50             my $callerSub=(caller(1))[3] || (caller(0))[3] || "UnknownSub";
51              
52             $e =~ s/(at [^\s]+? line \d+\.$)/\nStopped $1/;
53             die("$0: $callerSub: $e");
54             };
55             # Centralized logmsg
56             #sub logmsg {print STDERR "$0: ".(caller(1))[3].": @_\n";}
57             sub logmsg {
58             local $0=basename $0;
59             my $parentSub=(caller(1))[3] || (caller(0))[3];
60             $parentSub=~s/^main:://;
61              
62             # Find the thread ID and stringify it
63             my $tid=threads->tid;
64             $tid=($tid) ? "(TID$tid)" : "";
65              
66             my $msg="$0: $parentSub$tid: @_\n";
67              
68             print STDERR $msg;
69             }
70              
71             # Opens a fastq file in a thread-safe way.
72             sub openFastq{
73             my($fastq,$settings)=@_;
74              
75             my $fh;
76              
77             lock($fhStick);
78              
79             my @fastqExt=qw(.fastq.gz .fastq .fq.gz .fq);
80             my($name,$dir,$ext)=fileparse($fastq,@fastqExt);
81             if($ext =~/\.gz$/){
82             open($fh,"zcat $fastq | ") or die "ERROR: could not open $fastq for reading!: $!";
83             } else {
84             open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!";
85             }
86             return $fh;
87             }
88              
89             # Removes fastq extension, removes directory name,
90             # truncates to a length, and adds right-padding.
91             sub _truncateFilename{
92             my($file,$settings)=@_;
93             $$settings{truncLength}||=255;
94             # strip off msh and get the basename
95             my $name=basename($file,'.msh');
96             # One more extension
97             $name=basename($name,@fastqExt,@richseqExt,@fastaExt);
98             # Truncate
99             $name=substr($name,0,$$settings{truncLength});
100             # Add in padding
101             $name.=" " x ($$settings{truncLength}-length($name));
102             return $name;
103             }
104              
105              
106             # 1. Read the mash distances
107             # 2. Create a phylip file
108             sub distancesToPhylip{
109             my($distances,$outdir,$settings)=@_;
110              
111             my $phylip = "$outdir/distances.phylip";
112             # NOTE: need to regenerate the combined distances each time
113             # because I need to allow variation in the input samples.
114             #return $phylip if(-e $phylip);
115              
116             # The way phylip is, I need to know the genome names
117             # a priori
118             my %name;
119             open(MASHDIST,"<",$distances) or die "ERROR: could not open $distances for reading: $!";
120             while(<MASHDIST>){
121             next if(/^#/);
122             my($name)=split(/\t/,$_);
123             $name=~s/^\s+|\s+$//g; # whitespace trim before right-padding is added
124             $name{_truncateFilename($name,$settings)}=1;
125             }
126             close MASHDIST;
127             my @name=sortNames([keys(%name)],$settings);
128             # Index the array
129             my $columnIndex=0;
130             for(@name){
131             $name{$_}=$columnIndex++;
132             }
133              
134             # Load up the matrix object
135             logmsg "Reading the distances file at $distances";
136             open(MASHDIST,"<",$distances) or die "ERROR: could not open $distances for reading: $!";
137             my $query="UNKNOWN"; # Default ID in case anything goes wrong
138             my @m;
139             while(<MASHDIST>){
140             chomp;
141             if(/^#query\s+(.+)/){
142             $query=_truncateFilename($1,$settings);
143             } else {
144             my ($reference,$distance)=split(/\t/,$_);
145             $reference=_truncateFilename($reference,$settings);
146             $distance=sprintf("%0.8f",$distance);
147             $m[$name{$query}][$name{$reference}]=$distance;
148             $m[$name{$reference}][$name{$query}]=$distance;
149             }
150             }
151             close MASHDIST;
152             #my $matrixObj=Bio::Matrix::Generic->new(-rownames=>\@name,-colnames=>\@name,-values=>\@m);
153              
154             # taking this method from write_matrix in http://cpansearch.perl.org/src/CJFIELDS/BioPerl-1.6.924/Bio/Matrix/IO/phylip.pm
155             my $str;
156             $str.=(" " x 4) . scalar(@name)."\n";
157             for(my $i=0;$i<@name;$i++){
158             $str.=$name[$i];
159             my $count=0;
160             for(my $j=0;$j<@name;$j++){
161             if($count < $#name){
162             $str.=$m[$i][$j]. " ";
163             } else {
164             $str.=$m[$i][$j];
165             }
166             $count++;
167             }
168             $str.="\n";
169             }
170             open(PHYLIP,">",$phylip) or die "ERROR: could not write to $phylip: $!";
171             print PHYLIP $str;
172             close PHYLIP;
173             return $phylip;
174             }
175              
176             sub sortNames{
177             my($name,$settings)=@_;
178             my @sorted;
179             if($$settings{'sort-order'} =~ /^(abc|alphabet)$/){
180             @sorted=sort { $a cmp $b } @$name;
181             } elsif($$settings{'sort-order'}=~/^rand(om)?/){
182             @sorted=shuffle(@$name);
183             } elsif($$settings{'sort-order'} eq 'input-order'){
184             @sorted=@$name;
185             } else {
186             die "ERROR: I don't understand sort-order $$settings{'sort-order'}";
187             }
188             return @sorted;
189             }
190              
191             # Create tree file with Quicktree but bioperl
192             # as a backup.
193             sub createTreeFromPhylip{
194             my($phylip,$outdir,$settings)=@_;
195              
196             my $treeObj;
197              
198             my $quicktreePath=`which quicktree 2>/dev/null`;
199             # bioperl if there was an error with which quicktree
200             if($?){
201             logmsg "Creating tree with BioPerl";
202             my $dfactory = Bio::Tree::DistanceFactory->new(-method=>"NJ");
203             my $matrix = Bio::Matrix::IO->new(-format=>"phylip", -file=>$phylip)->next_matrix;
204             $treeObj = $dfactory->make_tree($matrix);
205             open(TREE,">","$outdir/tree.dnd") or die "ERROR: could not open $outdir/tree.dnd: $!";
206             print TREE $treeObj->as_text("newick");
207             print TREE "\n";
208             close TREE;
209             }
210             # quicktree
211             else {
212             logmsg "Creating tree with QuickTree";
213             system("quicktree -in m $phylip > $outdir/tree.dnd.tmp");
214             die "ERROR with quicktree" if $?;
215             $treeObj=Bio::TreeIO->new(-file=>"$outdir/tree.dnd.tmp")->next_tree;
216             my $outtree=Bio::TreeIO->new(-file=>">$outdir/tree.dnd", -format=>"newick");
217             $outtree->write_tree($treeObj);
218              
219             unlink("$outdir/tree.dnd.tmp");
220             }
221              
222             return $treeObj;
223              
224             }
225              
226             1; # gotta love how we we return 1 in modules. TRUTH!!!
227