File Coverage

lib/Bio/Tradis/RunTradis.pm
Criterion Covered Total %
statement 103 241 42.7
branch 17 48 35.4
condition n/a
subroutine 21 32 65.6
pod 0 1 0.0
total 141 322 43.7


line stmt bran cond sub pod time code
1             package Bio::Tradis::RunTradis;
2             $Bio::Tradis::RunTradis::VERSION = '1.3.2';
3             # ABSTRACT: Perform all steps required for a tradis analysis
4              
5              
6 2     2   105880 use Cwd;
  2         3  
  2         95  
7 2     2   425 use Moose;
  2         403336  
  2         12  
8 2     2   12424 use File::Temp;
  2         4  
  2         148  
9 2     2   11 use File::Path 'rmtree';
  2         3  
  2         83  
10 2     2   742 use Bio::Tradis::FilterTags;
  2         6  
  2         70  
11 2     2   852 use Bio::Tradis::RemoveTags;
  2         6  
  2         65  
12 2     2   781 use Bio::Tradis::Map;
  2         7  
  2         74  
13 2     2   925 use Bio::Tradis::TradisPlot;
  2         7  
  2         67  
14 2     2   903 use Bio::Tradis::Exception;
  2         5  
  2         47  
15 2     2   500 use Bio::Tradis::Samtools;
  2         5  
  2         4165  
16              
17             has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
18             has 'fastqfile' => ( is => 'rw', isa => 'Str', required => 1 );
19             has '_unzipped_fastq' =>
20             ( is => 'rw', isa => 'Str', lazy => 1, builder => '_build__unzipped_fastq' );
21             has 'tag' => ( is => 'ro', isa => 'Str', required => 1 );
22             has 'tagdirection' =>
23             ( is => 'ro', isa => 'Str', required => 1, default => '3' );
24             has 'mismatch' => ( is => 'rw', isa => 'Int', required => 1, default => 0 );
25             has 'mapping_score' =>
26             ( is => 'ro', isa => 'Int', required => 1, default => 30 );
27             has 'reference' => ( is => 'rw', isa => 'Str', required => 1 );
28             has 'outfile' => (
29             is => 'rw',
30             isa => 'Str',
31             required => 0,
32             default => sub {
33             my ($self) = @_;
34             my @dirs = split( '/', $self->fastqfile );
35             my $o = pop(@dirs);
36             $o =~ s/fastq/out/;
37             return $o;
38             }
39             );
40             has 'smalt_k' => ( is => 'rw', isa => 'Maybe[Int]', required => 0 );
41             has 'smalt_s' => ( is => 'rw', isa => 'Maybe[Int]', required => 0 );
42             has 'smalt_y' => ( is => 'rw', isa => 'Maybe[Num]', required => 0, default => 0.96 );
43             has 'smalt_r' => ( is => 'rw', isa => 'Maybe[Int]', required => 0, default => -1);
44             has 'smalt_n' => ( is => 'rw', isa => 'Maybe[Int]', required => 0, default => 1);
45             has 'samtools_exec' => ( is => 'rw', isa => 'Str', default => 'samtools' );
46              
47             has '_temp_directory' => (
48             is => 'rw',
49             isa => 'Str',
50             required => 0,
51             lazy => 1,
52             builder => '_build__temp_directory'
53             );
54             has 'output_directory' => (
55             is => 'rw',
56             isa => 'Str',
57             required => 0,
58             lazy => 1,
59             builder => '_build_output_directory'
60             );
61             has '_stats_handle' => ( is => 'ro', isa => 'FileHandle', required => 1 );
62             has '_sequence_info' => (
63             is => 'rw',
64             isa => 'HashRef',
65             required => 0,
66             lazy => 1,
67             builder => '_build__sequence_info'
68             );
69             has '_current_directory' => (
70             is => 'rw',
71             isa => 'Str',
72             required => 0,
73             lazy => 1,
74             builder => '_build__current_directory'
75             );
76              
77             sub _is_gz {
78 2     2   6 my ($self) = @_;
79 2         31 my $fq = $self->fastqfile;
80              
81 2 50       12 if ( $fq =~ /\.gz/ ) {
82 0         0 return 1;
83             }
84             else {
85 2         6 return 0;
86             }
87             }
88              
89             sub _build__unzipped_fastq {
90 2     2   6 my ($self) = @_;
91 2         37 my $fq = $self->fastqfile;
92 2         35 my $temporary_directory = $self->_temp_directory;
93              
94 2 50       8 if ( $self->_is_gz ) {
95 0         0 $fq =~ /([^\/]+)$/;
96 0         0 my $newfq = $1;
97 0         0 $newfq =~ s/\.gz//;
98 0 0       0 if ( !-e "$temporary_directory/$newfq" ) {
99 0         0 `gunzip -c $fq > $temporary_directory/$newfq`;
100             }
101 0         0 return "$temporary_directory/$newfq";
102             }
103             else {
104 2         36 return $fq;
105             }
106             }
107              
108             sub _build__stats_handle {
109 0     0   0 my ($self) = @_;
110 0         0 my $outfile = $self->outfile;
111              
112 0         0 open( my $stats, ">", "$outfile.stats" );
113 0         0 return $stats;
114             }
115              
116             sub _build__sequence_info {
117 0     0   0 my ($self) = @_;
118 0         0 my $temporary_directory = $self->_temp_directory;
119 0         0 open( GREP,
120             "grep \@SQ $temporary_directory/mapped.sam | awk '{print \$2, \$3}' |"
121             );
122 0         0 my %sns = ();
123 0         0 while ( my $sn = <GREP> ) {
124 0         0 chomp($sn);
125 0         0 $sn =~ /SN:(\S+)\s+LN:(\d+)/;
126 0         0 $sns{$1} = $2;
127             }
128 0         0 return \%sns;
129             }
130              
131             sub _build__temp_directory {
132 1     1   4 my ($self) = @_;
133 1         23 my $tmp_dir = File::Temp->newdir( 'tmp_run_tradis_XXXXX',
134             CLEANUP => 0,
135             DIR => $self->output_directory );
136 1         300 return $tmp_dir->dirname;
137             }
138              
139             sub _build_output_directory {
140 0     0   0 return cwd();
141             }
142              
143             sub _build__current_directory {
144 0     0   0 my ($self) = @_;
145 0         0 my $fq = $self->fastqfile;
146              
147 0         0 my @dirs = split( '/', $fq );
148 0         0 pop(@dirs);
149 0         0 return join( '/', @dirs );
150             }
151              
152             sub run_tradis {
153 1     1 0 3 my ($self) = @_;
154 1         27 my $temporary_directory = $self->_temp_directory;
155 1         23 my $fq = $self->fastqfile;
156            
157 1         22 my $ref = $self->reference;
158 1 50       17 Bio::Tradis::Exception::RefNotFound->throw( error => "$ref not found\n" ) unless( -e $ref );
159              
160 1 50       20 print STDERR "::::::::::::::::::\n$fq\n::::::::::::::::::\n\n" if($self->verbose);
161              
162             # Step 1: Filter tags that match user input tag
163 1 50       16 print STDERR "..........Step 1: Filter tags that match user input tag\n" if($self->verbose);
164 1         4 $self->_filter;
165              
166 1 50       23 print STDERR "..........Step 1.1: Check that at least one read started with the tag\n" if($self->verbose);
167 1         5 $self->_check_filter;
168              
169             # Step 2: Remove the tag from the sequence and quality strings
170 1 50       20 print STDERR
171             "..........Step 2: Remove the tag from the sequence and quality strings\n" if($self->verbose);
172 1         6 $self->_remove;
173              
174             # Step 3: Map file to reference
175 1 50       23 print STDERR "..........Step 3: Map file to reference\n" if($self->verbose);
176 1         5 $self->_map;
177              
178             # Step 4: Convert output from SAM to BAM, sort and index
179 1 50       28 print STDERR
180             "..........Step 3.5: Convert output from SAM to BAM and sort\n" if($self->verbose);
181 1         5 $self->_sam2bam;
182 1         11 $self->_sort_bam;
183 0         0 $self->_bamcheck;
184              
185             # Step 5: Generate plot
186 0 0       0 print STDERR "..........Step 4: Generate plot\n" if($self->verbose);
187 0         0 $self->_make_plot;
188              
189             # Step 6: Generate statistics
190 0 0       0 print STDERR "..........Step 5: Generate statistics\n" if($self->verbose);
191 0         0 $self->_stats;
192              
193             # Step 7: Move files to current directory
194 0 0       0 print STDERR "..........Step 6: Move files to current directory\n" if($self->verbose);
195 0         0 my $outfile = $self->outfile;
196 0         0 my $output_directory = $self->output_directory;
197 0         0 system("mv $temporary_directory/$outfile* $output_directory");
198 0         0 system("mv $temporary_directory/mapped.sort.bam $output_directory/$outfile.mapped.bam");
199 0         0 system("mv $temporary_directory/mapped.sort.bam.bai $output_directory/$outfile.mapped.bam.bai");
200 0         0 system("mv $temporary_directory/mapped.bamcheck $output_directory/$outfile.mapped.bamcheck");
201              
202             # Clean up
203 0 0       0 print STDERR "..........Clean up\n" if($self->verbose);
204              
205 0         0 rmtree($temporary_directory);
206              
207 0         0 return 1;
208             }
209              
210             sub _filter {
211 2     2   5 my ($self) = @_;
212 2         47 my $temporary_directory = $self->_temp_directory;
213 2         56 my $fqfile = $self->_unzipped_fastq;
214 2         38 my $tag = $self->tag;
215 2         40 my $mm = $self->mismatch;
216              
217 2         54 my $filter = Bio::Tradis::FilterTags->new(
218             fastqfile => $fqfile,
219             tag => $tag,
220             mismatch => $mm,
221             outfile => "$temporary_directory/filter.fastq"
222             )->filter_tags;
223             }
224              
225             sub _check_filter {
226 8     8   518 my ($self) = @_;
227 8         309 my $temporary_directory = $self->_temp_directory;
228 8         21 my $filtered_file_filename = "$temporary_directory/filter.fastq";
229 8 100       337 open my $filtered_file, '<', $filtered_file_filename or
230             Bio::Tradis::Exception::TagFilterError->throw( error => "There was a problem filtering reads by the specified tag. Please check all input files are Fastq formatted and that at least one read in each starts with the specified tag\n" );
231 7         15 my @first_read_data;
232 7         105 while( my $line = <$filtered_file> ) {
233 22 100       61 last if $. > 4;
234 20         26 chomp($line);
235 20         68 push @first_read_data, $line;
236             }
237 7         16 my $number_of_read_lines = scalar @first_read_data;
238 7 100       23 if ( $number_of_read_lines ne 4) {
239             # There wasn't enough data for a complete read
240 3         31 Bio::Tradis::Exception::TagFilterError->throw( error => "There was a problem filtering reads by the specified tag. Please check all input files are Fastq formatted and that at least one read in each starts with the specified tag\n" );
241             }
242 4         10 my $read_plus_sign = $first_read_data[2];
243 4 100       11 if ( $read_plus_sign ne '+' ) {
244             # The first 'read' didn't have a '+' on the third line, suspicious
245 1         13 Bio::Tradis::Exception::TagFilterError->throw( error => "There was a problem filtering reads by the specified tag. Please check all input files are Fastq formatted and that at least one read in each starts with the specified tag\n" );
246             }
247             # I'm not proposing further (more detailed) validation here
248 3         38 close $filtered_file;
249             }
250              
251             sub _remove {
252 2     2   9 my ($self) = @_;
253 2         73 my $temporary_directory = $self->_temp_directory;
254 2         40 my $tag = $self->tag;
255 2         40 my $mm = $self->mismatch;
256              
257 2         65 my $rm_tags = Bio::Tradis::RemoveTags->new(
258             fastqfile => "$temporary_directory/filter.fastq",
259             tag => $tag,
260             mismatch => $mm,
261             outfile => "$temporary_directory/tags_removed.fastq"
262             )->remove_tags;
263             }
264              
265             sub _map {
266 2     2   8 my ($self) = @_;
267 2         45 my $temporary_directory = $self->_temp_directory;
268              
269 2         39 my $ref = $self->reference;
270              
271 2         46 my $mapping = Bio::Tradis::Map->new(
272             fastqfile => "$temporary_directory/tags_removed.fastq",
273             reference => "$ref",
274             refname => "$temporary_directory/ref.index",
275             outfile => "$temporary_directory/mapped.sam",
276             smalt_k => $self->smalt_k,
277             smalt_s => $self->smalt_s,
278             smalt_y => $self->smalt_y,
279             smalt_r => $self->smalt_r,
280             smalt_n => $self->smalt_n
281             );
282 2         15 $mapping->index_ref;
283 2         21 $mapping->do_mapping;
284             }
285              
286             sub _sam2bam {
287 2     2   8 my ($self) = @_;
288 2         54 my $temporary_directory = $self->_temp_directory;
289              
290 2         46 system(
291             $self->samtools_exec." view -b -o $temporary_directory/mapped.bam -S $temporary_directory/mapped.sam"
292             );
293 2         55 return 1;
294             }
295              
296             sub _sort_bam {
297 2     2   11 my ($self) = @_;
298 2         91 my $temporary_directory = $self->_temp_directory;
299              
300 2         48 my $samtools_obj = Bio::Tradis::Samtools->new(exec => $self->samtools_exec, threads => $self->smalt_n);
301 2         21 $samtools_obj->run_sort("$temporary_directory/mapped.bam","$temporary_directory/mapped.sort.bam");
302 0           $samtools_obj->run_index("$temporary_directory/mapped.sort.bam");
303 0           return 1;
304             }
305              
306             sub _bamcheck {
307 0     0     my ($self) = @_;
308 0           my $temporary_directory = $self->_temp_directory;
309              
310 0           system(
311             $self->samtools_exec." stats $temporary_directory/mapped.sort.bam > $temporary_directory/mapped.bamcheck"
312             );
313 0           return 1;
314             }
315              
316             sub _make_plot {
317 0     0     my ($self) = @_;
318 0           my $temporary_directory = $self->_temp_directory;
319 0           my $ref = $self->reference;
320 0           my $outfile = $self->outfile;
321 0           my $tr_d = $self->tagdirection;
322              
323 0           my $plot = Bio::Tradis::TradisPlot->new(
324             mappedfile => "$temporary_directory/mapped.sort.bam",
325             mapping_score => $self->mapping_score,
326             outfile => "$temporary_directory/$outfile"
327             )->plot;
328              
329             # if tag direction is 5, reverse plot columns
330 0 0         if ( $self->tagdirection eq '5' ) {
331 0 0         print STDERR "Tag direction = 5. Reversing plot..\n" if($self->verbose);
332 0           $self->_reverse_plots;
333             }
334 0           return 1;
335             }
336              
337             sub _reverse_plots {
338 0     0     my ($self) = @_;
339 0           my $temporary_directory = $self->_temp_directory;
340 0           my $outfile = $self->outfile;
341 0           my @seqnames = keys %{ $self->_sequence_info };
  0            
342              
343 0           my @current_plots =
344             glob("$temporary_directory/$outfile.*.insert_site_plot.gz");
345              
346 0           foreach my $plotname (@current_plots) {
347 0 0         print STDERR "Reversing $plotname\n" if($self->verbose);
348              
349             #my $plotname = $self->_plotname($sn);
350 0           system("gunzip -c $plotname > $temporary_directory/tmp.plot");
351 0           system(
352             "awk '{ t = \$1; \$1 = \$2; \$2 = t; print; }' $temporary_directory/tmp.plot > rv_plot"
353             );
354 0           system("gzip -c rv_plot > $plotname");
355             }
356 0           unlink("$temporary_directory/tmp.plot");
357 0           unlink("rv_plot");
358             }
359              
360             sub _stats {
361 0     0     my ($self) = @_;
362 0           my $outfile = $self->outfile;
363 0           my $temporary_directory = $self->_temp_directory;
364 0           my $fq = $self->_unzipped_fastq;
365 0           my $seq_info = $self->_sequence_info;
366              
367             #write header to stats file
368 0           $self->_write_stats_header;
369              
370             # Add file name and number of reads in it
371 0           my @fql = split( "/", $fq );
372 0           my $stats = "$fql[-1],";
373 0           my $total_reads = `wc $fq | awk '{print \$1/4}'`;
374 0           chomp($total_reads);
375 0           $stats .= "$total_reads,";
376              
377             # Matching reads
378 0           my $matching =
379             `wc $temporary_directory/filter.fastq | awk '{print \$1/4}'`;
380 0           chomp($matching);
381 0           $stats .= "$matching,";
382 0           $stats .= ( $matching / $total_reads ) * 100 . ",";
383              
384             # Mapped reads
385 0           my $mapped = $self->_number_of_mapped_reads;
386 0           $stats .= "$mapped,";
387 0           $stats .= ( $mapped / $matching ) * 100 . ",";
388              
389             # Unique insertion sites
390 0           my ( $total_uis, $total_seq_len );
391 0           foreach my $si ( keys %{$seq_info} ) {
  0            
392 0           my $plotname = $self->_plotname($si);
393 0           system(
394             "gunzip -c $temporary_directory/$plotname > $temporary_directory/tmp.plot"
395             );
396 0           my $uis = `grep -c -v "0 0" $temporary_directory/tmp.plot`;
397 0           chomp($uis);
398 0           $total_uis += $uis;
399 0           $stats .= "$uis,";
400 0           my $seqlen = ${$seq_info}{$si};
  0            
401 0           $total_seq_len += $seqlen;
402 0           my $uis_per_seqlen = "NaN";
403 0 0         $uis_per_seqlen = $seqlen / $uis if ( $uis > 0 );
404 0           chomp($uis_per_seqlen);
405 0           $stats .= "$uis_per_seqlen,";
406             }
407 0           $stats .= "$total_uis,";
408 0           my $t_uis_p_l = "NaN";
409 0 0         $t_uis_p_l = $total_seq_len / $total_uis if ( $total_uis > 0 );
410 0           $stats .= "$t_uis_p_l\n";
411 0           print { $self->_stats_handle } $stats;
  0            
412             }
413              
414             sub _write_stats_header {
415 0     0     my ($self) = @_;
416 0           my @seqnames = keys %{ $self->_sequence_info };
  0            
417 0           my @fields = (
418             "File",
419             "Total Reads",
420             "Reads Matched",
421             "\% Matched",
422             "Reads Mapped",
423             "\% Mapped"
424             );
425 0           print { $self->_stats_handle } join( ",", @fields ) . ",";
  0            
426 0           foreach my $sn (@seqnames) {
427 0           print { $self->_stats_handle } "Unique Insertion Sites : $sn,";
  0            
428 0           print { $self->_stats_handle } "Seq Len/UIS : $sn,";
  0            
429             }
430 0           print { $self->_stats_handle } "Total Unique Insertion Sites,";
  0            
431 0           print { $self->_stats_handle } "Total Seq Len/Total UIS\n";
  0            
432             }
433              
434             sub _plotname {
435 0     0     my ( $self, $seq_name ) = @_;
436 0           my $outfile = $self->outfile;
437              
438 0           $seq_name =~ s/[^\w\d\.]/_/g;
439 0           my $plotfile_name = "$outfile.$seq_name.insert_site_plot.gz";
440 0           return $plotfile_name;
441             }
442              
443             sub _number_of_mapped_reads {
444 0     0     my ($self) = @_;
445 0           my $temporary_directory = $self->_temp_directory;
446              
447 0           my $pars =
448             Bio::Tradis::Parser::Bam->new(
449             file => "$temporary_directory/mapped.bam" );
450 0           my $c = 0;
451 0           while ( $pars->next_read ) {
452 0 0         if ( $pars->is_mapped ) {
453 0           $c++;
454             }
455             }
456 0           return $c;
457             }
458              
459             __PACKAGE__->meta->make_immutable;
460 2     2   19 no Moose;
  2         5  
  2         13  
461             1;
462              
463             __END__
464              
465             =pod
466              
467             =encoding UTF-8
468              
469             =head1 NAME
470              
471             Bio::Tradis::RunTradis - Perform all steps required for a tradis analysis
472              
473             =head1 VERSION
474              
475             version 1.3.2
476              
477             =head1 SYNOPSIS
478              
479             Takes a fastq file with tags already attached, filters the tags matching user input,
480             removes the tags, maps to a reference (.fa) and generates insertion site plots for use in
481             Artemis (or other genome browsers), mapped BAM files for each lane and a statistical summary of the analysis.
482              
483             use Bio::Tradis::RunTradis;
484            
485             my $pipeline = Bio::Tradis::RunTradis->new(
486             fastqfile => 'abc',
487             reference => 'abc',
488             tag => 'abc',
489             tagdirection => '5'|'3'
490             );
491             $pipeline->run_tradis();
492              
493             =head1 PARAMETERS
494              
495             =head2 Required
496              
497             =over
498              
499             =item * C<fastqfile> - file containing a list of fastqs (gzipped or raw) to run the
500             complete analysis on. This includes all (including
501             intermediary format conversion and sorting) steps starting from
502             filtering.
503              
504             =item * C<tag> - TraDIS tag to filter and then remove
505              
506             =item * C<reference> - path to/name of reference genome in fasta format (.fa)
507              
508             =back
509              
510             =head2 Optional
511              
512             =over
513              
514             =item * C<mismatch> - number of mismatches to allow when filtering/removing the tag. Default = 0
515              
516             =item * C<tagdirection> - direction of the tag, 5' or 3'. Default = 3
517              
518             =item * C<mapping_score> - cutoff value for mapping score when creating insertion site plots. Default = 30
519              
520             =back
521              
522             =head1 METHODS
523              
524             C<run_tradis> - run complete analysis with given parameters
525              
526             =head1 AUTHOR
527              
528             Carla Cummins <path-help@sanger.ac.uk>
529              
530             =head1 COPYRIGHT AND LICENSE
531              
532             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
533              
534             This is free software, licensed under:
535              
536             The GNU General Public License, Version 3, June 2007
537              
538             =cut