File Coverage

blib/lib/BioX/Workflow/Plugin/Drake.pm

Criterion	Covered	Total	%
statement	4	6	66.6
branch			n/a
condition			n/a
subroutine	2	2	100.0
pod			n/a
total	6	8	75.0

line	stmt	sub	time	code
1				package BioX::Workflow::Plugin::Drake;
2
3				our $VERSION = '0.10';
4	1	1	27713	use Data::Dumper;
	1		10607
	1		81
5	1	1	1580	use Data::Pairs;
	0
	0
6
7				use Moose::Role;
8				#extends 'BioX::Workflow';
9
10				use Interpolation E => 'eval';
11
12				=head1 NAME
13
14				BioX::Workflow::Plugin::Drake - A very opinionated template based bioinformatics workflow writer for Drake.
15
16				=head1 SYNOPSIS
17
18				The main documentation for this module is at L<BioX::Workflow>. This module extends Workflow in order to add functionality for outputting workflows in drake format.
19
20				biox-workflow-drake.pl --workflow workflow.yml > workflow.drake
21				drake --workflow workflow.drake #with other functionality such as --jobs for asynchronous output, etc.
22
23				More information about Drake can be found here L<https://github.com/Factual/drake>.
24
25				=head2 Default Variables
26
27				BioX::Workflow::Plugin::Drake assumes your INPUT/OUTPUT and indir/outdirs are
28				linked.
29
30				This means the output from step1 is the input for step2.
31
32				You can override this behavior by either declaring any of these values, or in the global
33				variables set auto_input: 0, disable automatic indir/outdir naming with
34				auto_name: 0, and disable automatically naming outdirectories by rule names with
35				enforce_struct: 0.
36
37
38				=head2 Example
39
40				=head3 workflow.yml
41
42				---
43				global:
44				- indir: /home/user/workflow
45				- outdir: /home/user/workflow/output
46				- file_rule: (.csv)$
47				rules:
48				- backup:
49				local:
50				- INPUT: "{$self->indir}/{$sample}.csv"
51				- OUTPUT: "{$self->outdir}/{$sample}.csv"
52				- thing: "other thing"
53				process: \|
54				cp $INPUT $OUTPUT
55				- grep_VARA:
56				local:
57				- OUTPUT: "{$self->outdir}/{$sample}.grep_VARA.csv"
58				process: \|
59				echo "Working on {$self->{indir}}/{$sample.csv}"
60				grep -i "VARA" {$self->indir}/{$sample}.csv >> {$self->outdir}/{$sample}.grep_VARA.csv \
61				\|\| touch {$self->OUTPUT}
62				- grep_VARB:
63				local:
64				- OUTPUT: "{$self->outdir}/{$sample}.grep_VARA.grep_VARB.csv"
65				process: \|
66				grep -i "VARB" {$self->indir}/{$sample}.grep_VARA.csv >> {$self->outdir}/{$sample}.grep_VARA.grep_VARB.csv \|\| touch {$self->OUTPUT}
67
68				=head3 Notes on the drake.yml
69
70				Drake will stop everything if your job returns with an exit code of anything
71				besides 0. For this reason we give the last command a command1 \|\| command2
72				syntax, so that even if we don't grep any "VARB" from the file the workflow
73				could continue.
74
75				=head3 Run it with default setup
76
77				biox-workflow-drake.pl --workflow workflow.yml > workflow.full.drake
78
79				=head3 Output with default setup
80
81				I don't want to include the whole file, but you get the idea
82
83				;
84				; Generated at: 2015-06-21T11:01:24
85				; This file was generated with the following options
86				; --workflow drake.yml
87				; --min 1
88				;
89
90				;
91				; Samples: test1, test2
92				;
93				;
94				; Starting Workflow
95				;
96
97				;
98				; Starting backup
99				;
100
101
102				;
103				; Variables
104				; Indir: /home/guests/jir2004/workflow
105				; Outdir: /home/guests/jir2004/workflow/output/backup
106				; Local Variables:
107				; INPUT: {$self->indir}/{$sample}.csv
108				; OUTPUT: {$self->outdir}/{$sample}.csv
109				; thing: other thing
110				;
111
112				/home/guests/jir2004/workflow/output/backup/$[SAMPLE].csv <- /home/guests/jir2004/workflow/$[SAMPLE].csv
113				cp $INPUT $OUTPUT
114
115
116				;
117				; Ending backup
118				;
119
120
121				;
122				; Starting grep_VARA
123				;
124
125
126				Run drake
127
128				drake --workflow workflow.full.drake
129
130				The following steps will be run, in order:
131				1: /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv [timestamped]
132				2: /home/user/workflow/output/backup/test2.csv <- /home/user/workflow/test2.csv [timestamped]
133				3: /home/user/workflow/output/grep_vara/test1.grep_VARA.csv <- /home/user/workflow/output/backup/test1.csv [projected timestamped]
134				4: /home/user/workflow/output/grep_vara/test2.grep_VARA.csv <- /home/user/workflow/output/backup/test2.csv [projected timestamped]
135				5: /home/user/workflow/output/grep_varb/test1.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test1.grep_VARA.csv [projected timestamped]
136				6: /home/user/workflow/output/grep_varb/test2.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test2.grep_VARA.csv [projected timestamped]
137				Confirm? [y/n] y
138				Running 6 steps with concurrence of 1...
139
140				--- 0. Running (timestamped): /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv
141				--- 0: /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv -> done in 0.02s
142
143				--- 1. Running (timestamped): /home/user/workflow/output/backup/test2.csv <- /home/user/workflow/test2.csv
144				--- 1: /home/user/workflow/output/backup/test2.csv <- /home/user/workflow/test2.csv -> done in 0.01s
145
146				--- 2. Running (timestamped): /home/user/workflow/output/grep_vara/test1.grep_VARA.csv <- /home/user/workflow/output/backup/test1.csv
147				Working on /home/user/workflow/output/backup/test1csv
148				--- 2: /home/user/workflow/output/grep_vara/test1.grep_VARA.csv <- /home/user/workflow/output/backup/test1.csv -> done in 0.01s
149
150				--- 3. Running (timestamped): /home/user/workflow/output/grep_vara/test2.grep_VARA.csv <- /home/user/workflow/output/backup/test2.csv
151				Working on /home/user/workflow/output/backup/test2csv
152				--- 3: /home/user/workflow/output/grep_vara/test2.grep_VARA.csv <- /home/user/workflow/output/backup/test2.csv -> done in 0.01s
153
154				--- 4. Running (timestamped): /home/user/workflow/output/grep_varb/test1.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test1.grep_VARA.csv
155				--- 4: /home/user/workflow/output/grep_varb/test1.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test1.grep_VARA.csv -> done in 0.01s
156
157				--- 5. Running (timestamped): /home/user/workflow/output/grep_varb/test2.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test2.grep_VARA.csv
158				--- 5: /home/user/workflow/output/grep_varb/test2.grep_VARA.grep_VARB.csv <- /home/user/workflow/output/grep_vara/test2.grep_VARA.csv -> done in 0.08s
159				Done (6 steps run).
160
161
162				=head3 Run in minified mode
163
164				As an alternative you can run this with the --min option, which instead of
165				printing out each workflow prints out only one, and creates a run-workflow.sh
166				which has all of your environmental variables.
167
168				This option is preferable if running on an HPC cluster with many nodes.
169
170				This WILL break with use of --resample, either local or global. You need to
171				split up your workflows as opposed to using the --resample option.
172
173				biox-workflow-drake.pl --workflow workflow.yml --min 1 > workflow.drake #This also creates the run-workflow.sh in the same directory
174				./run-workflow.sh
175
176				cat drake.log #Here is the log for the first run
177
178				2015-06-21 14:02:47,543 INFO Running 3 steps with concurrence of 1...
179				2015-06-21 14:02:47,568 INFO
180				2015-06-21 14:02:47,570 INFO --- 0. Running (timestamped): /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv
181				2015-06-21 14:02:47,592 INFO --- 0: /home/user/workflow/output/backup/test1.csv <- /home/user/workflow/test1.csv -> done in 0.02s
182
183				#So on and so forth
184
185				If you look in the example directory you will see a few png files, these are outputs of the drake workflow.
186
187				=cut
188
189				=head1 Acknowledgements
190
191				Before version 0.03
192
193				This module was originally developed at and for Weill Cornell Medical
194				College in Qatar within ITS Advanced Computing Team. With approval from
195				WCMC-Q, this information was generalized and put on github, for which
196				the authors would like to express their gratitude.
197
198				As of version 0.03:
199
200				This module's continuing development is supported by NYU Abu Dhabi in the Center for Genomics and Systems Biology.
201				With approval from NYUAD, this information was generalized and put on bitbucket, for which
202				the authors would like to express their gratitude.
203
204				=head1 Inline Code Documentation
205
206				You shouldn't need these, but if you do here they are.
207
208				=head2 Attributes
209
210				=cut
211
212				=head3 full
213
214				Print the whole workflow hardcoded. This is the default
215
216				=cut
217
218				has 'full' => (
219				is => 'rw',
220				isa => 'Bool',
221				default => 1,
222				);
223
224				=head3 min
225
226				Print the workflow as 2 files.
227
228				Run the drake things
229
230				drake --vars "SAMPLE=$sample" --workflow workflow.drake
231
232				workflow.drake
233
234				Our regular file
235
236				=cut
237
238				has 'min' => (
239				is => 'rw',
240				isa => 'Bool',
241				default => 0,
242				);
243
244				=head2 Subroutines
245
246				Subroutines
247
248				=head3 before run
249
250				Must initialize some variables
251
252				=cut
253
254				before 'run' => sub{
255				my($self) = shift;
256
257				if($self->min){
258				$self->full(0);
259				}
260				$self->wait(0);
261				$self->comment_char(';');
262				};
263
264				=head3 after get_samples
265
266				Things to do if we decide to do a min version
267
268				=cut
269
270				after 'get_samples' => sub{
271				my($self) = shift;
272
273				return unless $self->min;
274
275				open(my $fh, '>', 'run-workflow.sh') or die print "Could not open file $!\n";
276
277				print $fh "#!/bin/bash\n\n";
278
279				foreach my $sample (@{$self->samples}){
280				print $fh <
281				drake --vars "SAMPLE=$sample" --workflow workflow.drake
282				EOF
283				}
284
285				close $fh;
286
287				chmod 0777, 'run-workflow.sh';
288
289				$self->samples(["\$SAMPLE"]);
290				};
291
292				=head3 write_process
293
294				Fill in the template with the process
295
296				Ensure INPUT/OUTPUT exist
297
298				Prettify the output a bit
299
300				=cut
301
302				before 'write_process' => sub{
303				my($self) = shift;
304
305				$DB::single=2;
306				if((! $self->local_attr->exists('INPUT')) && ! $self->local_attr->exists('OUTPUT') ){
307				print "$self->{comment_char} There is no INPUT or OUTPUT!\n";
308				}
309
310				#Make the formatting a bit prettier
311				my @tmp = split("\n", $self->process);
312				$self->process(join("\n\t", @tmp));
313				};
314
315				before 'process_template' => sub {
316				my $self = shift;
317
318				my $tmp = "{\$self->OUTPUT} <- {\$self->INPUT}\n\t";
319				$DB::single=2;
320				if($self->min){
321				$tmp =~ s/\$SAMPLE/\$[SAMPLE]/g;
322				}
323				my $newprocess = $tmp.$self->process;
324				$self->process($newprocess);
325
326				};
327
328				1;
329				__END__