File Coverage

lib/Bio/Roary/CommandLine/Roary.pm
Criterion Covered Total %
statement 96 150 64.0
branch 37 90 41.1
condition 3 18 16.6
subroutine 16 18 88.8
pod 0 3 0.0
total 152 279 54.4


line stmt bran cond sub pod time code
1             undef $VERSION;
2              
3             package Bio::Roary::CommandLine::Roary;
4             $Bio::Roary::CommandLine::Roary::VERSION = '3.10.1';
5             # ABSTRACT: Take in FASTA files of proteins and cluster them
6              
7              
8 1     1   783 use Moose;
  1         3  
  1         7  
9 1     1   6602 use Getopt::Long qw(GetOptionsFromArray);
  1         8248  
  1         4  
10 1     1   500 use Bio::Roary;
  1         4  
  1         43  
11 1     1   414 use Bio::Roary::PrepareInputFiles;
  1         3  
  1         42  
12 1     1   452 use Bio::Roary::QC::Report;
  1         3  
  1         39  
13 1     1   447 use Bio::Roary::ReformatInputGFFs;
  1         12  
  1         86  
14 1     1   439 use Bio::Roary::External::CheckTools;
  1         3  
  1         34  
15 1     1   7 use File::Which;
  1         2  
  1         62  
16 1     1   5 use File::Path qw(make_path);
  1         2  
  1         40  
17 1     1   6 use Cwd qw(abs_path getcwd);
  1         2  
  1         37  
18 1     1   5 use File::Temp;
  1         2  
  1         1446  
19             extends 'Bio::Roary::CommandLine::Common';
20              
21             has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
22             has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
23             has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );
24              
25             has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
26             has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
27             has 'output_directory' => ( is => 'rw', isa => 'Str', default => '.' );
28             has '_original_directory' => ( is => 'rw', isa => 'Str', default => '.' );
29             has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local' );
30             has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' );
31             has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' );
32             has 'mcxdeblast_exec' => ( is => 'rw', isa => 'Str', default => 'mcxdeblast' );
33             has 'mcl_exec' => ( is => 'rw', isa => 'Str', default => 'mcl' );
34             has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
35             has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 );
36             has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );
37             has 'perc_identity' => ( is => 'rw', isa => 'Num', default => 95 );
38             has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
39             has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 1 );
40             has 'dont_run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );
41             has 'dont_split_groups' => ( is => 'rw', isa => 'Bool', default => 0 );
42             has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
43             has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
44             has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 );
45             has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
46             has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );
47             has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );
48             has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
49             has 'kraken_db' => ( is => 'rw', isa => 'Str', default => '/lustre/scratch118/infgen/pathogen/pathpipe/kraken/minikraken_20140330/' );
50             has 'run_qc' => ( is => 'rw', isa => 'Bool', default => 0 );
51             has '_working_directory' => ( is => 'rw', isa => 'File::Temp::Dir', lazy => 1, builder => '_build__working_directory' );
52              
53             has 'inflation_value' => ( is => 'rw', isa => 'Num', default => 1.5 );
54              
55             sub _build__working_directory
56             {
57 1     1   3 my ($self) = @_;
58 1         25 return File::Temp->newdir( DIR => getcwd, CLEANUP => 1 );
59             }
60              
61             sub BUILD {
62 1     1 0 4 my ($self) = @_;
63              
64             my (
65 1         4 $fasta_files, $verbose, $create_rplots, $group_limit, $dont_run_qc,
66             $max_threads, $dont_delete_files, $dont_split_groups, $perc_identity, $output_filename,
67             $job_runner, $makeblastdb_exec, $mcxdeblast_exec, $mcl_exec, $blastp_exec,
68             $apply_unknowns_filter, $cpus, $output_multifasta_files, $verbose_stats, $translation_table,
69             $run_qc, $core_definition, $help, $kraken_db, $cmd_version,
70             $mafft, $output_directory, $check_dependancies, $inflation_value, $allow_paralogs,
71             );
72              
73 1         38 GetOptionsFromArray(
74             $self->args,
75             'o|output=s' => \$output_filename,
76             'f|output_directory=s' => \$output_directory,
77             'j|job_runner=s' => \$job_runner,
78             'm|makeblastdb_exec=s' => \$makeblastdb_exec,
79             'b|blastp_exec=s' => \$blastp_exec,
80             'd|mcxdeblast_exec=s' => \$mcxdeblast_exec,
81             'c|mcl_exec=s' => \$mcl_exec,
82             'p|processors=i' => \$cpus,
83             'u|apply_unknowns_filter=i' => \$apply_unknowns_filter,
84             'e|output_multifasta_files' => \$output_multifasta_files,
85             'i|perc_identity=i' => \$perc_identity,
86             'z|dont_delete_files' => \$dont_delete_files,
87             's|dont_split_groups' => \$dont_split_groups,
88             'r|create_rplots' => \$create_rplots,
89             'y|verbose_stats' => \$verbose_stats,
90             't|translation_table=i' => \$translation_table,
91             'g|group_limit=i' => \$group_limit,
92             'qc|run_qc' => \$run_qc,
93             'x|dont_run_qc' => \$dont_run_qc,
94             'cd|core_definition=f' => \$core_definition,
95             'v|verbose' => \$verbose,
96             'n|mafft' => \$mafft,
97             'ap|allow_paralogs' => \$allow_paralogs,
98             'k|kraken_db=s' => \$kraken_db,
99             'w|version' => \$cmd_version,
100             'a|check_dependancies' => \$check_dependancies,
101             'iv|inflation_value=f' => \$inflation_value,
102             'h|help' => \$help,
103             );
104              
105 1 50       2813 $self->version($cmd_version) if ( defined($cmd_version) );
106 1 50       46 if ( $self->version ) {
107 0         0 print $self->_version() ;
108 0         0 return;
109             }
110              
111 1         16 print "\nPlease cite Roary if you use any of the results it produces:
112             Andrew J. Page, Carla A. Cummins, Martin Hunt, Vanessa K. Wong, Sandra Reuter, Matthew T. G. Holden, Maria Fookes, Daniel Falush, Jacqueline A. Keane, Julian Parkhill,
113             \"Roary: Rapid large-scale prokaryote pan genome analysis\", Bioinformatics, 2015 Nov 15;31(22):3691-3693
114             doi: http://doi.org/10.1093/bioinformatics/btv421
115             Pubmed: 26198102\n\n";
116              
117 1 50       3 $self->help($help) if ( defined($help) );
118 1 50       32 if( $self->help )
119             {
120 0         0 print $self->usage_text;
121 0         0 return;
122             }
123              
124 1 50       4 if ($check_dependancies) {
125 0         0 my $check_tools = Bio::Roary::External::CheckTools->new();
126 0         0 $check_tools->check_all_tools;
127 0         0 $self->logger->error( "Roary version " . $self->_version() );
128             }
129              
130 1 50       5 if ( defined($verbose) ) {
131 0         0 $self->verbose($verbose);
132 0         0 $self->logger->level(10000);
133             }
134              
135 1 50       2 if ( @{ $self->args } < 2 ) {
  1         25  
136 0         0 $self->logger->error("Error: You need to provide at least 2 files to build a pan genome");
137 0         0 die $self->usage_text;
138             }
139 1 50       3 $self->output_filename($output_filename) if ( defined($output_filename) );
140 1 50       29 $self->job_runner($job_runner) if ( defined($job_runner) );
141 1 50       3 $self->makeblastdb_exec($makeblastdb_exec) if ( defined($makeblastdb_exec) );
142 1 50       3 $self->blastp_exec($blastp_exec) if ( defined($blastp_exec) );
143 1 50       2 $self->mcxdeblast_exec($mcxdeblast_exec) if ( defined($mcxdeblast_exec) );
144 1 50       4 $self->mcl_exec($mcl_exec) if ( defined($mcl_exec) );
145 1 50       3 $self->cpus($cpus) if ( defined($cpus) );
146 1 50       2 $self->inflation_value($inflation_value) if ( defined($inflation_value));
147              
148 1 50       6 if ( defined($perc_identity) ) {
149 0         0 $self->perc_identity($perc_identity);
150 0 0       0 if ( $perc_identity < 50 ) {
151 0         0 $self->logger->error(
152             "The percentage identity is too low. Either something is wrong with your data, like contamination, or your doing something that the software isnt designed to support."
153             );
154             }
155             }
156              
157 1 50       3 $self->mafft($mafft) if ( defined($mafft) );
158 1 50       3 $self->apply_unknowns_filter($apply_unknowns_filter)
159             if ( defined($apply_unknowns_filter) );
160              
161 1 50       3 if ( defined($output_multifasta_files) ) {
162 0 0       0 if ( which('prank') ) {
163 0         0 $self->output_multifasta_files($output_multifasta_files);
164             }
165             else {
166              
167 0 0       0 if ( which('mafft') ) {
168 0         0 $self->output_multifasta_files($output_multifasta_files);
169 0         0 $self->mafft(1);
170 0         0 $self->logger->warn("PRANK not found in your PATH so using MAFFT instead to generate multiFASTA alignments.");
171             }
172             else {
173 0         0 $self->logger->warn("PRANK (or MAFFT) not found in your PATH so cannot generate multiFASTA alignments, skipping for now.");
174             }
175             }
176             }
177 1 50       3 $self->dont_delete_files($dont_delete_files) if ( defined($dont_delete_files) );
178 1 50       32 $self->dont_split_groups($dont_split_groups) if ( defined($dont_split_groups) );
179 1 50       3 $self->dont_create_rplots(0) if ( defined($create_rplots) );
180 1 50       3 $self->verbose_stats($verbose_stats) if ( defined $verbose_stats );
181 1 50       34 $self->translation_table($translation_table) if ( defined($translation_table) );
182 1 50       4 $self->group_limit($group_limit) if ( defined($group_limit) );
183 1 50       2 $self->kraken_db($kraken_db) if ( defined($kraken_db) );
184 1 50       2 $self->output_directory($output_directory) if ( defined($output_directory) );
185              
186 1 50 33     4 if ( defined $verbose_stats && defined($output_multifasta_files) ) {
187 0         0 $self->verbose_stats(0);
188 0         0 $self->logger->warn("The verbose stats spreadsheet is not compatible with the core gene alignement so disabling verbose_stats");
189             }
190              
191 1 50       3 if ( defined($run_qc) ) {
192 0 0 0     0 if ( which('kraken') && which('kraken-report') ) {
193 0         0 $self->run_qc($run_qc);
194             }
195             else {
196 0         0 $self->logger->warn("kraken or kraken-report not found in your PATH so cannot run QC, skipping for now.");
197             }
198             }
199              
200 1 50       27 if ( $self->cpus > 1 ) {
201 0         0 $self->job_runner('Parallel');
202             }
203              
204 1 50       3 $self->core_definition( $core_definition / 100 ) if ( defined($core_definition) );
205              
206 1         3 for my $filename ( @{ $self->args } ) {
  1         22  
207 3 50       40 if ( !-e $filename ) {
208 0         0 $self->logger->error("Error: Cant access file $filename");
209 0         0 die $self->usage_text;
210             }
211 3         5 push( @{ $self->fasta_files }, abs_path($filename) );
  3         76  
212             }
213              
214 1 50       29 $self->_working_directory( File::Temp->newdir( DIR => getcwd, CLEANUP => 0 ) ) if ( $self->dont_delete_files );
215             }
216              
217             sub _setup_output_directory {
218 1     1   3 my ($self) = @_;
219 1 50 33     34 return if ( $self->output_directory eq '.' || $self->output_directory eq '' );
220              
221 0 0 0     0 if ( -e $self->output_directory || -d $self->output_directory ) {
222 0         0 $self->logger->warn("Output directory name exists already so adding a timestamp to the end");
223 0         0 $self->output_directory( $self->output_directory() . '_' . time() );
224 0 0 0     0 if ( -e $self->output_directory || -d $self->output_directory ) {
225 0         0 die("Output directory name with time stamp exist so giving up");
226             }
227             }
228 0         0 make_path( $self->output_directory, { error => \my $err } );
229 0 0       0 if (@$err) {
230 0         0 for my $diag (@$err) {
231 0         0 my ( $file, $message ) = %$diag;
232 0         0 die("Error creating output directory $message");
233             }
234             }
235 0         0 $self->logger->info( "Output directory created: " . $self->output_directory );
236              
237 0         0 $self->_original_directory( getcwd() );
238 0         0 chdir( $self->output_directory );
239 0         0 return $self;
240             }
241              
242             sub run {
243 1     1 0 4 my ($self) = @_;
244            
245 1 50 33     22 return if($self->version || $self->help);
246              
247 1         5 $self->_setup_output_directory;
248              
249 1         42 $self->logger->info("Fixing input GFF files");
250 1         39 my $reformat_input_files = Bio::Roary::ReformatInputGFFs->new( gff_files => $self->fasta_files, logger => $self->logger );
251 1         9 $reformat_input_files->fix_duplicate_gene_ids();
252 1 50       2 if ( @{ $reformat_input_files->fixed_gff_files } == 0 ) {
  1         27  
253 0         0 die(
254             "All input files have been excluded from analysis. Please check you have valid GFF files, with annotation and a FASTA sequence at the end. Better still, reannotate your FASTA file with PROKKA."
255             );
256             }
257 1         23 $self->fasta_files( $reformat_input_files->fixed_gff_files );
258              
259 1         27 $self->logger->info("Extracting proteins from GFF files");
260 1         34 my $prepare_input_files = Bio::Roary::PrepareInputFiles->new(
261             input_files => $self->fasta_files,
262             job_runner => $self->job_runner,
263             apply_unknowns_filter => $self->apply_unknowns_filter,
264             cpus => $self->cpus,
265             translation_table => $self->translation_table,
266             verbose => $self->verbose,
267             working_directory => $self->_working_directory,
268              
269             );
270              
271 1 50       35 if ( $self->run_qc ) {
272 0         0 $self->logger->info("Running Kraken on each input assembly");
273 0         0 my $qc_input_files = Bio::Roary::QC::Report->new(
274             input_files => $self->fasta_files,
275             job_runner => $self->job_runner,
276             cpus => $self->cpus,
277             verbose => $self->verbose,
278             kraken_db => $self->kraken_db
279             );
280 0         0 $qc_input_files->report;
281             }
282              
283 1         25 my $pan_genome_obj = Bio::Roary->new(
284             input_files => $self->fasta_files,
285             fasta_files => $prepare_input_files->fasta_files,
286             output_filename => $self->output_filename,
287             job_runner => $self->job_runner,
288             cpus => $self->cpus,
289             makeblastdb_exec => $self->makeblastdb_exec,
290             blastp_exec => $self->blastp_exec,
291             output_multifasta_files => $self->output_multifasta_files,
292             perc_identity => $self->perc_identity,
293             dont_delete_files => $self->dont_delete_files,
294             dont_create_rplots => $self->dont_create_rplots,
295             dont_split_groups => $self->dont_split_groups,
296             verbose_stats => $self->verbose_stats,
297             translation_table => $self->translation_table,
298             group_limit => $self->group_limit,
299             core_definition => $self->core_definition,
300             verbose => $self->verbose,
301             mafft => $self->mafft,
302             allow_paralogs => $self->allow_paralogs,
303             inflation_value => $self->inflation_value,
304             );
305 1         11 $pan_genome_obj->run();
306              
307 0           chdir( $self->_original_directory );
308             }
309              
310             sub _version {
311 0     0     my ($self) = @_;
312 0 0         if ( defined($Bio::Roary::CommandLine::Roary::VERSION) ) {
313 0           return $Bio::Roary::CommandLine::Roary::VERSION . "\n";
314             }
315             else {
316 0           return "x.y.z\n";
317             }
318             }
319              
320             sub usage_text {
321 0     0 0   my ($self) = @_;
322              
323 0           return <<USAGE;
324             Usage: roary [options] *.gff
325              
326             Options: -p INT number of threads [1]
327             -o STR clusters output filename [clustered_proteins]
328             -f STR output directory [.]
329             -e create a multiFASTA alignment of core genes using PRANK
330             -n fast core gene alignment with MAFFT, use with -e
331             -i minimum percentage identity for blastp [95]
332             -cd FLOAT percentage of isolates a gene must be in to be core [99]
333             -qc generate QC report with Kraken
334             -k STR path to Kraken database for QC, use with -qc
335             -a check dependancies and print versions
336             -b STR blastp executable [blastp]
337             -c STR mcl executable [mcl]
338             -d STR mcxdeblast executable [mcxdeblast]
339             -g INT maximum number of clusters [50000]
340             -m STR makeblastdb executable [makeblastdb]
341             -r create R plots, requires R and ggplot2
342             -s dont split paralogs
343             -t INT translation table [11]
344             -ap allow paralogs in core alignment
345             -z dont delete intermediate files
346             -v verbose output to STDOUT
347             -w print version and exit
348             -y add gene inference information to spreadsheet, doesnt work with -e
349             -iv STR Change the MCL inflation value [1.5]
350             -h this help message
351              
352             Example: Quickly generate a core gene alignment using 8 threads
353             roary -e --mafft -p 8 *.gff
354              
355             For further info see: http://sanger-pathogens.github.io/Roary/
356             USAGE
357             }
358              
359             __PACKAGE__->meta->make_immutable;
360 1     1   7 no Moose;
  1         6  
  1         6  
361             1;
362              
363             __END__
364              
365             =pod
366              
367             =encoding UTF-8
368              
369             =head1 NAME
370              
371             Bio::Roary::CommandLine::Roary - Take in FASTA files of proteins and cluster them
372              
373             =head1 VERSION
374              
375             version 3.10.1
376              
377             =head1 SYNOPSIS
378              
379             Take in FASTA files of proteins and cluster them
380              
381             =head1 AUTHOR
382              
383             Andrew J. Page <ap13@sanger.ac.uk>
384              
385             =head1 COPYRIGHT AND LICENSE
386              
387             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
388              
389             This is free software, licensed under:
390              
391             The GNU General Public License, Version 3, June 2007
392              
393             =cut