| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
undef $VERSION; |
|
2
|
|
|
|
|
|
|
package Bio::Roary::CommandLine::RoaryPostAnalysis; |
|
3
|
|
|
|
|
|
|
$Bio::Roary::CommandLine::RoaryPostAnalysis::VERSION = '3.11.0'; |
|
4
|
|
|
|
|
|
|
# ABSTRACT: Perform the post analysis on the pan genome |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
443002
|
use Moose; |
|
|
1
|
|
|
|
|
8
|
|
|
|
1
|
|
|
|
|
6
|
|
|
8
|
1
|
|
|
1
|
|
6144
|
use Getopt::Long qw(GetOptionsFromArray); |
|
|
1
|
|
|
|
|
7784
|
|
|
|
1
|
|
|
|
|
4
|
|
|
9
|
1
|
|
|
1
|
|
401
|
use Bio::Roary::PostAnalysis; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
43
|
|
|
10
|
1
|
|
|
1
|
|
556
|
use File::Find::Rule; |
|
|
1
|
|
|
|
|
6116
|
|
|
|
1
|
|
|
|
|
7
|
|
|
11
|
1
|
|
|
1
|
|
346
|
use Bio::Roary::External::GeneAlignmentFromNucleotides; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
44
|
|
|
12
|
1
|
|
|
1
|
|
8
|
use File::Path qw(remove_tree); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
86
|
|
|
13
|
1
|
|
|
1
|
|
7
|
use Bio::Roary::External::Fasttree; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
804
|
|
|
14
|
|
|
|
|
|
|
extends 'Bio::Roary::CommandLine::Common'; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# --- Constructor arguments ---------------------------------------------------

# Raw command-line arguments; BUILD parses the switches out of this array.
has 'args'        => ( is => 'ro', isa => 'ArrayRef', required => 1 );

# Required by the constructor (presumably the name of the invoking script).
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );

# Set by -h; run() dies with the usage text when this is true.
has 'help'        => ( is => 'rw', isa => 'Bool', default => 0 );

# When defined, run() prints this message and dies with the usage text.
has '_error_message' => ( is => 'rw', isa => 'Str' );

# --- Input and output file names ---------------------------------------------

# -f / -i: files that run() reads line by line to obtain the lists handed to
# Bio::Roary::PostAnalysis as fasta_files and input_files.
has 'fasta_files' => ( is => 'rw', isa => 'Str', default => '_fasta_files' );
has 'input_files' => ( is => 'rw', isa => 'Str', default => '_gff_files' );

# -o, -p, -s and -c: file names forwarded unchanged to Bio::Roary::PostAnalysis.
has 'output_filename'             => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str', default => 'pan_genome.fa' );
has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str', default => 'gene_presence_absence.csv' );
has 'clusters_filename'           => ( is => 'rw', isa => 'Str', default => '_clustered.clstr' );

# -m: when set to 1, run() also aligns the files found by _find_input_files.
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );

# --- Execution settings --------------------------------------------------------

# -j: run() only keeps this value for the alignment step when it is "LSF" and
# the LSF job runner module can be loaded; otherwise 'Parallel' is used.
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local' );

# -z: number of threads.
has 'cpus'       => ( is => 'rw', isa => 'Int', default => 1 );

# -a: when left at 0, run() removes '_inflated_unsplit_mcl_groups' and the
# 'split_groups' directory after the analysis.
has 'dont_delete_files'  => ( is => 'rw', isa => 'Bool', default => 0 );

# -b, -d and -e: flags forwarded to Bio::Roary::PostAnalysis.
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_split_groups'  => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats'      => ( is => 'rw', isa => 'Bool', default => 0 );

# -t: translation table handed to the alignment step.
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );

# -g: maximum number of clusters.
has 'group_limit'       => ( is => 'rw', isa => 'Num', default => 50000 );

# -cd: stored as a fraction; BUILD divides the command-line value by 100.
has 'core_definition'   => ( is => 'rw', isa => 'Num', default => 0.99 );

# -v: verbose output; BUILD also changes the logger level when it is supplied.
has 'verbose'        => ( is => 'rw', isa => 'Bool', default => 0 );

# -n and -q: flags handed to the alignment step.
has 'mafft'          => ( is => 'rw', isa => 'Bool', default => 0 );
has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Moose construction hook. Parses the command-line switches held in
# $self->args and copies every switch that was actually supplied onto the
# matching attribute; attributes whose switch is absent are left untouched.
#
# Parameters:
#   $self - the object under construction.
sub BUILD {
    my ($self) = @_;

    # Getopt::Long specification => attribute that receives the parsed value.
    my @switch_table = (
        'o|output=s'                => 'output_filename',
        'j|job_runner=s'            => 'job_runner',
        'm|output_multifasta_files' => 'output_multifasta_files',
        'p=s'                       => 'output_pan_geneome_filename',
        's=s'                       => 'output_statistics_filename',
        'c=s'                       => 'clusters_filename',
        'f=s'                       => 'fasta_files',
        'i=s'                       => 'input_files',
        'a|dont_delete_files'       => 'dont_delete_files',
        'b|dont_create_rplots'      => 'dont_create_rplots',
        'd|dont_split_groups'       => 'dont_split_groups',
        'e|verbose_stats'           => 'verbose_stats',
        'z|processors=i'            => 'cpus',
        't|translation_table=i'     => 'translation_table',
        'g|group_limit=i'           => 'group_limit',
        'cd|core_definition=f'      => 'core_definition',
        'v|verbose'                 => 'verbose',
        'n|mafft'                   => 'mafft',
        'q|allow_paralogs'          => 'allow_paralogs',
        'h|help'                    => 'help',
    );

    # Build the argument list for Getopt::Long: every specification is paired
    # with a reference to its own slot in %supplied, which stays undefined
    # unless the switch appears on the command line.
    my ( %supplied, @getopt_spec, @attribute_names );
    for ( my $index = 0 ; $index < scalar(@switch_table) ; $index += 2 ) {
        my ( $spec, $attribute ) = @switch_table[ $index, $index + 1 ];
        push( @attribute_names, $attribute );
        push( @getopt_spec, $spec, \$supplied{$attribute} );
    }

    GetOptionsFromArray( $self->args, @getopt_spec );

    for my $attribute (@attribute_names) {
        my $value = $supplied{$attribute};
        next if ( !defined($value) );

        # -cd arrives as a percentage but the attribute holds a fraction.
        $value = $value / 100 if ( $attribute eq 'core_definition' );

        $self->$attribute($value);

        # Supplying -v also adjusts the logger level.
        $self->logger->level(10000) if ( $attribute eq 'verbose' );
    }
}
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Runs the post analysis and, when requested, the per-cluster alignment.
#
# Steps:
#   1. Die with the usage text when help was requested or an error message
#      has been recorded (the message is printed first).
#   2. Read the lists of GFF and protein file names and run
#      Bio::Roary::PostAnalysis with the settings held on this object.
#   3. Unless dont_delete_files is set, remove the intermediate
#      '_inflated_unsplit_mcl_groups' file and the 'split_groups' directory.
#   4. When output_multifasta_files is 1, align every file returned by
#      _find_input_files with Bio::Roary::External::GeneAlignmentFromNucleotides.
#
# Parameters:
#   $self - the command-line object.
sub run {
    my ($self) = @_;

    die $self->usage_text if ( $self->help );

    my $error_message = $self->_error_message;
    if ( defined($error_message) ) {
        print $error_message . "\n";
        die $self->usage_text;
    }

    my $input_files = $self->_read_file_into_array( $self->input_files );
    my $fasta_files = $self->_read_file_into_array( $self->fasta_files );

    my %post_analysis_settings = (
        fasta_files                 => $fasta_files,
        input_files                 => $input_files,
        output_filename             => $self->output_filename,
        output_pan_geneome_filename => $self->output_pan_geneome_filename,
        output_statistics_filename  => $self->output_statistics_filename,
        output_multifasta_files     => $self->output_multifasta_files,
        clusters_filename           => $self->clusters_filename,
        dont_delete_files           => $self->dont_delete_files,
        dont_create_rplots          => $self->dont_create_rplots,
        dont_split_groups           => $self->dont_split_groups,
        verbose_stats               => $self->verbose_stats,
        group_limit                 => $self->group_limit,
        verbose                     => $self->verbose,
        cpus                        => $self->cpus,
        logger                      => $self->logger,
        core_definition             => $self->core_definition,
    );
    Bio::Roary::PostAnalysis->new(%post_analysis_settings)->run();

    # Intermediate artefacts are only kept on request.
    if ( $self->dont_delete_files == 0 ) {
        unlink('_inflated_unsplit_mcl_groups');
        remove_tree('split_groups');
    }

    # Kept as the final statement so the implicit return value is unchanged.
    if ( $self->output_multifasta_files == 1 ) {
        print "Aligning each cluster\n" if ( $self->verbose );

        # Only an available LSF runner is honoured; anything else runs in parallel.
        my $job_runner_to_use =
          ( $self->_is_lsf_job_runner_available && $self->job_runner eq "LSF" )
          ? $self->job_runner
          : 'Parallel';

        my $aligner = Bio::Roary::External::GeneAlignmentFromNucleotides->new(
            fasta_files       => $self->_find_input_files,
            job_runner        => $job_runner_to_use,
            translation_table => $self->translation_table,
            core_definition   => $self->core_definition,
            cpus              => $self->cpus,
            verbose           => $self->verbose,
            mafft             => $self->mafft,
            allow_paralogs    => $self->allow_paralogs,
            dont_delete_files => $self->dont_delete_files,

            # NOTE(review): this is the last index of the list, i.e. one less
            # than the number of input files - confirm that the consumer of
            # num_input_files expects that.
            num_input_files => scalar( @{$input_files} ) - 1,
        );
        $aligner->run();
    }
}
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# Reports whether the optional Bio::Roary::JobRunner::LSF module can be loaded.
#
# Parameters:
#   $self - the command-line object (not otherwise used).
# Returns:
#   1 when the module loads, 0 when loading it fails.
sub _is_lsf_job_runner_available
{
    my ($self) = @_;

    # The trailing 1 gives a defined, true result only when require succeeded;
    # a failed require leaves $loaded undefined.
    my $loaded = eval { require Bio::Roary::JobRunner::LSF; 1; };

    return ( defined($loaded) && $loaded == 1 ) ? 1 : 0;
}
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Collects the per-gene sequence files that the alignment step works on.
#
# Parameters:
#   $self - the command-line object (not otherwise used).
# Returns:
#   An array reference with every plain file matching '*.fa' that
#   File::Find::Rule finds in the 'pan_genome_sequences' directory.
sub _find_input_files
{
    my ($self) = @_;

    my $fasta_rule = File::Find::Rule->file();
    $fasta_rule->name( '*.fa' );

    my @files = $fasta_rule->in('pan_genome_sequences' );
    return \@files;
}
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# Reads a text file and returns its lines as an array reference.
#
# Parameters:
#   $self     - the command-line object (not otherwise used).
#   $filename - path of the file to read.
# Returns:
#   An array reference holding one element per line, in file order, with the
#   trailing newline removed.
# Dies:
#   When the file cannot be opened or closed. Previously a failed open was
#   ignored and an empty list was returned.
sub _read_file_into_array
{
    my ( $self, $filename ) = @_;

    # Three-argument open: the name is always treated as a plain path to read,
    # never as a mode or a command.
    open( my $in_fh, '<', $filename )
      or die "Cannot open '$filename' for reading: $!\n";

    my @filenames;

    # A lexical line variable keeps the caller's $_ intact.
    while ( my $line = <$in_fh> ) {
        chomp($line);
        push( @filenames, $line );
    }

    close($in_fh) or die "Cannot close '$filename': $!\n";

    return \@filenames;
}
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# Returns the help text for the script.
#
# run() dies with this text when -h is given or when an error message has been
# recorded. The default shown for -cd is the command-line value (a
# percentage): BUILD divides the supplied number by 100 before storing it, so
# the stored default of 0.99 corresponds to 99 on the command line.
#
# Parameters:
#   $self - the command-line object (not otherwise used).
# Returns:
#   The usage message as a single multi-line string.
sub usage_text {
    my ($self) = @_;

    return <<USAGE;
Usage: pan_genome_post_analysis [options]
Perform the post analysis on the pan genome. This script is usally only called by another script.

Options: -a        dont delete intermediate files
         -b        dont create R plots
         -c STR    clusters filename [_clustered.clstr]
         -cd FLOAT percentage of isolates a gene must be in to be core [99]
         -d        dont split groups
         -e        add inference values to gene presence and absence spreadsheet
         -f STR    file of protein filenames [_fasta_files]
         -g INT    maximum number of clusters [50000]
         -i STR    file of GFF filenames [_gff_files]
         -m        core gene alignement with PRANK
         -n        fast core gene alignement with MAFFT instead of PRANK
         -o STR    clusters output filename [clustered_proteins]
         -p STR    output pan genome filename [pan_genome.fa]
         -q        allow paralogs in core alignment
         -s STR    output gene presence and absence filename [gene_presence_absence.csv]
         -t INT    translation table [11]
         -z INT    number of threads [1]
         -v        verbose output to STDOUT
         -h        this help message

For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
# Finalise the class: make the metaclass immutable, then remove the Moose
# keywords from this package.
__PACKAGE__->meta->make_immutable;
no Moose;

# A module has to end with a true value.
1;
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
__END__ |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=pod |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=encoding UTF-8 |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 NAME |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Bio::Roary::CommandLine::RoaryPostAnalysis - Perform the post analysis on the pan genome |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head1 VERSION |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
version 3.11.0 |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
Perform the post analysis on the pan genome |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head1 AUTHOR |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
This is free software, licensed under: |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=cut |