File Coverage

lib/Bio/MLST/Check.pm

Criterion	Covered	Total	%
statement	60	88	68.1
branch	8	16	50.0
condition			n/a
subroutine	15	16	93.7
pod	1	2	50.0
total	84	122	68.8

line	stmt	bran	sub	pod	time	code
1						package Bio::MLST::Check;
2						# ABSTRACT: Multilocus sequence type checking using blast
3						$Bio::MLST::Check::VERSION = '2.1.1706216';
4
5	9		9		385923	use Moose;
	9				1178201
	9				45
6	9		9		43491	use Parallel::ForkManager;
	9				108962
	9				207
7	9		9		2783	use Bio::MLST::ProcessFasta;
	9				20
	9				343
8	9		9		4449	use Bio::MLST::Spreadsheet::File;
	9				22
	9				311
9	9		9		4092	use Bio::MLST::NormaliseFasta;
	9				21
	9				303
10	9		9		5713	use Bio::AlignIO;
	9				251833
	9				259
11	9		9		82	use Bio::SimpleAlign;
	9				11
	9				137
12	9		9		29	use File::Temp;
	9				18
	9				733
13	9		9		36	use Cwd;
	9				16
	9				6717
14
15						has 'species' => ( is => 'ro', isa => 'Str', required => 1 );
16						has 'base_directory' => ( is => 'ro', isa => 'Str', required => 1 );
17						has 'raw_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
18						has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', required => 1 );
19						has 'blastn_exec' => ( is => 'ro', isa => 'Str', required => 1 );
20						has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
21						has 'output_fasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
22						has 'spreadsheet_basename' => ( is => 'ro', isa => 'Str', default => 'mlst_results' );
23						has 'output_phylip_files' => ( is => 'ro', isa => 'Bool', default => 0 );
24						has 'show_contamination_instead_of_alt_matches' => ( is => 'ro', isa => 'Bool', default => 1 );
25						has 'report_lowest_st' => ( is => 'ro', isa => 'Bool', default => 0 );
26
27						has 'parallel_processes' => ( is => 'ro', isa => 'Int', default => 1 );
28
29						has '_spreadsheet_header' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
30						has '_spreadsheet_allele_numbers_rows' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
31						has '_spreadsheet_genomic_rows' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
32						has '_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__input_fasta_files');
33
34						has '_concat_names' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
35						has '_concat_sequences' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
36						has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir(DIR => getcwd, CLEANUP => 1); });
37
38						sub _generate_spreadsheet_rows
39						{
40	8		8		18	my($self) = @_;
41
42	8				270	my $pm = new Parallel::ForkManager($self->parallel_processes);
43						$pm -> run_on_finish (
44						sub {
45	4		4		12016142	my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $data_structure_reference) = @_;
46						# retrieve data structure from child
47	4	50			90	if (defined($data_structure_reference)) { # children are not forced to send anything
48	0				0	my ($header_row, $allele_numbers_row, $genomic_row, $concat_name, $concat_sequence) = @{$data_structure_reference};
	0				0
49	0				0	push(@{$self->_spreadsheet_header}, $header_row);
	0				0
50	0				0	push(@{$self->_spreadsheet_allele_numbers_rows}, $allele_numbers_row);
	0				0
51	0				0	push(@{$self->_spreadsheet_genomic_rows}, $genomic_row);
	0				0
52
53	0				0	push(@{$self->_concat_names}, $concat_name);
	0				0
54	0				0	push(@{$self->_concat_sequences}, $concat_sequence);
	0				0
55
56						} else { # problems occurring during storage or retrieval will throw a warning
57	4				1031	print qq\|No message received from child process $pid!\n\|;
58						}
59						}
60	8				2526	);
61
62	8				106	for my $fastafile (@{$self->_input_fasta_files})
	8				196
63						{
64	8	100			40	$pm->start and next; # do the fork
65
66	4				8914	my $output_fasta_obj = Bio::MLST::NormaliseFasta->new(
67						fasta_filename => $fastafile,
68						working_directory => $self->_working_directory->dirname()
69						);
70
71	4				150	my $fasta_sequence_type_results = Bio::MLST::ProcessFasta->new(
72						species => $self->species,
73						base_directory => $self->base_directory,
74						fasta_file => $output_fasta_obj->processed_fasta_filename(),
75						makeblastdb_exec => $self->makeblastdb_exec,
76						blastn_exec => $self->blastn_exec,
77						output_directory => $self->output_directory,
78						output_fasta_files => $self->output_fasta_files,
79						show_contamination_instead_of_alt_matches => $self->show_contamination_instead_of_alt_matches,
80						report_lowest_st => $self->report_lowest_st
81						);
82	4				11	my @result_rows;
83	4				184	push(@result_rows, ($fasta_sequence_type_results->_spreadsheet_row_obj->header_row,
84						$fasta_sequence_type_results->_spreadsheet_row_obj->allele_numbers_row,
85						$fasta_sequence_type_results->_spreadsheet_row_obj->genomic_row,
86						$fasta_sequence_type_results->concat_name,
87						$fasta_sequence_type_results->concat_sequence));
88
89	0				0	$pm->finish(0,\@result_rows); # do the exit in the child process
90						}
91	4				6397	$pm->wait_all_children;
92	4				96	1;
93						}
94
95						sub _build__input_fasta_files
96						{
97	14		14		18	my($self) = @_;
98	14				292	return $self->raw_input_fasta_files;
99						}
100
101						sub create_result_files
102						{
103	8		8	1	20	my($self) = @_;
104	8	50			38	exit 1 unless $self->input_fasta_files_exist;
105	8				34	$self->_generate_spreadsheet_rows;
106
107						my $spreadsheet = Bio::MLST::Spreadsheet::File->new(
108	4				2863	header => pop(@{$self->_spreadsheet_header}),
	4				264
109						spreadsheet_allele_numbers_rows => $self->_spreadsheet_allele_numbers_rows,
110						spreadsheet_genomic_rows => $self->_spreadsheet_genomic_rows,
111						output_directory => $self->output_directory,
112						spreadsheet_basename => $self->spreadsheet_basename
113						);
114	0				0	$spreadsheet->create();
115
116	0	0			0	if($self->output_fasta_files)
117						{
118	0				0	$self->_create_alignment('Fasta','fa');
119						}
120
121	0	0			0	if($self->output_phylip_files)
122						{
123	0				0	$self->_create_alignment('phylip','phylip');
124						}
125	0				0	1;
126						}
127
128						sub _create_alignment
129						{
130	0		0		0	my($self, $format, $extension) = @_;
131
132	0				0	my $output_filename = join('/',($self->output_directory,'concatenated_alleles.'.$extension));
133	0				0	my $out = Bio::AlignIO->new(-file => "+>$output_filename" , '-format' => $format);
134	0				0	my $aln = Bio::SimpleAlign->new();
135	0				0	for(my $i = 0; $i < @{$self->_concat_names}; $i++)
	0				0
136						{
137	0	0			0	next unless(defined( $self->_concat_sequences->[$i]));
138	0				0	$aln->add_seq(Bio::LocatableSeq->new(
139						-seq => $self->_concat_sequences->[$i],
140						-id => $self->_concat_names->[$i],
141						-start => 1,
142						-end => length($self->_concat_sequences->[$i])
143						));
144						}
145	0				0	$out->write_aln($aln);
146						}
147
148						sub input_fasta_files_exist
149						{
150	14		14	0	22	my($self) = @_;
151	14				20	my $file_not_found = 0;
152	14				12	for my $fastafile (@{$self->_input_fasta_files})
	14				369
153						{
154	15	100			242	unless( -e $fastafile )
155						{
156	1				15	print qq[Input fasta file not found: $fastafile\n];
157	1				3	$file_not_found++;
158						}
159						}
160	14	100			171	return $file_not_found ? 0:1;
161						}
162
163	9		9		52	no Moose;
	9				13
	9				77
164						__PACKAGE__->meta->make_immutable;
165						1;
166
167						__END__
168
169						=pod
170
171						=encoding UTF-8
172
173						=head1 NAME
174
175						Bio::MLST::Check - Multilocus sequence type checking using blast
176
177						=head1 VERSION
178
179						version 2.1.1706216
180
181						=head1 SYNOPSIS
182
183						High throughput multilocus sequence typing (MLST) checking.
184
185						=head1 DESCRIPTION
186
187						This application is for taking Multilocus sequence typing (MLST) sources from multiple locations and consolidating them in one place so that they can be easily used (and kept up to date).
188						Then you can provide FASTA files and get out sequence types for a given MLST database.
189						Two spreadsheets are output: one contains the allele number for each locus and the ST (or nearest ST); the other contains the genomic sequence for each allele.
190						If more than 1 allele gives 100% identity for a locus, the contaminated flag is set.
191						Optionally you can output a concatenated sequence in FASTA format, which you can then use with tree building programs.
192						New, unseen alleles are saved in FASTA format, with 1 per file, for submission back to MLST databases.
193
194						It requires NCBI Blast+ to be installed and for blastn and makeblastdb to be in your PATH.
195
196						# Add this environment variable to your ~/.bashrc file - do this once
197						export MLST_DATABASES=/path/to/where_you_want_to_store_the_databases
198
199						# Download the latest copy of the databases (run it once per month)
200						download_mlst_databases
201
202						# Find the sequence types for all fasta files in your current directory
203						get_sequence_type -s "Clostridium difficile" *.fa
204
205						use Bio::MLST::Check;
206						Bio::MLST::Check->new(
207						'species' => 'E.coli',
208						'base_directory' => '/path/to/dir',
209						'raw_input_fasta_files' => ['myfasta.fa'],
210						'makeblastdb_exec' => 'makeblastdb',
211						'blastn_exec' => 'blastn',
212						'output_directory' => '/path/to/output',
213						'output_fasta_files'=> 1,
214						);
215
216						=head1 METHODS
217
218						=head2 create_result_files
219
220						Creates a spreadsheet of results, FASTA files with novel sequences and optionally a concatenated sequence (FASTA) for tree building.
221
222						=head1 AUTHOR
223
224						Andrew J. Page <ap13@sanger.ac.uk>
225
226						=head1 COPYRIGHT AND LICENSE
227
228						This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
229
230						This is free software, licensed under:
231
232						The GNU General Public License, Version 3, June 2007
233
234						=cut