File Coverage

lib/Bio/MLST/Check.pm
Criterion Covered Total %
statement 60 88 68.1
branch 8 16 50.0
condition n/a
subroutine 15 16 93.7
pod 1 2 50.0
total 84 122 68.8


line stmt bran cond sub pod time code
1             package Bio::MLST::Check;
2             # ABSTRACT: Multilocus sequence type checking using blast
3             $Bio::MLST::Check::VERSION = '2.1.1706216';
4              
5 9     9   385923 use Moose;
  9         1178201  
  9         45  
6 9     9   43491 use Parallel::ForkManager;
  9         108962  
  9         207  
7 9     9   2783 use Bio::MLST::ProcessFasta;
  9         20  
  9         343  
8 9     9   4449 use Bio::MLST::Spreadsheet::File;
  9         22  
  9         311  
9 9     9   4092 use Bio::MLST::NormaliseFasta;
  9         21  
  9         303  
10 9     9   5713 use Bio::AlignIO;
  9         251833  
  9         259  
11 9     9   82 use Bio::SimpleAlign;
  9         11  
  9         137  
12 9     9   29 use File::Temp;
  9         18  
  9         733  
13 9     9   36 use Cwd;
  9         16  
  9         6717  
14              
15             has 'species' => ( is => 'ro', isa => 'Str', required => 1 );
16             has 'base_directory' => ( is => 'ro', isa => 'Str', required => 1 );
17             has 'raw_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
18             has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', required => 1 );
19             has 'blastn_exec' => ( is => 'ro', isa => 'Str', required => 1 );
20             has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
21             has 'output_fasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
22             has 'spreadsheet_basename' => ( is => 'ro', isa => 'Str', default => 'mlst_results' );
23             has 'output_phylip_files' => ( is => 'ro', isa => 'Bool', default => 0 );
24             has 'show_contamination_instead_of_alt_matches' => ( is => 'ro', isa => 'Bool', default => 1 );
25             has 'report_lowest_st' => ( is => 'ro', isa => 'Bool', default => 0 );
26              
27             has 'parallel_processes' => ( is => 'ro', isa => 'Int', default => 1 );
28              
29             has '_spreadsheet_header' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
30             has '_spreadsheet_allele_numbers_rows' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
31             has '_spreadsheet_genomic_rows' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
32             has '_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__input_fasta_files');
33              
34             has '_concat_names' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
35             has '_concat_sequences' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
36             has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir(DIR => getcwd, CLEANUP => 1); });
37              
38             sub _generate_spreadsheet_rows
39             {
40 8     8   18 my($self) = @_;
41              
42 8         270 my $pm = new Parallel::ForkManager($self->parallel_processes);
43             $pm -> run_on_finish (
44             sub {
45 4     4   12016142 my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $data_structure_reference) = @_;
46             # retrieve data structure from child
47 4 50       90 if (defined($data_structure_reference)) { # children are not forced to send anything
48 0         0 my ($header_row, $allele_numbers_row, $genomic_row, $concat_name, $concat_sequence) = @{$data_structure_reference};
  0         0  
49 0         0 push(@{$self->_spreadsheet_header}, $header_row);
  0         0  
50 0         0 push(@{$self->_spreadsheet_allele_numbers_rows}, $allele_numbers_row);
  0         0  
51 0         0 push(@{$self->_spreadsheet_genomic_rows}, $genomic_row);
  0         0  
52            
53 0         0 push(@{$self->_concat_names}, $concat_name);
  0         0  
54 0         0 push(@{$self->_concat_sequences}, $concat_sequence);
  0         0  
55              
56             } else { # problems occurring during storage or retrieval will throw a warning
57 4         1031 print qq|No message received from child process $pid!\n|;
58             }
59             }
60 8         2526 );
61            
62 8         106 for my $fastafile (@{$self->_input_fasta_files})
  8         196  
63             {
64 8 100       40 $pm->start and next; # do the fork
65            
66 4         8914 my $output_fasta_obj = Bio::MLST::NormaliseFasta->new(
67             fasta_filename => $fastafile,
68             working_directory => $self->_working_directory->dirname()
69             );
70            
71 4         150 my $fasta_sequence_type_results = Bio::MLST::ProcessFasta->new(
72             species => $self->species,
73             base_directory => $self->base_directory,
74             fasta_file => $output_fasta_obj->processed_fasta_filename(),
75             makeblastdb_exec => $self->makeblastdb_exec,
76             blastn_exec => $self->blastn_exec,
77             output_directory => $self->output_directory,
78             output_fasta_files => $self->output_fasta_files,
79             show_contamination_instead_of_alt_matches => $self->show_contamination_instead_of_alt_matches,
80             report_lowest_st => $self->report_lowest_st
81             );
82 4         11 my @result_rows;
83 4         184 push(@result_rows, ($fasta_sequence_type_results->_spreadsheet_row_obj->header_row,
84             $fasta_sequence_type_results->_spreadsheet_row_obj->allele_numbers_row,
85             $fasta_sequence_type_results->_spreadsheet_row_obj->genomic_row,
86             $fasta_sequence_type_results->concat_name,
87             $fasta_sequence_type_results->concat_sequence));
88            
89 0         0 $pm->finish(0,\@result_rows); # do the exit in the child process
90             }
91 4         6397 $pm->wait_all_children;
92 4         96 1;
93             }
94              
95             sub _build__input_fasta_files
96             {
97 14     14   18 my($self) = @_;
98 14         292 return $self->raw_input_fasta_files;
99             }
100              
101             sub create_result_files
102             {
103 8     8 1 20 my($self) = @_;
104 8 50       38 exit 1 unless $self->input_fasta_files_exist;
105 8         34 $self->_generate_spreadsheet_rows;
106              
107             my $spreadsheet = Bio::MLST::Spreadsheet::File->new(
108 4         2863 header => pop(@{$self->_spreadsheet_header}),
  4         264  
109             spreadsheet_allele_numbers_rows => $self->_spreadsheet_allele_numbers_rows,
110             spreadsheet_genomic_rows => $self->_spreadsheet_genomic_rows,
111             output_directory => $self->output_directory,
112             spreadsheet_basename => $self->spreadsheet_basename
113             );
114 0         0 $spreadsheet->create();
115            
116 0 0       0 if($self->output_fasta_files)
117             {
118 0         0 $self->_create_alignment('Fasta','fa');
119             }
120            
121 0 0       0 if($self->output_phylip_files)
122             {
123 0         0 $self->_create_alignment('phylip','phylip');
124             }
125 0         0 1;
126             }
127              
128             sub _create_alignment
129             {
130 0     0   0 my($self, $format, $extension) = @_;
131            
132 0         0 my $output_filename = join('/',($self->output_directory,'concatenated_alleles.'.$extension));
133 0         0 my $out = Bio::AlignIO->new(-file => "+>$output_filename" , '-format' => $format);
134 0         0 my $aln = Bio::SimpleAlign->new();
135 0         0 for(my $i = 0; $i < @{$self->_concat_names}; $i++)
  0         0  
136             {
137 0 0       0 next unless(defined( $self->_concat_sequences->[$i]));
138 0         0 $aln->add_seq(Bio::LocatableSeq->new(
139             -seq => $self->_concat_sequences->[$i],
140             -id => $self->_concat_names->[$i],
141             -start => 1,
142             -end => length($self->_concat_sequences->[$i])
143             ));
144             }
145 0         0 $out->write_aln($aln);
146             }
147              
148             sub input_fasta_files_exist
149             {
150 14     14 0 22 my($self) = @_;
151 14         20 my $file_not_found = 0;
152 14         12 for my $fastafile (@{$self->_input_fasta_files})
  14         369  
153             {
154 15 100       242 unless( -e $fastafile )
155             {
156 1         15 print qq[Input fasta file not found: $fastafile\n];
157 1         3 $file_not_found++;
158             }
159             }
160 14 100       171 return $file_not_found ? 0:1;
161             }
162              
163 9     9   52 no Moose;
  9         13  
  9         77  
164             __PACKAGE__->meta->make_immutable;
165             1;
166              
167             __END__
168              
169             =pod
170              
171             =encoding UTF-8
172              
173             =head1 NAME
174              
175             Bio::MLST::Check - Multilocus sequence type checking using blast
176              
177             =head1 VERSION
178              
179             version 2.1.1706216
180              
181             =head1 SYNOPSIS
182              
183             High throughput multilocus sequence typing (MLST) checking.
184              
185             =head1 DESCRIPTION
186              
187             This application is for taking Multilocus sequence typing (MLST) sources from multiple locations and consolidating them in one place so that they can be easily used (and kept up to date).
188             Then you can provide FASTA files and get out sequence types for a given MLST database.
189             Two spreadsheets are output: one contains the allele number for each locus and the ST (or nearest ST); the other contains the genomic sequence for each allele.
190             If more than 1 allele gives 100% identity for a locus, the contaminated flag is set.
191             Optionally you can output a concatenated sequence in FASTA format, which you can then use with tree building programs.
192             New, unseen alleles are saved in FASTA format, with 1 per file, for submission back to MLST databases.
193              
194             It requires NCBI Blast+ to be installed and for blastn and makeblastdb to be in your PATH.
195              
196             # Add this environment variable to your ~/.bashrc file - do this once
197             export MLST_DATABASES=/path/to/where_you_want_to_store_the_databases
198            
199             # Download the latest copy of the databases (run it once per month)
200             download_mlst_databases
201            
202             # Find the sequence types for all fasta files in your current directory
203             get_sequence_type -s "Clostridium difficile" *.fa
204              
205             use Bio::MLST::Check;
206             Bio::MLST::Check->new(
207             'species' => 'E.coli',
208             'base_directory' => '/path/to/dir',
209             'raw_input_fasta_files' => ['myfasta.fa'],
210             'makeblastdb_exec' => 'makeblastdb',
211             'blastn_exec' => 'blastn',
212             'output_directory' => '/path/to/output',
213             'output_fasta_files'=> 1,
214             );
215              
216             =head1 METHODS
217              
218             =head2 create_result_files
219              
220             Creates a spreadsheet of results, FASTA files with novel sequences and optionally a concatenated sequence (FASTA) for tree building.
221              
222             =head1 AUTHOR
223              
224             Andrew J. Page <ap13@sanger.ac.uk>
225              
226             =head1 COPYRIGHT AND LICENSE
227              
228             This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
229              
230             This is free software, licensed under:
231              
232             The GNU General Public License, Version 3, June 2007
233              
234             =cut