File Coverage

lib/Bio/MLST/Check.pm
Criterion Covered Total %
statement 60 88 68.1
branch 8 16 50.0
condition n/a
subroutine 15 16 93.7
pod 1 2 50.0
total 84 122 68.8


line stmt bran cond sub pod time code
1             package Bio::MLST::Check;
2             # ABSTRACT: Multilocus sequence type checking using blast
3             $Bio::MLST::Check::VERSION = '2.1.1630910';
4              
5 10     10   692461 use Moose;
  10         2216700  
  10         85  
6 10     10   80637 use Parallel::ForkManager;
  10         179032  
  10         362  
7 10     10   4648 use Bio::MLST::ProcessFasta;
  10         38  
  10         441  
8 10     10   5236 use Bio::MLST::Spreadsheet::File;
  10         40  
  10         508  
9 10     10   5174 use Bio::MLST::NormaliseFasta;
  10         37  
  10         525  
10 10     10   7628 use Bio::AlignIO;
  10         436938  
  10         383  
11 10     10   115 use Bio::SimpleAlign;
  10         20  
  10         213  
12 10     10   62 use File::Temp;
  10         18  
  10         1059  
13 10     10   59 use Cwd;
  10         17  
  10         10839  
14              
15             has 'species' => ( is => 'ro', isa => 'Str', required => 1 );
16             has 'base_directory' => ( is => 'ro', isa => 'Str', required => 1 );
17             has 'raw_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
18             has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', required => 1 );
19             has 'blastn_exec' => ( is => 'ro', isa => 'Str', required => 1 );
20             has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
21             has 'output_fasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
22             has 'spreadsheet_basename' => ( is => 'ro', isa => 'Str', default => 'mlst_results' );
23             has 'output_phylip_files' => ( is => 'ro', isa => 'Bool', default => 0 );
24             has 'show_contamination_instead_of_alt_matches' => ( is => 'ro', isa => 'Bool', default => 1 );
25             has 'report_lowest_st' => ( is => 'ro', isa => 'Bool', default => 0 );
26              
27             has 'parallel_processes' => ( is => 'ro', isa => 'Int', default => 1 );
28              
29             has '_spreadsheet_header' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
30             has '_spreadsheet_allele_numbers_rows' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
31             has '_spreadsheet_genomic_rows' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
32             has '_input_fasta_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__input_fasta_files');
33              
34             has '_concat_names' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
35             has '_concat_sequences' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
36             has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir(DIR => getcwd, CLEANUP => 1); });
37              
38             sub _generate_spreadsheet_rows
39             {
40 8     8   22 my($self) = @_;
41              
42 8         314 my $pm = new Parallel::ForkManager($self->parallel_processes);
43             $pm -> run_on_finish (
44             sub {
45 4     4   18028896 my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $data_structure_reference) = @_;
46             # retrieve data structure from child
47 4 50       100 if (defined($data_structure_reference)) { # children are not forced to send anything
48 0         0 my ($header_row, $allele_numbers_row, $genomic_row, $concat_name, $concat_sequence) = @{$data_structure_reference};
  0         0  
49 0         0 push(@{$self->_spreadsheet_header}, $header_row);
  0         0  
50 0         0 push(@{$self->_spreadsheet_allele_numbers_rows}, $allele_numbers_row);
  0         0  
51 0         0 push(@{$self->_spreadsheet_genomic_rows}, $genomic_row);
  0         0  
52            
53 0         0 push(@{$self->_concat_names}, $concat_name);
  0         0  
54 0         0 push(@{$self->_concat_sequences}, $concat_sequence);
  0         0  
55              
56             } else { # problems occurring during storage or retrieval will throw a warning
57 4         1585 print qq|No message received from child process $pid!\n|;
58             }
59             }
60 8         6644 );
61            
62 8         244 for my $fastafile (@{$self->_input_fasta_files})
  8         386  
63             {
64 8 100       60 $pm->start and next; # do the fork
65            
66 4         15823 my $output_fasta_obj = Bio::MLST::NormaliseFasta->new(
67             fasta_filename => $fastafile,
68             working_directory => $self->_working_directory->dirname()
69             );
70            
71 4         291 my $fasta_sequence_type_results = Bio::MLST::ProcessFasta->new(
72             species => $self->species,
73             base_directory => $self->base_directory,
74             fasta_file => $output_fasta_obj->processed_fasta_filename(),
75             makeblastdb_exec => $self->makeblastdb_exec,
76             blastn_exec => $self->blastn_exec,
77             output_directory => $self->output_directory,
78             output_fasta_files => $self->output_fasta_files,
79             show_contamination_instead_of_alt_matches => $self->show_contamination_instead_of_alt_matches,
80             report_lowest_st => $self->report_lowest_st
81             );
82 4         18 my @result_rows;
83 4         293 push(@result_rows, ($fasta_sequence_type_results->_spreadsheet_row_obj->header_row,
84             $fasta_sequence_type_results->_spreadsheet_row_obj->allele_numbers_row,
85             $fasta_sequence_type_results->_spreadsheet_row_obj->genomic_row,
86             $fasta_sequence_type_results->concat_name,
87             $fasta_sequence_type_results->concat_sequence));
88            
89 0         0 $pm->finish(0,\@result_rows); # do the exit in the child process
90             }
91 4         14756 $pm->wait_all_children;
92 4         181 1;
93             }
94              
95             sub _build__input_fasta_files
96             {
97 14     14   35 my($self) = @_;
98 14         698 return $self->raw_input_fasta_files;
99             }
100              
101             sub create_result_files
102             {
103 8     8 1 30 my($self) = @_;
104 8 50       112 exit 1 unless $self->input_fasta_files_exist;
105 8         48 $self->_generate_spreadsheet_rows;
106              
107             my $spreadsheet = Bio::MLST::Spreadsheet::File->new(
108 4         3522 header => pop(@{$self->_spreadsheet_header}),
  4         400  
109             spreadsheet_allele_numbers_rows => $self->_spreadsheet_allele_numbers_rows,
110             spreadsheet_genomic_rows => $self->_spreadsheet_genomic_rows,
111             output_directory => $self->output_directory,
112             spreadsheet_basename => $self->spreadsheet_basename
113             );
114 0         0 $spreadsheet->create();
115            
116 0 0       0 if($self->output_fasta_files)
117             {
118 0         0 $self->_create_alignment('Fasta','fa');
119             }
120            
121 0 0       0 if($self->output_phylip_files)
122             {
123 0         0 $self->_create_alignment('phylip','phylip');
124             }
125 0         0 1;
126             }
127              
128             sub _create_alignment
129             {
130 0     0   0 my($self, $format, $extension) = @_;
131            
132 0         0 my $output_filename = join('/',($self->output_directory,'concatenated_alleles.'.$extension));
133 0         0 my $out = Bio::AlignIO->new(-file => "+>$output_filename" , '-format' => $format);
134 0         0 my $aln = Bio::SimpleAlign->new();
135 0         0 for(my $i = 0; $i < @{$self->_concat_names}; $i++)
  0         0  
136             {
137 0 0       0 next unless(defined( $self->_concat_sequences->[$i]));
138 0         0 $aln->add_seq(Bio::LocatableSeq->new(
139             -seq => $self->_concat_sequences->[$i],
140             -id => $self->_concat_names->[$i],
141             -start => 1,
142             -end => length($self->_concat_sequences->[$i])
143             ));
144             }
145 0         0 $out->write_aln($aln);
146             }
147              
148             sub input_fasta_files_exist
149             {
150 14     14 0 32 my($self) = @_;
151 14         36 my $file_not_found = 0;
152 14         23 for my $fastafile (@{$self->_input_fasta_files})
  14         760  
153             {
154 15 100       349 unless( -e $fastafile )
155             {
156 1         17 print qq[Input fasta file not found: $fastafile\n];
157 1         4 $file_not_found++;
158             }
159             }
160 14 100       337 return $file_not_found ? 0:1;
161             }
162              
163 10     10   78 no Moose;
  10         19  
  10         116  
164             __PACKAGE__->meta->make_immutable;
165             1;
166              
167             __END__
168              
169             =pod
170              
171             =encoding UTF-8
172              
173             =head1 NAME
174              
175             Bio::MLST::Check - Multilocus sequence type checking using blast
176              
177             =head1 VERSION
178              
179             version 2.1.1630910
180              
181             =head1 SYNOPSIS
182              
183             High throughput multilocus sequence typing (MLST) checking.
184              
185             =head1 DESCRIPTION
186              
187             This application is for taking Multilocus sequence typing (MLST) sources from multiple locations and consolidating them in one place so that they can be easily used (and kept up to date).
188             Then you can provide FASTA files and get out sequence types for a given MLST database.
189             Two spreadsheets are output: one contains the allele number for each locus and the ST (or nearest ST); the other contains the genomic sequence for each allele.
190             If more than 1 allele gives 100% identity for a locus, the contaminated flag is set.
191             Optionally you can output a concatenated sequence in FASTA format, which you can then use with tree building programs.
192             New, unseen alleles are saved in FASTA format, with 1 per file, for submission back to MLST databases.
193              
194             It requires NCBI Blast+ to be installed and for blastn and makeblastdb to be in your PATH.
195              
196             # Add this environment variable to your ~/.bashrc file - do this once
197             export MLST_DATABASES=/path/to/where_you_want_to_store_the_databases
198            
199             # Download the latest copy of the databases (run it once per month)
200             download_mlst_databases
201            
202             # Find the sequence types for all fasta files in your current directory
203             get_sequence_type -s "Clostridium difficile" *.fa
204              
205             use Bio::MLST::Check;
206             Bio::MLST::Check->new(
207             'species' => 'E.coli',
208             'base_directory' => '/path/to/dir',
209             'raw_input_fasta_files' => ['myfasta.fa'],
210             'makeblastdb_exec' => 'makeblastdb',
211             'blastn_exec' => 'blastn',
212             'output_directory' => '/path/to/output',
213             'output_fasta_files'=> 1,
214             );
215              
216             =head1 METHODS
217              
218             =head2 create_result_files
219              
220             Creates a spreadsheet of results, FASTA files with novel sequences and optionally a concatenated sequence (FASTA) for tree building.
221              
222             =head1 AUTHOR
223              
224             Andrew J. Page <ap13@sanger.ac.uk>
225              
226             =head1 COPYRIGHT AND LICENSE
227              
228             This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
229              
230             This is free software, licensed under:
231              
232             The GNU General Public License, Version 3, June 2007
233              
234             =cut