File Coverage

lib/Bio/MLST/CompareAlleles.pm
Criterion Covered Total %
statement 46 138 33.3
branch 0 34 0.0
condition 0 3 0.0
subroutine 14 23 60.8
pod 2 3 66.6
total 62 201 30.8


line stmt bran cond sub pod time code
1             package Bio::MLST::CompareAlleles;
2             $Bio::MLST::CompareAlleles::VERSION = '2.1.1630910';
3             # ABSTRACT: Get a list of matching alleles between the sequence and database
4              
5              
6 12     12   300864 use Moose;
  12         843239  
  12         106  
7 12     12   79313 use File::Basename;
  12         25  
  12         1433  
8 12     12   10378 use Bio::SeqIO;
  12         735479  
  12         484  
9 12     12   7273 use Bio::Perl;
  12         2086632  
  12         1732  
10 12     12   6120 use Bio::MLST::Blast::Database;
  12         43  
  12         621  
11 12     12   6852 use Bio::MLST::Blast::BlastN;
  12         38  
  12         528  
12 12     12   117 use Bio::MLST::Types;
  12         17  
  12         476  
13 12     12   5841 use Bio::MLST::SequenceType;
  12         47  
  12         24611  
14              
15             has 'sequence_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
16             has 'allele_filenames' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
17             has 'makeblastdb_exec' => ( is => 'ro', isa => 'Str', default => 'makeblastdb' );
18             has 'blastn_exec' => ( is => 'ro', isa => 'Str', default => 'blastn' );
19              
20             has '_sequence_handle' => ( is => 'ro', isa => 'Bio::SeqIO::fasta', lazy => 1, builder => '_build__sequence_handle');
21             has '_blast_db_location_obj' => ( is => 'ro', isa => 'Bio::MLST::Blast::Database', lazy => 1, builder => '_build__blast_db_location_obj');
22             has '_blast_db_location' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__blast_db_location');
23              
24             has 'matching_sequences' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_matching_sequences' );
25             has 'non_matching_sequences' => ( is => 'rw', isa => 'HashRef', default => sub {{}});
26             has 'contamination' => ( is => 'rw', isa => 'Bool', default => 0);
27             has 'contamination_alleles' => ( is => 'rw', isa => 'Maybe[Str]' );
28             has 'contamination_sequence_names' => ( is => 'rw', isa => 'Maybe[ArrayRef]' );
29             has 'new_st' => ( is => 'rw', isa => 'Bool', default => 0);
30             has '_absent_loci' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__absent_loci' );
31             has 'profiles_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
32              
33             sub _build__blast_db_location
34             {
35 6     6   22 my ($self) = @_;
36 6         354 return $self->_blast_db_location_obj->location();
37             }
38              
39             sub _build__blast_db_location_obj
40             {
41 6     6   22 my ($self) = @_;
42 6         286 return Bio::MLST::Blast::Database->new(fasta_file => $self->sequence_filename, exec => $self->makeblastdb_exec);
43             }
44              
45              
46             sub _build__sequence_handle
47             {
48 0     0   0 my ($self) = @_;
49 0         0 return Bio::SeqIO->new( -file => $self->sequence_filename , -format => 'Fasta');
50             }
51              
52             sub sequence_filename_root
53             {
54 0     0 0 0 my ($self) = @_;
55 0         0 $self->_get_base_filename($self->sequence_filename);
56             }
57              
58             sub found_sequence_names
59             {
60 2     2 1 13 my ($self) = @_;
61 2         5 my @sequence_names = sort(keys %{$self->matching_sequences});
  2         90  
62 0         0 return \@sequence_names;
63             }
64              
65             sub found_non_matching_sequence_names
66             {
67 0     0 1 0 my ($self) = @_;
68 0         0 my @sequence_names = sort(keys %{$self->non_matching_sequences});
  0         0  
69 0         0 return \@sequence_names;
70             }
71              
72              
73             sub _word_sizes_for_given_allele_file
74             {
75 6     6   29 my ($self,$filename) = @_;
76 6         24 my %seq_lens;
77 6         171 my $seqio = Bio::SeqIO->new( -file => $filename , -format => 'Fasta');
78 6         19351 while( my $seq = $seqio->next_seq() ){
79 37         9739 $seq_lens{$seq->primary_id} = $seq->length;
80             }
81 6         558 return \%seq_lens;
82             }
83              
84             sub _get_word_size_from_blast_hit {
85 0     0   0 my ( $self, $word_sizes, $blast_hit, $allele_filename ) = @_;
86              
87             # return len of top blast hit allele, otherwise return len of first seq in allele file
88 0         0 my ($word_size, $first_seq);
89 0 0       0 if( defined $blast_hit->{allele_name} ){
90 0         0 $word_size = $word_sizes->{$blast_hit->{allele_name}};
91             }
92             else{
93 0         0 my $seqio = Bio::SeqIO->new( -file => $allele_filename, -format => 'Fasta' );
94 0         0 $word_size = $seqio->next_seq->length;
95             }
96              
97 0         0 return $word_size;
98             }
99              
100             sub _build_matching_sequences
101             {
102 6     6   29 my ($self) = @_;
103 6         37 my %matching_sequence_names;
104             my %non_matching_sequence_names;
105 0         0 my %missing_locus_names;
106            
107 6         27 for my $allele_filename (@{$self->allele_filenames})
  6         335  
108             {
109 6         106 my $word_sizes = $self->_word_sizes_for_given_allele_file($allele_filename);
110             # TODO: You'll never get matches or contamination noted if there is a SNP
111             # near the end of the allele. This is because we filter all matches which
112             # are shorter than the length of the allele in the profiles. This could
113             # mean that we're missing contamination of falsly noting matches against
114             # truncated alleles.
115 6         1215 my $blast_results = Bio::MLST::Blast::BlastN->new(
116             blast_database => $self->_blast_db_location,
117             query_file => $allele_filename,
118             word_sizes => $word_sizes,
119             exec => $self->blastn_exec
120             );
121 0           my %top_blast_hit = %{$blast_results->top_hit()};
  0            
122            
123             # possible missing locus
124 0 0         if(! %top_blast_hit)
125             {
126 0           my %absent_loci_type = %{$self->_absent_loci};
  0            
127 0           my $allele = $self->_get_base_filename($allele_filename);
128 0 0         $missing_locus_names{$allele} = $absent_loci_type{$allele} if exists $absent_loci_type{$allele};
129             }
130              
131 0           my $word_size = $self->_get_word_size_from_blast_hit($word_sizes, \%top_blast_hit, $allele_filename);
132              
133             # unknown allele
134 0 0         if(! %top_blast_hit)
135             {
136 0           $non_matching_sequence_names{$self->_get_base_filename($allele_filename)} = $self->_pad_out_sequence("", $word_size);
137 0           next;
138             }
139            
140             # more than 1 allele has a good match
141 0 0         if(defined($top_blast_hit{contamination}))
142             {
143 0           $self->contamination(1);
144 0           my $contaminants = $top_blast_hit{contamination};
145 0           my @contaminant_names = map { $_->{allele_name} } @$contaminants;
  0            
146             # Add tilde to matches which are not 100%
147 0 0         my @contaminant_names_with_tilde = map { $_->{percentage_identity} == 100 ? $_->{allele_name} : "$_->{allele_name}~" } @$contaminants;
  0            
148 0           my $contamination_alleles = join( ',', sort @contaminant_names_with_tilde );
149 0           $self->contamination_alleles( $contamination_alleles );
150 0           $self->_translate_contamination_names_into_sequence_types(\@contaminant_names, $top_blast_hit{allele_name});
151             }
152            
153 0           $top_blast_hit{allele_name} =~ s![-_]+!-!g;
154            
155 0 0         if($top_blast_hit{percentage_identity} == 100 )
156             {
157 0           $matching_sequence_names{$top_blast_hit{allele_name}} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
158             }
159             else
160             {
161             # If the top hit isn't 100%, add a tilde to the allele_name
162 0           my $name_with_tilde = "$top_blast_hit{allele_name}~";
163 0           $non_matching_sequence_names{$name_with_tilde} = $self->_get_blast_hit_sequence($top_blast_hit{source_name}, $top_blast_hit{source_start},$top_blast_hit{source_end},$word_size,$top_blast_hit{reverse});
164 0           $self->new_st(1);
165             }
166             }
167              
168             # deal with missing loci
169 0 0 0       if(%matching_sequence_names && %missing_locus_names)
170             {
171 0           for my $allele (keys %missing_locus_names)
172             {
173 0           delete $non_matching_sequence_names{$allele};
174 0           $matching_sequence_names{$allele.'-'.$missing_locus_names{$allele}} = '';
175             }
176             }
177              
178             # set new ST flag
179 0 0         $self->new_st(1) if %non_matching_sequence_names;
180              
181 0           $self->non_matching_sequences(\%non_matching_sequence_names);
182 0           return \%matching_sequence_names;
183             }
184              
185             sub _translate_contamination_names_into_sequence_types
186             {
187 0     0     my ($self, $contamination_names, $main_allele_name) = @_;
188 0           my @contamination_sequence_types;
189            
190 0           for my $allele_number (@{ $contamination_names})
  0            
191             {
192 0 0         next if($main_allele_name eq $allele_number);
193 0           my $st = Bio::MLST::SequenceType->new(
194             profiles_filename => $self->profiles_filename,
195             matching_names => [$allele_number],
196             non_matching_names => []
197             );
198            
199 0 0         if(defined($st->sequence_type()) )
200             {
201 0           push(@contamination_sequence_types, $st->sequence_type());
202             }
203             }
204            
205 0           $self->contamination_sequence_names(\@contamination_sequence_types);
206             }
207              
208              
209             sub _get_blast_hit_sequence
210             {
211 0     0     my ($self, $contig_name, $start, $end, $word_size, $reverse_complement) = @_;
212 0           seek($self->_sequence_handle->_fh, 0,0);
213 0           while( my $input_sequence_obj = $self->_sequence_handle->next_seq() )
214             {
215 0 0         next if( $input_sequence_obj->id ne $contig_name);
216 0           my $sequence = $input_sequence_obj->subseq($start, $end);
217 0 0         if($reverse_complement)
218             {
219 0           my $reverse_sequence = revcom( $sequence );
220 0           $sequence = $reverse_sequence->{seq};
221             }
222            
223 0           $sequence = $self->_pad_out_sequence($sequence, $word_size);
224 0           return $sequence;
225             }
226            
227 0           return $self->_pad_out_sequence("", $word_size);
228             }
229              
230             sub _get_base_filename
231             {
232 0     0     my($self, $filename) = @_;
233 0           my $filename_root = fileparse($filename, qr/\.[^.]*$/);
234 0           return $filename_root;
235             }
236              
237             sub _pad_out_sequence
238             {
239 0     0     my($self, $input_sequence, $length_of_main_sequence) = @_;
240 0 0         return $input_sequence if(length($input_sequence) == $length_of_main_sequence);
241 0 0         if(length($input_sequence) > $length_of_main_sequence)
242             {
243 0           $input_sequence = substr($input_sequence,0,$length_of_main_sequence);
244             }
245 0 0         $input_sequence = "" if($input_sequence eq 'U');
246            
247 0           for(my $i=length($input_sequence); $i < $length_of_main_sequence; $i++)
248             {
249 0           $input_sequence .= "N";
250             }
251 0           return $input_sequence;
252             }
253              
254             sub _build__absent_loci
255             {
256 0     0     my( $self ) = @_;
257 0           my %absent_loci = ();
258            
259 0           for my $allele_file (@{$self->allele_filenames})
  0            
260             {
261 0           my $seq_io = Bio::SeqIO->new( -file => $allele_file , -format => 'Fasta');
262 0           while( my $seq = $seq_io->next_seq() )
263             {
264 0 0         if($seq->length == 0)
265             {
266 0           my($allele,$type) = split(/[-_]+/,$seq->id(),2);
267 0           $absent_loci{$allele} = $type;
268             }
269             }
270             }
271              
272 0           return \%absent_loci;
273             }
274              
275 12     12   141 no Moose;
  12         36  
  12         123  
276             __PACKAGE__->meta->make_immutable;
277             1;
278              
279             __END__
280              
281             =pod
282              
283             =encoding UTF-8
284              
285             =head1 NAME
286              
287             Bio::MLST::CompareAlleles - Get a list of matching alleles between the sequence and database
288              
289             =head1 VERSION
290              
291             version 2.1.1630910
292              
293             =head1 SYNOPSIS
294              
295             Take in an assembly file in Fasta format, and a list of allele files (in multifasta format) and return a list of the alleles and IDs.
296              
297             use Bio::MLST::CompareAlleles;
298            
299             my $compare_alleles = Bio::MLST::CompareAlleles->new(
300            
301             sequence_filename => 'contigs.fa',
302             allele_filenames => ['abc.tfa','efg.tfa']
303             );
304             $compare_alleles->found_sequence_names;
305             $compare_alleles->found_non_matching_sequence_names
306             $compare_alleles->matching_sequences;
307             $compare_alleles->non_matching_sequences
308              
309             =head1 METHODS
310              
311             =head2 found_sequence_names
312              
313             Return a list of the sequence names which match.
314              
315             =head2 found_non_matching_sequence_names
316              
317             Return a list of the sequence names which dont match.
318              
319             =head2 matching_sequences
320              
321             Return a Hash containing the sequnces that match.
322              
323             =head2 non_matching_sequences
324              
325             Return a Hash containing the sequnces that dont match.
326              
327             =head2 contamination
328              
329             Flag which is set if more than one 100% match is found for a single locus.
330              
331             =head2 new_st
332              
333             Flag which is set if the results contain a novel combination of sequences or a new sequence.
334              
335             =head1 SEE ALSO
336              
337             =over 4
338              
339             =item *
340              
341             L<Bio::MLST::Check>
342              
343             =back
344              
345             =head1 AUTHOR
346              
347             Andrew J. Page <ap13@sanger.ac.uk>
348              
349             =head1 COPYRIGHT AND LICENSE
350              
351             This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
352              
353             This is free software, licensed under:
354              
355             The GNU General Public License, Version 3, June 2007
356              
357             =cut