File Coverage

lib/Bio/MLST/SequenceType.pm
Criterion Covered Total %
statement 92 96 95.8
branch 28 34 82.3
condition 2 2 100.0
subroutine 13 14 92.8
pod 0 1 0.0
total 135 147 91.8


line stmt bran cond sub pod time code
1             package Bio::MLST::SequenceType;
2             # ABSTRACT: Take in a list of matched alleles and look up the sequence type from the profile.
3             $Bio::MLST::SequenceType::VERSION = '2.1.1630910';
4              
5 13     13   177473 use Data::Dumper;
  13         21  
  13         1140  
6 13     13   7524 use Text::CSV;
  13         121506  
  13         91  
7 13     13   627 use List::Util qw(min reduce);
  13         23  
  13         1277  
8              
9 13     13   749 use Moose;
  13         438095  
  13         123  
10 13     13   81866 use Bio::MLST::Types;
  13         35  
  13         490  
11 13     13   6477 use Bio::MLST::FilterAlleles qw(is_metadata);
  13         31  
  13         14855  
12              
13             has 'profiles_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
14             has 'matching_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
15             has 'non_matching_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
16              
17             has 'allele_to_number' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_allele_to_number' );
18             has '_profiles' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__profiles' );
19             has 'sequence_type' => ( is => 'ro', isa => 'Maybe[Str]', lazy => 1, builder => '_build_sequence_type' );
20              
21             has 'nearest_sequence_type' => ( is => 'rw', isa => 'Maybe[Str]');
22             has 'report_lowest_st' => ( is => 'ro', isa => 'Bool', default => 0 );
23              
24             sub sequence_type_or_nearest
25             {
26 0     0 0 0 my($self) = @_;
27 0 0       0 return $self->sequence_type if(defined($self->sequence_type));
28             # If there isn't a perfect match, add a tilde to the sequence type
29 0 0       0 return $self->nearest_sequence_type."~" if(defined($self->nearest_sequence_type));
30 0         0 return $self->nearest_sequence_type;
31             }
32              
33             sub _build__profiles
34             {
35 8     8   10 my($self) = @_;
36 8 50       198 open(my $fh, $self->profiles_filename) or die "Couldnt open profile: ".$self->profiles_filename."\n";
37 8         61 my $csv_in = Text::CSV->new({sep_char=>"\t"});
38 8         581 my $profile = $csv_in->getline_all($fh);
39              
40 8         16113 return $profile;
41             }
42              
43             sub _build_allele_to_number
44             {
45 8     8   10 my($self) = @_;
46 8         8 my %allele_to_number;
47              
48 8         9 for my $sequence_name (@{$self->non_matching_names})
  8         169  
49             {
50 6         17 my @sequence_name_details = split(/[-_]/,$sequence_name);
51 6         9 my $num = pop @sequence_name_details;
52 6         10 my $name = join( "-", @sequence_name_details );
53 6         12 $allele_to_number{$name} = $num;
54             }
55              
56 8         9 for my $sequence_name (@{$self->matching_names})
  8         161  
57             {
58 19         45 my @sequence_name_details = split(/[-_]/,$sequence_name);
59 19         18 my $num = pop @sequence_name_details;
60 19         23 my $name = join( "-", @sequence_name_details );
61 19         40 $allele_to_number{$name} = $num;
62             }
63              
64             #print "ALLELE TO NUMBER: ";
65             #print Dumper \%allele_to_number;
66              
67 8         165 return \%allele_to_number;
68             }
69              
70             sub _allele_numbers_similar
71             {
72 123     123   137 my($self, $number_a, $number_b) = @_;
73 123 100       315 if ($number_a eq $number_b) {
    100          
    100          
74 2         8 return 1;
75             } elsif ("$number_a~" eq $number_b) {
76 1         4 return 1;
77             } elsif ("$number_b~" eq $number_a) {
78 18         32 return 1;
79             } else {
80 102         312 return 0;
81             }
82             }
83              
84             sub _build_sequence_type
85             {
86 8     8   9 my($self) = @_;
87              
88 8         8 my @header_row = @{$self->_profiles->[0]};
  8         175  
89              
90 8         23 for(my $i=0; $i< @header_row; $i++)
91             {
92 50 100       104 next if(is_metadata($header_row[$i]) == 1);
93 28         36 $header_row[$i] =~ s!_!!g;
94 28         55 $header_row[$i] =~ s!-!!g;
95             }
96              
97 8         18 my $num_loci = 0;
98 8         9 my %sequence_type_match_freq;
99             my %sequence_type_part_match_freq;
100              
101 8         11 for(my $row = 1; $row < @{$self->_profiles}; $row++)
  62         1188  
102             {
103 54         38 my @current_row = @{$self->_profiles->[$row]};
  54         1015  
104 54         110 for(my $col = 0; $col< @current_row; $col++)
105             {
106 299 50       425 next unless(defined($header_row[$col]));
107 299 100       436 next if(is_metadata($header_row[$col]) == 1);
108 210 100       306 $num_loci++ if($row == 1);
109              
110 210         4290 my $allele_number = $self->allele_to_number->{$header_row[$col]};
111 210 100       331 next if(!defined($allele_number) );
112 192 100       373 if ($allele_number eq $current_row[$col]) {
    100          
113 75         217 $sequence_type_match_freq{$current_row[0]}++;
114             } elsif ($self->_allele_numbers_similar($allele_number, $current_row[$col])) {
115 17         47 $sequence_type_part_match_freq{$current_row[0]}++;
116             }
117             }
118             }
119              
120 8         24 return $self->_get_sequence_type_or_set_nearest_match(\%sequence_type_match_freq,
121             \%sequence_type_part_match_freq,
122             $num_loci);
123             }
124              
125             sub _get_sequence_type_or_set_nearest_match
126             {
127 8     8   12 my($self,$st_match_f, $st_part_match_f, $num_loci) = @_;
128 8         9 my %st_match_freq = %{$st_match_f};
  8         35  
129              
130             # Combine the frequencies of the perfect matches (%st_match_freq) and the
131             # partial matches ($st_part_match_f)
132 8         9 my %st_nearest_match_freq = %{$st_match_f};
  8         20  
133 8         10 while (my($sequence_type, $freq) = each(%{$st_part_match_f})) {
  20         59  
134 12   100     31 my $nearest_match_frequency = ( $st_nearest_match_freq{$sequence_type} || 0 );
135 12         15 $st_nearest_match_freq{$sequence_type} = $nearest_match_frequency + $freq;
136             }
137              
138             # if $num_loci is in $st_match_freq vals, return that, otherwise return lowest numbered sequence type
139 8         23 while (my($sequence_type, $freq) = each(%st_match_freq)) {
140 27 100       68 if ($freq == $num_loci) {
141 2         49 return $sequence_type;
142             }
143             }
144 6         5 my $best_sequence_type;
145 6 100       141 if ( $self->report_lowest_st ){
146              
147 2         19 $best_sequence_type = min (keys %st_nearest_match_freq);
148             }
149             else {
150             # This reduce takes pairs of sequence types and compares them. It looks
151             # for the ST with the highest number of matching alleles; if two matches
152             # are just as good, it picks the ST with the smaller number.
153             $best_sequence_type = reduce {
154 12 100   12   26 if ( $st_nearest_match_freq{$a} > $st_nearest_match_freq{$b} ) {
    100          
155 6         6 $a;
156             } elsif ( $st_nearest_match_freq{$a} < $st_nearest_match_freq{$b} ) {
157 3         4 $b;
158             } else {
159 3         13 min ($a, $b);
160             }
161 4         40 } keys %st_nearest_match_freq;
162             }
163              
164 6         181 $self->nearest_sequence_type($best_sequence_type);
165 6         149 return undef;
166             }
167              
168 13     13   96 no Moose;
  13         19  
  13         127  
169             __PACKAGE__->meta->make_immutable;
170             1;
171              
172             __END__
173              
174             =pod
175              
176             =encoding UTF-8
177              
178             =head1 NAME
179              
180             Bio::MLST::SequenceType - Take in a list of matched alleles and look up the sequence type from the profile.
181              
182             =head1 VERSION
183              
184             version 2.1.1630910
185              
186             =head1 SYNOPSIS
187              
188             Take in a list of matched alleles and look up the sequence type from the profile.
189              
190             use Bio::MLST::SequenceType;
191             my $st = Bio::MLST::SequenceType->new(
192             profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
193             matching_names => ['adk-2','purA-3','recA-1'],
194             non_matching_names => []
195             );
196             $st->sequence_type();
197              
198             =head1 METHODS
199              
200             =head2 allele_to_number
201              
202             Maps the allele name to the corresponding locus sequence number.
203              
204             =head2 sequence_type
205              
206             Returns the sequence type (an integer).
207              
208             =head2 nearest_sequence_type
209              
210             Returns the nearest matching sequence type if there is no exact match, randomly chosen if there is more than 1 with equal identity.
211              
212             =head1 AUTHOR
213              
214             Andrew J. Page <ap13@sanger.ac.uk>
215              
216             =head1 COPYRIGHT AND LICENSE
217              
218             This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
219              
220             This is free software, licensed under:
221              
222             The GNU General Public License, Version 3, June 2007
223              
224             =cut