File Coverage

lib/Bio/MLST/SequenceType.pm
Criterion Covered Total %
statement 92 96 95.8
branch 28 34 82.3
condition 2 2 100.0
subroutine 13 14 92.8
pod 0 1 0.0
total 135 147 91.8


line stmt bran cond sub pod time code
1             package Bio::MLST::SequenceType;
2             # ABSTRACT: Take in a list of matched alleles and look up the sequence type from the profile.
3             $Bio::MLST::SequenceType::VERSION = '2.1.1706216';
4              
5 12     12   97074 use Data::Dumper;
  12         16  
  12         915  
6 12     12   7162 use Text::CSV;
  12         109227  
  12         496  
7 12     12   69 use List::Util qw(min reduce);
  12         12  
  12         647  
8              
9 12     12   484 use Moose;
  12         290722  
  12         85  
10 12     12   54476 use Bio::MLST::Types;
  12         17  
  12         260  
11 12     12   3342 use Bio::MLST::FilterAlleles qw(is_metadata);
  12         22  
  12         8267  
12              
13             has 'profiles_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
14             has 'matching_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
15             has 'non_matching_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
16              
17             has 'allele_to_number' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_allele_to_number' );
18             has '_profiles' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__profiles' );
19             has 'sequence_type' => ( is => 'ro', isa => 'Maybe[Str]', lazy => 1, builder => '_build_sequence_type' );
20              
21             has 'nearest_sequence_type' => ( is => 'rw', isa => 'Maybe[Str]');
22             has 'report_lowest_st' => ( is => 'ro', isa => 'Bool', default => 0 );
23              
24             sub sequence_type_or_nearest
25             {
26 0     0 0 0 my($self) = @_;
27 0 0       0 return $self->sequence_type if(defined($self->sequence_type));
28             # If there isn't a perfect match, add a tilde to the sequence type
29 0 0       0 return $self->nearest_sequence_type."~" if(defined($self->nearest_sequence_type));
30 0         0 return $self->nearest_sequence_type;
31             }
32              
33             sub _build__profiles
34             {
35 8     8   47 my($self) = @_;
36 8 50       160 open(my $fh, $self->profiles_filename) or die "Couldnt open profile: ".$self->profiles_filename."\n";
37 8         90 my $csv_in = Text::CSV->new({sep_char=>"\t"});
38 8         777 my $profile = $csv_in->getline_all($fh);
39              
40 8         1520 return $profile;
41             }
42              
43             sub _build_allele_to_number
44             {
45 8     8   8 my($self) = @_;
46 8         8 my %allele_to_number;
47              
48 8         5 for my $sequence_name (@{$self->non_matching_names})
  8         135  
49             {
50 6         15 my @sequence_name_details = split(/[-_]/,$sequence_name);
51 6         7 my $num = pop @sequence_name_details;
52 6         7 my $name = join( "-", @sequence_name_details );
53 6         10 $allele_to_number{$name} = $num;
54             }
55              
56 8         9 for my $sequence_name (@{$self->matching_names})
  8         125  
57             {
58 19         33 my @sequence_name_details = split(/[-_]/,$sequence_name);
59 19         15 my $num = pop @sequence_name_details;
60 19         21 my $name = join( "-", @sequence_name_details );
61 19         29 $allele_to_number{$name} = $num;
62             }
63              
64             #print "ALLELE TO NUMBER: ";
65             #print Dumper \%allele_to_number;
66              
67 8         132 return \%allele_to_number;
68             }
69              
70             sub _allele_numbers_similar
71             {
72 123     123   114 my($self, $number_a, $number_b) = @_;
73 123 100       260 if ($number_a eq $number_b) {
    100          
    100          
74 2         7 return 1;
75             } elsif ("$number_a~" eq $number_b) {
76 1         3 return 1;
77             } elsif ("$number_b~" eq $number_a) {
78 18         31 return 1;
79             } else {
80 102         237 return 0;
81             }
82             }
83              
84             sub _build_sequence_type
85             {
86 8     8   9 my($self) = @_;
87              
88 8         7 my @header_row = @{$self->_profiles->[0]};
  8         129  
89              
90 8         18 for(my $i=0; $i< @header_row; $i++)
91             {
92 50 100       67 next if(is_metadata($header_row[$i]) == 1);
93 28         30 $header_row[$i] =~ s!_!!g;
94 28         48 $header_row[$i] =~ s!-!!g;
95             }
96              
97 8         7 my $num_loci = 0;
98 8         7 my %sequence_type_match_freq;
99             my %sequence_type_part_match_freq;
100              
101 8         8 for(my $row = 1; $row < @{$self->_profiles}; $row++)
  62         1014  
102             {
103 54         33 my @current_row = @{$self->_profiles->[$row]};
  54         798  
104 54         87 for(my $col = 0; $col< @current_row; $col++)
105             {
106 299 50       378 next unless(defined($header_row[$col]));
107 299 100       352 next if(is_metadata($header_row[$col]) == 1);
108 210 100       267 $num_loci++ if($row == 1);
109              
110 210         3495 my $allele_number = $self->allele_to_number->{$header_row[$col]};
111 210 100       282 next if(!defined($allele_number) );
112 192 100       291 if ($allele_number eq $current_row[$col]) {
    100          
113 75         154 $sequence_type_match_freq{$current_row[0]}++;
114             } elsif ($self->_allele_numbers_similar($allele_number, $current_row[$col])) {
115 17         35 $sequence_type_part_match_freq{$current_row[0]}++;
116             }
117             }
118             }
119              
120 8         16 return $self->_get_sequence_type_or_set_nearest_match(\%sequence_type_match_freq,
121             \%sequence_type_part_match_freq,
122             $num_loci);
123             }
124              
125             sub _get_sequence_type_or_set_nearest_match
126             {
127 8     8   9 my($self,$st_match_f, $st_part_match_f, $num_loci) = @_;
128 8         4 my %st_match_freq = %{$st_match_f};
  8         27  
129              
130             # Combine the frequencies of the perfect matches (%st_match_freq) and the
131             # partial matches ($st_part_match_f)
132 8         8 my %st_nearest_match_freq = %{$st_match_f};
  8         16  
133 8         5 while (my($sequence_type, $freq) = each(%{$st_part_match_f})) {
  20         36  
134 12   100     25 my $nearest_match_frequency = ( $st_nearest_match_freq{$sequence_type} || 0 );
135 12         15 $st_nearest_match_freq{$sequence_type} = $nearest_match_frequency + $freq;
136             }
137              
138             # if $num_loci is in $st_match_freq vals, return that, otherwise return lowest numbered sequence type
139 8         14 while (my($sequence_type, $freq) = each(%st_match_freq)) {
140 26 100       51 if ($freq == $num_loci) {
141 2         41 return $sequence_type;
142             }
143             }
144 6         2 my $best_sequence_type;
145 6 100       112 if ( $self->report_lowest_st ){
146              
147 2         16 $best_sequence_type = min (keys %st_nearest_match_freq);
148             }
149             else {
150             # This reduce takes pairs of sequence types and compares them. It looks
151             # for the ST with the highest number of matching alleles; if two matches
152             # are just as good, it picks the ST with the smaller number.
153             $best_sequence_type = reduce {
154 12 100   12   21 if ( $st_nearest_match_freq{$a} > $st_nearest_match_freq{$b} ) {
    100          
155 7         8 $a;
156             } elsif ( $st_nearest_match_freq{$a} < $st_nearest_match_freq{$b} ) {
157 3         3 $b;
158             } else {
159 2         7 min ($a, $b);
160             }
161 4         31 } keys %st_nearest_match_freq;
162             }
163              
164 6         123 $self->nearest_sequence_type($best_sequence_type);
165 6         110 return undef;
166             }
167              
168 12     12   57 no Moose;
  12         14  
  12         66  
169             __PACKAGE__->meta->make_immutable;
170             1;
171              
172             __END__
173              
174             =pod
175              
176             =encoding UTF-8
177              
178             =head1 NAME
179              
180             Bio::MLST::SequenceType - Take in a list of matched alleles and look up the sequence type from the profile.
181              
182             =head1 VERSION
183              
184             version 2.1.1706216
185              
186             =head1 SYNOPSIS
187              
188             Take in a list of matched alleles and look up the sequence type from the profile.
189              
190             use Bio::MLST::SequenceType;
191             my $st = Bio::MLST::SequenceType->new(
192             profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
193             matching_names => ['adk-2','purA-3','recA-1'],
194             non_matching_names => []
195             );
196             $st->sequence_type();
197              
198             =head1 METHODS
199              
200             =head2 allele_to_number
201              
202             Maps the allele name to the corresponding locus sequence number.
203              
204             =head2 sequence_type
205              
206             Returns the sequence type (an integer).
207              
208             =head2 nearest_sequence_type
209              
210             Returns the nearest matching sequence type if there is no exact match, randomly chosen if there is more than 1 with equal identity.
211              
212             =head1 AUTHOR
213              
214             Andrew J. Page <ap13@sanger.ac.uk>
215              
216             =head1 COPYRIGHT AND LICENSE
217              
218             This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
219              
220             This is free software, licensed under:
221              
222             The GNU General Public License, Version 3, June 2007
223              
224             =cut