File Coverage

lib/Bio/MLST/SequenceType.pm

Criterion	Covered	Total	%
statement	92	96	95.8
branch	28	34	82.3
condition	2	2	100.0
subroutine	13	14	92.8
pod	0	1	0.0
total	135	147	91.8

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MLST::SequenceType;
2							# ABSTRACT: Take in a list of matched alleles and look up the sequence type from the profile.
3							$Bio::MLST::SequenceType::VERSION = '2.1.1706216';
4
5	12			12		97074	use Data::Dumper;
	12					16
	12					915
6	12			12		7162	use Text::CSV;
	12					109227
	12					496
7	12			12		69	use List::Util qw(min reduce);
	12					12
	12					647
8
9	12			12		484	use Moose;
	12					290722
	12					85
10	12			12		54476	use Bio::MLST::Types;
	12					17
	12					260
11	12			12		3342	use Bio::MLST::FilterAlleles qw(is_metadata);
	12					22
	12					8267
12
13							has 'profiles_filename' => ( is => 'ro', isa => 'Bio::MLST::File', required => 1 );
14							has 'matching_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
15							has 'non_matching_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
16
17							has 'allele_to_number' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_allele_to_number' );
18							has '_profiles' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__profiles' );
19							has 'sequence_type' => ( is => 'ro', isa => 'Maybe[Str]', lazy => 1, builder => '_build_sequence_type' );
20
21							has 'nearest_sequence_type' => ( is => 'rw', isa => 'Maybe[Str]');
22							has 'report_lowest_st' => ( is => 'ro', isa => 'Bool', default => 0 );
23
24							sub sequence_type_or_nearest
25							{
26	0			0	0	0	my($self) = @_;
27	0	0				0	return $self->sequence_type if(defined($self->sequence_type));
28							# If there isn't a perfect match, add a tilde to the sequence type
29	0	0				0	return $self->nearest_sequence_type."~" if(defined($self->nearest_sequence_type));
30	0					0	return $self->nearest_sequence_type;
31							}
32
33							sub _build__profiles
34							{
35	8			8		47	my($self) = @_;
36	8	50				160	open(my $fh, $self->profiles_filename) or die "Couldnt open profile: ".$self->profiles_filename."\n";
37	8					90	my $csv_in = Text::CSV->new({sep_char=>"\t"});
38	8					777	my $profile = $csv_in->getline_all($fh);
39
40	8					1520	return $profile;
41							}
42
43							sub _build_allele_to_number
44							{
45	8			8		8	my($self) = @_;
46	8					8	my %allele_to_number;
47
48	8					5	for my $sequence_name (@{$self->non_matching_names})
	8					135
49							{
50	6					15	my @sequence_name_details = split(/[-_]/,$sequence_name);
51	6					7	my $num = pop @sequence_name_details;
52	6					7	my $name = join( "-", @sequence_name_details );
53	6					10	$allele_to_number{$name} = $num;
54							}
55
56	8					9	for my $sequence_name (@{$self->matching_names})
	8					125
57							{
58	19					33	my @sequence_name_details = split(/[-_]/,$sequence_name);
59	19					15	my $num = pop @sequence_name_details;
60	19					21	my $name = join( "-", @sequence_name_details );
61	19					29	$allele_to_number{$name} = $num;
62							}
63
64							#print "ALLELE TO NUMBER: ";
65							#print Dumper \%allele_to_number;
66
67	8					132	return \%allele_to_number;
68							}
69
70							sub _allele_numbers_similar
71							{
72	123			123		114	my($self, $number_a, $number_b) = @_;
73	123	100				260	if ($number_a eq $number_b) {
		100
		100
74	2					7	return 1;
75							} elsif ("$number_a~" eq $number_b) {
76	1					3	return 1;
77							} elsif ("$number_b~" eq $number_a) {
78	18					31	return 1;
79							} else {
80	102					237	return 0;
81							}
82							}
83
84							sub _build_sequence_type
85							{
86	8			8		9	my($self) = @_;
87
88	8					7	my @header_row = @{$self->_profiles->[0]};
	8					129
89
90	8					18	for(my $i=0; $i< @header_row; $i++)
91							{
92	50	100				67	next if(is_metadata($header_row[$i]) == 1);
93	28					30	$header_row[$i] =~ s!_!!g;
94	28					48	$header_row[$i] =~ s!-!!g;
95							}
96
97	8					7	my $num_loci = 0;
98	8					7	my %sequence_type_match_freq;
99							my %sequence_type_part_match_freq;
100
101	8					8	for(my $row = 1; $row < @{$self->_profiles}; $row++)
	62					1014
102							{
103	54					33	my @current_row = @{$self->_profiles->[$row]};
	54					798
104	54					87	for(my $col = 0; $col< @current_row; $col++)
105							{
106	299	50				378	next unless(defined($header_row[$col]));
107	299	100				352	next if(is_metadata($header_row[$col]) == 1);
108	210	100				267	$num_loci++ if($row == 1);
109
110	210					3495	my $allele_number = $self->allele_to_number->{$header_row[$col]};
111	210	100				282	next if(!defined($allele_number) );
112	192	100				291	if ($allele_number eq $current_row[$col]) {
		100
113	75					154	$sequence_type_match_freq{$current_row[0]}++;
114							} elsif ($self->_allele_numbers_similar($allele_number, $current_row[$col])) {
115	17					35	$sequence_type_part_match_freq{$current_row[0]}++;
116							}
117							}
118							}
119
120	8					16	return $self->_get_sequence_type_or_set_nearest_match(\%sequence_type_match_freq,
121							\%sequence_type_part_match_freq,
122							$num_loci);
123							}
124
125							sub _get_sequence_type_or_set_nearest_match
126							{
127	8			8		9	my($self,$st_match_f, $st_part_match_f, $num_loci) = @_;
128	8					4	my %st_match_freq = %{$st_match_f};
	8					27
129
130							# Combine the frequencies of the perfect matches (%st_match_freq) and the
131							# partial matches ($st_part_match_f)
132	8					8	my %st_nearest_match_freq = %{$st_match_f};
	8					16
133	8					5	while (my($sequence_type, $freq) = each(%{$st_part_match_f})) {
	20					36
134	12		100			25	my $nearest_match_frequency = ( $st_nearest_match_freq{$sequence_type} \|\| 0 );
135	12					15	$st_nearest_match_freq{$sequence_type} = $nearest_match_frequency + $freq;
136							}
137
138							# if $num_loci is in $st_match_freq vals, return that, otherwise return lowest numbered sequence type
139	8					14	while (my($sequence_type, $freq) = each(%st_match_freq)) {
140	26	100				51	if ($freq == $num_loci) {
141	2					41	return $sequence_type;
142							}
143							}
144	6					2	my $best_sequence_type;
145	6	100				112	if ( $self->report_lowest_st ){
146
147	2					16	$best_sequence_type = min (keys %st_nearest_match_freq);
148							}
149							else {
150							# This reduce takes pairs of sequence types and compares them. It looks
151							# for the ST with the highest number of matching alleles; if two matches
152							# are just as good, it picks the ST with the smaller number.
153							$best_sequence_type = reduce {
154	12	100		12		21	if ( $st_nearest_match_freq{$a} > $st_nearest_match_freq{$b} ) {
		100
155	7					8	$a;
156							} elsif ( $st_nearest_match_freq{$a} < $st_nearest_match_freq{$b} ) {
157	3					3	$b;
158							} else {
159	2					7	min ($a, $b);
160							}
161	4					31	} keys %st_nearest_match_freq;
162							}
163
164	6					123	$self->nearest_sequence_type($best_sequence_type);
165	6					110	return undef;
166							}
167
168	12			12		57	no Moose;
	12					14
	12					66
169							__PACKAGE__->meta->make_immutable;
170							1;
171
172							__END__
173
174							=pod
175
176							=encoding UTF-8
177
178							=head1 NAME
179
180							Bio::MLST::SequenceType - Take in a list of matched alleles and look up the sequence type from the profile.
181
182							=head1 VERSION
183
184							version 2.1.1706216
185
186							=head1 SYNOPSIS
187
188							Take in a list of matched alleles and look up the sequence type from the profile.
189
190							use Bio::MLST::SequenceType;
191							my $st = Bio::MLST::SequenceType->new(
192							profiles_filename => 't/data/Escherichia_coli_1/profiles/escherichia_coli.txt',
193							matching_names => ['adk-2','purA-3','recA-1'],
194							non_matching_names => []
195							);
196							$st->sequence_type();
197
198							=head1 METHODS
199
200							=head2 allele_to_number
201
202							Maps the allele name to the corresponding locus sequence number.
203
204							=head2 sequence_type
205
206							Returns the sequence type (an integer).
207
208							=head2 nearest_sequence_type
209
210							Returns the nearest matching sequence type if there is no exact match, randomly chosen if there is more than 1 with equal identity.
211
212							=head1 AUTHOR
213
214							Andrew J. Page <ap13@sanger.ac.uk>
215
216							=head1 COPYRIGHT AND LICENSE
217
218							This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.
219
220							This is free software, licensed under:
221
222							The GNU General Public License, Version 3, June 2007
223
224							=cut