File Coverage

blib/lib/Bio/GeneDesign/PrefixTree.pm

Criterion	Covered	Total	%
statement	65	65	100.0
branch	19	20	95.0
condition	10	11	90.9
subroutine	7	7	100.0
pod	4	4	100.0
total	105	107	98.1

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# GeneDesign module for sequence segmentation
3							#
4
5							=head1 NAME
6
7							GeneDesign::PrefixTree - A suffix tree implementation for nucleotide searching
8
9							=head1 VERSION
10
11							Version 5.56
12
13							=head1 DESCRIPTION
14
15							GeneDesign uses this object to parse peptide sequences for restriction enzyme
16							recognition site possibilities
17
18							=head1 AUTHOR
19
20							Sarah Richardson
21
22							=cut
23
24							package Bio::GeneDesign::PrefixTree;
25
26	11			11		254	use 5.006;
	11					68
27	11			11		50	use strict;
	11					22
	11					218
28	11			11		53	use warnings;
	11					22
	11					6403
29
30							my $VERSION = 5.56;
31
32							=head1 Functions
33
34							=head2 new
35
36							Create a new suffix tree object.
37
38							my $tree = Bio::GeneDesign::PrefixTree->new();
39
40							=cut
41
42							sub new
43							{
44	2			2	1	6	my ($class) = @_;
45	2					6	my $self = { root => {} };
46	2					5	bless $self, $class;
47	2					6	return $self;
48							}
49
50							=head2 add_prefix
51
52							Add suffixes to the tree. You can add a sequence, an id (which can be an array
53							reference of ids), and a scalar note.
54
55							$tree->add_prefix('GGATCC', 'BamHI', "i hope this didn't pop up");
56							$tree->add_prefix('GGCCC', ['OhnoI', 'WoopsII'], "I hope these pop up");
57
58							=cut
59
60							sub add_prefix
61							{
62	288			288	1	805	my ($self, $sequence, $id, $note) = @_;
63	288	50				693	my @ids = ref($id) eq "ARRAY" ? @$id : ($id);
64	288					402	my $next = $self->{ root };
65	288					302	my $offset = 0;
66	288					298	my $len = length($sequence);
67	288					455	while ($offset < $len)
68							{
69	772					967	my $char = substr($sequence, $offset, 1);
70	772	100				1304	$next->{$char} = {} unless exists $next->{$char};
71	772					908	$next = $next->{$char};
72	772					1214	$offset++;
73							}
74	288	100				545	$next->{ids} = {} unless exists $next->{ids};
75	288					408	foreach my $id (@ids)
76							{
77	288	100				641	$next->{ids}->{$id} = [] unless (exists $next->{ids}->{$id});
78	288	100				516	push @{$next->{ids}->{$id}}, $note if ($note);
	278					701
79							}
80	288	100				521	$next->{sequence} = $sequence unless exists $next->{sequence};
81	288	100				458	$next->{count} = $next->{count} ? $next->{count} + 1 : 1;
82	288					883	return;
83							}
84
85							=head2 find_prefixes()
86
87							Pass a sequence to the tree and find the positions of hits. It will return an
88							array that is made up of array references; each array reference represents a hit
89							where the 0 index is the name of the subsequence in the tree, the 1 index is the
90							offset in the query sequence (1 BASED, NOT 0 BASED), the 2 index is the
91							subsequence, and the 3 index is whatever note is associated with the
92							subsequence.
93
94							my @hits = $tree->find_prefixes('AAAGGATCCATCGCATACGAGGCCCCACCG');
95
96							# @hits = (['BamHI', 4, 'GGATCC', 'i hope this didn't pop up'],
97							# ['OhnoI', 21, 'GGCCC', 'I hope these pop up'],
98							# ['WoopsII', 21, 'GGCCC', 'I hope these pop up']
99							#);
100
101							=cut
102
103							sub find_prefixes
104							{
105	2			2	1	6	my ($self, $sequence) = @_;
106	2					4	my @locations;
107	2					105	my @seq = split('', $sequence);
108	2					5	my $limit = scalar @seq;
109	2					7	for my $seq_idx (0..$limit)
110							{
111	802					901	my $cur_idx = $seq_idx;
112	802					915	my $ref_idx = $seq[$seq_idx];
113	802					873	my $ref = $self->{root};
114	802		100			1748	while (++$cur_idx < $limit and $ref)
115							{
116	2098	100				2947	if ($ref->{ids})
117							{
118	68					76	foreach my $id (sort {$a cmp $b} keys %{$ref->{ids}})
	15					33
	68					156
119							{
120	83					111	my $notes = $ref->{ids}->{$id};
121	83					224	push @locations, [$id, $seq_idx + 1, $ref->{sequence}, $notes];
122							}
123							}
124	2098					2288	$ref_idx = $seq[$cur_idx];
125	2098					4924	$ref = $ref->{$ref_idx};
126							}
127							}
128	2					51	return @locations;
129							}
130
131
132							=head2 find_ntons
133
134							Pass a number, n. The tree will report all of the sequences inside it that it
135							has only seen n times. The return value is a hash, where the keys are the
136							sequences and the values are hash references containing the ids and notes
137							associated with each sequence in the tree.
138
139							my %twos = $tree->find_ntons(2);
140
141							# %twos = ('GGGCC' => {'OhnoI' => ['I hope these pop up'],
142							# 'WoopsII' => ['I hope these pop up']}
143							# );
144
145
146							=cut
147
148							sub find_ntons
149							{
150	2			2	1	22347	my ($self, $n) = @_;
151	2		50			8	$n = $n \|\| 1;
152	2					4	my %ntons;
153	2					3	my $root = $self->{ root };
154	2					4	my @nodes = map { $root->{$_} } sort {$a cmp $b} keys %{$root};
	42					57
	134					146
	2					25
155	2					7	foreach my $node (@nodes)
156							{
157	592	100	100			1490	if (exists $node->{sequence} && $node->{count} == $n)
158							{
159	8					18	$ntons{$node->{sequence}} = $node->{ids};
160							}
161	550					865	push @nodes, map { $node->{$_} }
162	2056	100	100			5940	grep {$_ ne 'sequence' && $_ ne 'ids' && $_ ne 'count'}
163	592					698	sort {$a cmp $b} keys %{$node};
	2366					2851
	592					1484
164							}
165	2					26	return %ntons;
166							}
167							1;
168
169							__END__