File Coverage

blib/lib/Bio/GeneDesign/PrefixTree.pm
Criterion Covered Total %
statement 63 63 100.0
branch 19 20 95.0
condition 10 11 90.9
subroutine 7 7 100.0
pod 4 4 100.0
total 103 105 98.1


line stmt bran cond sub pod time code
1             #
2             # GeneDesign module for sequence segmentation
3             #
4              
5             =head1 NAME
6              
7             GeneDesign::PrefixTree - A suffix tree implementation for nucleotide searching
8              
9             =head1 VERSION
10              
11             Version 5.52
12              
13             =head1 DESCRIPTION
14              
15             GeneDesign uses this object to parse peptide sequences for restriction enzyme
16             recognition site possibilities
17              
18             =head1 AUTHOR
19              
20             Sarah Richardson <SMRichardson@lbl.gov>
21              
22             =cut
23              
24             package Bio::GeneDesign::PrefixTree;
25              
26 11     11   305 use 5.006;
  11         71  
  11         429  
27 11     11   60 use strict;
  11         25  
  11         299  
28 11     11   59 use warnings;
  11         20  
  11         7586  
29              
30             my $VERSION = 5.52;
31              
32             =head1 Functions
33              
34             =head2 new
35              
36             Create a new suffix tree object.
37              
38             my $tree = Bio::GeneDesign::PrefixTree->new();
39              
40             =cut
41              
42             sub new
43             {
44 2     2 1 4 my ($class) = @_;
45 2         6 my $self = { root => {} };
46 2         5 bless $self, $class;
47 2         7 return $self;
48             }
49              
50             =head2 add_prefix
51              
52             Add suffixes to the tree. You can add a sequence, an id (which can be an array
53             reference of ids), and a scalar note.
54            
55             $tree->add_prefix('GGATCC', 'BamHI', "i hope this didn't pop up");
56             $tree->add_prefix('GGCCC', ['OhnoI', 'WoopsII'], "I hope these pop up");
57              
58             =cut
59              
60             sub add_prefix
61             {
62 288     288 1 775 my ($self, $sequence, $id, $note) = @_;
63 288 50       982 my @ids = ref($id) eq "ARRAY" ? @$id : ($id);
64 288         467 my $next = $self->{ root };
65 288         326 my $offset = 0;
66 288         344 my $len = length($sequence);
67 288         653 while ($offset < $len)
68             {
69 772         1047 my $char = substr($sequence, $offset, 1);
70 772 100       1990 $next->{$char} = {} unless exists $next->{$char};
71 772         1337 $next = $next->{$char};
72 772         1668 $offset++;
73             }
74 288 100       923 $next->{ids} = {} unless exists $next->{ids};
75 288         526 foreach my $id (@ids)
76             {
77 288 100       1122 $next->{ids}->{$id} = [] unless (exists $next->{ids}->{$id});
78 288 100       791 push @{$next->{ids}->{$id}}, $note if ($note);
  278         1243  
79             }
80 288 100       892 $next->{sequence} = $sequence unless exists $next->{sequence};
81 288 100       811 $next->{count} = $next->{count} ? $next->{count} + 1 : 1;
82 288         1795 return;
83             }
84              
85             =head2 find_prefixes()
86              
87             Pass a sequence to the tree and find the positions of hits. It will return an
88             array that is made up of array references; each array reference represents a hit
89             where the 0 index is the name of the subsequence in the tree, the 1 index is the
90             offset in the query sequence (1 BASED, NOT 0 BASED), the 2 index is the
91             subsequence, and the 3 index is whatever note is associated with the
92             subsequence.
93              
94             my @hits = $tree->find_prefixes('AAAGGATCCATCGCATACGAGGCCCCACCG');
95            
96             # @hits = (['BamHI', 4, 'GGATCC', 'i hope this didn't pop up'],
97             # ['OhnoI', 21, 'GGCCC', 'I hope these pop up'],
98             # ['WoopsII', 21, 'GGCCC', 'I hope these pop up']
99             #);
100              
101             =cut
102              
103             sub find_prefixes
104             {
105 2     2 1 6 my ($self, $sequence) = @_;
106 2         5 my @locations;
107 2         418 my @seq = split('', $sequence);
108 2         30 my $limit = scalar @seq;
109 2         8 for my $seq_idx (0..$limit)
110             {
111 802         908 my $cur_idx = $seq_idx;
112 802         1000 my $ref_idx = $seq[$seq_idx];
113 802         1124 my $ref = $self->{root};
114 802   100     3147 while (++$cur_idx < $limit and $ref)
115             {
116 2098 100       4089 if ($ref->{ids})
117             {
118 68         74 foreach my $id (keys %{$ref->{ids}})
  68         182  
119             {
120 83         139 my $notes = $ref->{ids}->{$id};
121 83         398 push @locations, [$id, $seq_idx + 1, $ref->{sequence}, $notes];
122             }
123             }
124 2098         2765 $ref_idx = $seq[$cur_idx];
125 2098         9654 $ref = $ref->{$ref_idx};
126             }
127             }
128 2         150 return @locations;
129             }
130              
131              
132             =head2 find_ntons
133              
134             Pass a number, n. The tree will report all of the sequences inside it that it
135             has only seen n times. The return value is a hash, where the keys are the
136             sequences and the values are hash references containing the ids and notes
137             associated with each sequence in the tree.
138              
139             my %twos = $tree->find_ntons(2);
140            
141             # %twos = ('GGGCC' => {'OhnoI' => ['I hope these pop up'],
142             # 'WoopsII' => ['I hope these pop up']}
143             # );
144              
145              
146             =cut
147              
148             sub find_ntons
149             {
150 2     2 1 4777 my ($self, $n) = @_;
151 2   50     13 $n = $n || 1;
152 2         4 my %ntons;
153 2         7 my $root = $self->{ root };
154 2         6 my @nodes = map { $root->{$_} }keys %{$root};
  42         80  
  2         19  
155 2         10 foreach my $node (@nodes)
156             {
157 592 100 100     2610 if (exists $node->{sequence} && $node->{count} == $n)
158             {
159 8         23 $ntons{$node->{sequence}} = $node->{ids};
160             }
161 550 100 100     1158 push @nodes, map { $node->{$_} }
  2056         14730  
162 592         1316 grep {$_ ne 'sequence' && $_ ne 'ids' && $_ ne 'count'}
163 592         634 keys %{$node};
164             }
165 2         69 return %ntons;
166             }
167             1;
168              
169             __END__
170              
171             =head1 COPYRIGHT AND LICENSE
172              
173             Copyright (c) 2013, GeneDesign developers
174             All rights reserved.
175              
176             Redistribution and use in source and binary forms, with or without modification,
177             are permitted provided that the following conditions are met:
178              
179             * Redistributions of source code must retain the above copyright notice, this
180             list of conditions and the following disclaimer.
181              
182             * Redistributions in binary form must reproduce the above copyright notice, this
183             list of conditions and the following disclaimer in the documentation and/or
184             other materials provided with the distribution.
185              
186             * The names of Johns Hopkins, the Joint Genome Institute, the Lawrence Berkeley
187             National Laboratory, the Department of Energy, and the GeneDesign developers may
188             not be used to endorse or promote products derived from this software without
189             specific prior written permission.
190              
191             THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
192             ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
193             WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
194             DISCLAIMED. IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT,
195             INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
196             LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
197             PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
198             LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
199             OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
200             ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
201              
202             =cut