File Coverage

blib/lib/Bio/GeneDesign/PrefixTree.pm
Criterion Covered Total %
statement 65 65 100.0
branch 19 20 95.0
condition 10 11 90.9
subroutine 7 7 100.0
pod 4 4 100.0
total 105 107 98.1


line stmt bran cond sub pod time code
1             #
2             # GeneDesign module for sequence segmentation
3             #
4              
5             =head1 NAME
6              
7             GeneDesign::PrefixTree - A suffix tree implementation for nucleotide searching
8              
9             =head1 VERSION
10              
11             Version 5.56
12              
13             =head1 DESCRIPTION
14              
15             GeneDesign uses this object to parse peptide sequences for restriction enzyme
16             recognition site possibilities
17              
18             =head1 AUTHOR
19              
20             Sarah Richardson
21              
22             =cut
23              
24             package Bio::GeneDesign::PrefixTree;
25              
26 11     11   254 use 5.006;
  11         68  
27 11     11   50 use strict;
  11         22  
  11         218  
28 11     11   53 use warnings;
  11         22  
  11         6403  
29              
30             my $VERSION = 5.56;
31              
32             =head1 Functions
33              
34             =head2 new
35              
36             Create a new suffix tree object.
37              
38             my $tree = Bio::GeneDesign::PrefixTree->new();
39              
40             =cut
41              
42             sub new
43             {
44 2     2 1 6 my ($class) = @_;
45 2         6 my $self = { root => {} };
46 2         5 bless $self, $class;
47 2         6 return $self;
48             }
49              
50             =head2 add_prefix
51              
52             Add suffixes to the tree. You can add a sequence, an id (which can be an array
53             reference of ids), and a scalar note.
54              
55             $tree->add_prefix('GGATCC', 'BamHI', "i hope this didn't pop up");
56             $tree->add_prefix('GGCCC', ['OhnoI', 'WoopsII'], "I hope these pop up");
57              
58             =cut
59              
60             sub add_prefix
61             {
62 288     288 1 805 my ($self, $sequence, $id, $note) = @_;
63 288 50       693 my @ids = ref($id) eq "ARRAY" ? @$id : ($id);
64 288         402 my $next = $self->{ root };
65 288         302 my $offset = 0;
66 288         298 my $len = length($sequence);
67 288         455 while ($offset < $len)
68             {
69 772         967 my $char = substr($sequence, $offset, 1);
70 772 100       1304 $next->{$char} = {} unless exists $next->{$char};
71 772         908 $next = $next->{$char};
72 772         1214 $offset++;
73             }
74 288 100       545 $next->{ids} = {} unless exists $next->{ids};
75 288         408 foreach my $id (@ids)
76             {
77 288 100       641 $next->{ids}->{$id} = [] unless (exists $next->{ids}->{$id});
78 288 100       516 push @{$next->{ids}->{$id}}, $note if ($note);
  278         701  
79             }
80 288 100       521 $next->{sequence} = $sequence unless exists $next->{sequence};
81 288 100       458 $next->{count} = $next->{count} ? $next->{count} + 1 : 1;
82 288         883 return;
83             }
84              
85             =head2 find_prefixes()
86              
87             Pass a sequence to the tree and find the positions of hits. It will return an
88             array that is made up of array references; each array reference represents a hit
89             where the 0 index is the name of the subsequence in the tree, the 1 index is the
90             offset in the query sequence (1 BASED, NOT 0 BASED), the 2 index is the
91             subsequence, and the 3 index is whatever note is associated with the
92             subsequence.
93              
94             my @hits = $tree->find_prefixes('AAAGGATCCATCGCATACGAGGCCCCACCG');
95              
96             # @hits = (['BamHI', 4, 'GGATCC', 'i hope this didn't pop up'],
97             # ['OhnoI', 21, 'GGCCC', 'I hope these pop up'],
98             # ['WoopsII', 21, 'GGCCC', 'I hope these pop up']
99             #);
100              
101             =cut
102              
103             sub find_prefixes
104             {
105 2     2 1 6 my ($self, $sequence) = @_;
106 2         4 my @locations;
107 2         105 my @seq = split('', $sequence);
108 2         5 my $limit = scalar @seq;
109 2         7 for my $seq_idx (0..$limit)
110             {
111 802         901 my $cur_idx = $seq_idx;
112 802         915 my $ref_idx = $seq[$seq_idx];
113 802         873 my $ref = $self->{root};
114 802   100     1748 while (++$cur_idx < $limit and $ref)
115             {
116 2098 100       2947 if ($ref->{ids})
117             {
118 68         76 foreach my $id (sort {$a cmp $b} keys %{$ref->{ids}})
  15         33  
  68         156  
119             {
120 83         111 my $notes = $ref->{ids}->{$id};
121 83         224 push @locations, [$id, $seq_idx + 1, $ref->{sequence}, $notes];
122             }
123             }
124 2098         2288 $ref_idx = $seq[$cur_idx];
125 2098         4924 $ref = $ref->{$ref_idx};
126             }
127             }
128 2         51 return @locations;
129             }
130              
131              
132             =head2 find_ntons
133              
134             Pass a number, n. The tree will report all of the sequences inside it that it
135             has only seen n times. The return value is a hash, where the keys are the
136             sequences and the values are hash references containing the ids and notes
137             associated with each sequence in the tree.
138              
139             my %twos = $tree->find_ntons(2);
140              
141             # %twos = ('GGGCC' => {'OhnoI' => ['I hope these pop up'],
142             # 'WoopsII' => ['I hope these pop up']}
143             # );
144              
145              
146             =cut
147              
148             sub find_ntons
149             {
150 2     2 1 22347 my ($self, $n) = @_;
151 2   50     8 $n = $n || 1;
152 2         4 my %ntons;
153 2         3 my $root = $self->{ root };
154 2         4 my @nodes = map { $root->{$_} } sort {$a cmp $b} keys %{$root};
  42         57  
  134         146  
  2         25  
155 2         7 foreach my $node (@nodes)
156             {
157 592 100 100     1490 if (exists $node->{sequence} && $node->{count} == $n)
158             {
159 8         18 $ntons{$node->{sequence}} = $node->{ids};
160             }
161 550         865 push @nodes, map { $node->{$_} }
162 2056 100 100     5940 grep {$_ ne 'sequence' && $_ ne 'ids' && $_ ne 'count'}
163 592         698 sort {$a cmp $b} keys %{$node};
  2366         2851  
  592         1484  
164             }
165 2         26 return %ntons;
166             }
167             1;
168              
169             __END__