File Coverage

blib/lib/MEME/Alphabet.pm
Criterion Covered Total %
statement 460 812 56.6
branch 123 316 38.9
condition 60 113 53.1
subroutine 32 60 53.3
pod 0 42 0.0
total 675 1343 50.2


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             MEME::Alphabet - Provides nucleobase alphabet capabilities for Perl code.
6              
7             =head1 AUTHORS
8              
9             James Johnson and Timothy L. Bailey.
10              
11             =head1 LICENSE
12              
13             Copyright
14             (1994 - 2017) The Regents of the University of California.
15             All Rights Reserved.
16              
17             Permission to use, copy, modify, and distribute any part of
18             this software for educational, research and non-profit purposes,
19             without fee, and without a written agreement is hereby granted,
20             provided that the above copyright notice, this paragraph and
21             the following three paragraphs appear in all copies.
22              
23             Those desiring to incorporate this software into commercial
24             products or use for commercial purposes should contact the
25             Technology Transfer Office, University of California, San Diego,
26             9500 Gilman Drive, La Jolla, California, 92093-0910,
27             Ph: (619) 534 5815.
28              
29             IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO
30             ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR
31             CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF
32             THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA
33             HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34              
35             THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE
36             UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE
37             MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
38             THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND
39             EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED,
40             INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
41             MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT
42             THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT,
43             TRADEMARK OR OTHER RIGHTS.
44              
45             =cut
46              
47             package MEME::Alphabet;
48              
49 2     2   48231 use strict;
  2         3  
  2         53  
50 2     2   10 use warnings;
  2         3  
  2         46  
51 2     2   8 use feature 'unicode_strings';
  2         8  
  2         142  
52              
53 2     2   9 use Carp;
  2         4  
  2         106  
54 2     2   11 use Fcntl qw(O_RDONLY);
  2         2  
  2         70  
55              
56             # pinned to the latest MEME Suite version (x.yy.z), and then versioned from there
57 2     2   478 use version; our $VERSION = version->declare("v4.12.0.1.3");
  2         2666  
  2         9  
58              
59             require Exporter;
60             our @ISA = qw(Exporter);
61             our @EXPORT_OK = qw(rna dna moddna protein);
62              
63             my $SYM_RE = qr/[A-Za-z0-9\?\.\*\-]/;
64             my $COLOUR_RE = qr/[0-9A-Fa-f]{6}/;
65             my $NAME_RE = qr/"((?:[^\\"]+|\\["\/bfnrt]|\\u[0-9A-Fa-f]{4})*)"/;
66             my $CORE_RE = qr/($SYM_RE)(?:\s+$NAME_RE)?(?:\s+($COLOUR_RE))?/;
67             my $COMMENT_RE = qr/^(\s*(?:[^#](?:[^"#]*|"(?:\\.|[^"\\]*)*")*)?)#.*$/;
68             my $HEADER_RE = qr/^\s*ALPHABET(?:\s+v1)?(?:\s+$NAME_RE)?(?:\s+(RNA|DNA|PROTEIN)-LIKE)?\s*$/;
69             my $CORE_SINGLE_RE = qr/^\s*$CORE_RE\s*$/;
70             my $CORE_PAIR_RE = qr/^\s*$CORE_RE\s*~\s*$CORE_RE\s*$/;
71             my $AMBIG_RE = qr/^\s*$CORE_RE\s*=\s*($SYM_RE*)\s*$/;
72              
73             sub new {
74 5     5 0 407 my $classname = shift;
75 5         8 my $self = {};
76 5         8 bless($self, $classname);
77 5         12 $self->_init(@_);
78 5         9 return $self;
79             }
80              
81             sub _init {
82 5     5   8 my $self = shift;
83 5         8 my ($file) = @_;
84 5         12 $self->{errors} = [];
85 5         8 $self->{file_name} = undef;
86 5         6 $self->{line_no} = 0;
87 5         7 $self->{seen_header} = 0; #FALSE
88 5         6 $self->{seen_symbol} = 0; #FALSE
89 5         8 $self->{seen_ambig} = 0; #FALSE
90 5         6 $self->{seen_lc} = 0; #FALSE
91 5         10 $self->{seen_uc} = 0; #FALSE
92 5         5 $self->{name} = undef;
93 5         8 $self->{sym_lookup} = {};
94 5         9 $self->{core_syms} = [];
95 5         6 $self->{ambig_syms} = [];
96 5         7 $self->{parsed} = 0; #FALSE
97 5 50       12 if (defined($file)) {
98 0         0 $self->parse_file($file);
99 0 0       0 if (@{$self->{errors}}) {
  0         0  
100 0         0 foreach my $error ( @{$self->{errors}} ) {
  0         0  
101 0         0 warn($error);
102             }
103 0         0 croak("Invalid alphabet file \"$file\"\n");
104             }
105             }
106             }
107              
108             sub _error {
109 0     0   0 my $self = shift;
110 0         0 my ($reason) = @_;
111 0         0 my $line_info = '';
112 0         0 my $file_info = '';
113 0 0       0 $line_info = 'on line ' . $self->{line_no} . ' ' if ($self->{line_no});
114 0 0       0 $file_info = 'file "' . $self->{file_name} . '"' if ($self->{file_name});
115 0         0 push(@{$self->{errors}}, "Invalid alphabet $file_info- $line_info$reason\n");
  0         0  
116             }
117              
118             sub parse_header {
119 4     4 0 6 my $self = shift;
120 4         7 my ($name, $like) = @_;
121 4 50       9 die("Already finished parsing") if $self->{parsed};
122 4 50       11 $self->_error('repeated header') if ($self->{seen_header});
123 4 50       8 $self->_error('header after symbol') if ($self->{seen_symbol});
124 4 50 33     39 $self->_error('like must be "DNA", "RNA" or "PROTEIN" (ignoring case)') if ($like && $like !~ m/^(RNA|DNA|PROTEIN)$/i);
125 4         7 $self->{name} = $name;
126 4 50       15 $self->{like} = ($like ? uc($like) : '');
127 4         6 $self->{seen_header} = 1; #TRUE
128             }
129              
130             sub uniq {
131 60     60 0 69 my %seen;
132 60         264 grep !$seen{$_}++, @_;
133             }
134              
135             sub parse_symbol {
136 92     92 0 106 my $self = shift;
137 92         151 my ($sym, $name, $colour, $complement, $comprise, $aliases) = @_;
138 92 50       134 die("Already finished parsing") if $self->{parsed};
139 92 0 33     137 $self->_error('expected header but found symbol') unless ($self->{seen_header} || $self->{seen_symbol});
140 92 50       133 if (defined($self->{sym_lookup}->{$sym})) {
141 0         0 $self->_error("symbol $sym is already used");
142 0         0 return; # can't really proceed with a duplicate
143             }
144 92 100 66     175 $self->{seen_lc} = 1 if ($sym ge 'a' && $sym le 'z');
145 92 100 100     195 $self->{seen_uc} = 1 if ($sym ge 'A' && $sym le 'Z');
146 92         239 my $symbol = {sym => $sym, aliases => [], name => $name, colour => $colour};
147 92 100       151 if (defined($aliases)) {
148 8         10 push(@{$symbol->{aliases}}, split(//, $aliases));
  8         25  
149             }
150 92 100       119 if (defined($comprise)) { # ambiguous symbol
151 60         118 my @equals = split //, $comprise;
152 60         114 @equals = sort _symbol_cmp @equals;
153 60         105 @equals = &uniq(@equals);
154 60         122 for (my $i = 0; $i < scalar(@equals); $i++) {
155 160         220 my $csym = $self->{sym_lookup}->{$equals[$i]};
156 160 50       356 if (!defined($csym)) {
    50          
157 0         0 $self->_error('referenced symbol ' . $equals[$i] . ' is unknown');
158             } elsif (defined($csym->{comprise})) {
159 0         0 $self->_error('referenced symbol ' . $equals[$i] . ' is ambiguous');
160             }
161             }
162 60         124 $symbol->{comprise} = join('', @equals);
163 60         68 push(@{$self->{ambig_syms}}, $symbol);
  60         106  
164 60         77 $self->{seen_ambig} = 1;
165 60 50 33     118 if ($sym eq '?' && length($symbol->{comprise}) < scalar(@{$self->{core_syms}})) {
  0         0  
166 0         0 $self->_error('symbol ? is reserved as a wildcard');
167             }
168             } else { # core symbol
169 32 50       47 $self->_error('symbol ? is reserved as a wildcard') if ($sym eq '?');
170 32 50       48 $self->_error('unexpected core symbol (as ambiguous symbols seen)') if ($self->{seen_ambig});
171 32         39 $symbol->{complement} = $complement;
172 32         37 push(@{$self->{core_syms}}, $symbol);
  32         46  
173             }
174 92         135 $self->{sym_lookup}->{$sym} = $symbol;
175 92         132 $self->{seen_symbol} = 1;
176             }
177              
178             sub parse_line {
179 0     0 0 0 my $self = shift;
180 0 0       0 die("Already finished parsing") if $self->{parsed};
181 0         0 my ($line) = @_;
182 0         0 $self->{line_no} += 1;
183 0         0 $line =~ s/$COMMENT_RE/$1/;
184 0 0       0 return unless ($line =~ m/\S/); # skip empty lines
185 0 0       0 if ($line =~ m/$HEADER_RE/) {
    0          
    0          
    0          
186 0         0 $self->parse_header($1, $2);
187             } elsif ($line =~ m/$CORE_PAIR_RE/) {
188 0         0 $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3), $4);
189 0         0 $self->parse_symbol($4, &_decode_JSON_string($5), &_decode_colour($6), $1);
190             } elsif ($line =~ m/$CORE_SINGLE_RE/) {
191 0         0 $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3));
192             } elsif ($line =~ m/$AMBIG_RE/) {
193 0         0 $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3), undef, $4);
194             } else {
195 0         0 $self->_error('unrecognised pattern');
196             }
197             }
198              
199             sub _add_lookup {
200 106     106   160 my ($lookup, $target, $character, $both_case) = @_;
201 106         148 $lookup->{$character} = $target;
202 106 100       163 if ($both_case) {
203 36 100 66     80 if ($character ge 'A' && $character le 'Z') {
    50 33        
204 34         63 $lookup->{chr(ord($character) + 32)} = $target;
205             } elsif ($character ge 'a' && $character le 'z') {
206 0         0 $lookup->{chr(ord($character) - 32)} = $target;
207             }
208             }
209             }
210              
211             sub parse_done {
212 4     4 0 5 my $self = shift;
213 4 50       8 die("Already finished parsing") if $self->{parsed};
214             # now sort the symbols
215 4         5 @{$self->{core_syms}} = sort _symobj_cmp @{$self->{core_syms}};
  4         8  
  4         9  
216 4         8 @{$self->{ambig_syms}} = sort _symobj_cmp @{$self->{ambig_syms}};
  4         15  
  4         9  
217             # check if there is a wildcard
218 4 100 66     5 if (scalar(@{$self->{ambig_syms}}) == 0 || length($self->{ambig_syms}->[0]->{comprise}) != scalar(@{$self->{core_syms}})) {
  4         14  
  4         10  
219             #warn("Warning: wildcard symbol automatically generated\n");
220 2         2 my $wildcard_comprise = '';
221 2         4 for (my $i = 0; $i < scalar(@{$self->{core_syms}}); $i++) {
  26         37  
222 24         34 $wildcard_comprise .= $self->{core_syms}->[$i]->{sym};
223             }
224 2         2 unshift(@{$self->{ambig_syms}}, {sym => '?', aliases => [], comprise => $wildcard_comprise});
  2         11  
225             }
226             # create the final list of symbols and determine aliases
227 4         8 my %comprise_lookup = ();
228 4         4 $self->{ncore} = scalar(@{$self->{core_syms}});
  4         7  
229 4         12 $self->{syms} = [];
230 4         11 for (my $i = 0; $i < $self->{ncore}; $i++) {
231 32         40 my $sym = $self->{core_syms}->[$i];
232 32         46 $comprise_lookup{$sym->{sym}} = $sym;
233 32         34 $sym->{index} = scalar(@{$self->{syms}});
  32         52  
234 32         37 push(@{$self->{syms}}, $sym);
  32         57  
235             }
236 4         5 for (my $i = 0; $i < scalar(@{$self->{ambig_syms}}); $i++) {
  66         96  
237 62         69 my $sym = $self->{ambig_syms}->[$i];
238 62         72 my $real = $comprise_lookup{$sym->{comprise}};
239 62 50       81 if (defined($real)) {
240 0         0 push(@{$real->{aliases}}, $sym->{sym});
  0         0  
241             } else {
242 62         88 $comprise_lookup{$sym->{comprise}} = $sym;
243 62         65 $sym->{index} = scalar(@{$self->{syms}});
  62         86  
244 62         65 push(@{$self->{syms}}, $sym);
  62         94  
245             }
246             }
247             # cleanup some entries we don't need
248 4         16 delete $self->{sym_lookup};
249 4         7 delete $self->{core_syms};
250 4         7 delete $self->{ambig_syms};
251             # map comprising characters to objects
252 4         5 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         106  
253 62         74 my $sym = $self->{syms}->[$i];
254 62         106 my @list = map { $comprise_lookup{$_} } split(//, $sym->{comprise});
  184         248  
255 62         106 $sym->{comprise} = \@list;
256             }
257             #determine complements
258 4         4 my $fully_complementable = 1;
259 4         9 for (my $i = 0; $i < $self->{ncore}; $i++) {
260 32         40 my $sym = $self->{syms}->[$i];
261 32 50       50 unless (defined($sym->{complement})) {
262 0         0 $fully_complementable = 0;
263 0         0 next;
264             }
265 32         48 $sym->{complement} = $comprise_lookup{$sym->{complement}};
266             }
267 4         9 AMBIG: for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         115  
268 62         70 my $sym = $self->{syms}->[$i];
269 62         84 my @complements = ();
270 62         68 for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
  246         386  
271 184         212 my $target = $sym->{comprise}->[$j]->{complement};
272 184 50       262 next AMBIG unless defined $target;
273 184         251 push(@complements, $target->{sym});
274             }
275 62         107 @complements = sort _symbol_cmp @complements;
276 62         100 my $complement = $comprise_lookup{join('', @complements)};
277 62 50       94 next AMBIG unless defined $complement;
278 62         99 $sym->{complement} = $complement;
279             }
280             # create lookup
281 4         6 my $both_case = ($self->{seen_lc} != $self->{seen_uc});
282 4         8 $self->{lookup} = {};
283 4         4 for (my $i = 0; $i < scalar(@{$self->{syms}}); $i++) {
  98         146  
284 94         98 my $sym = $self->{syms}->[$i];
285 94         161 &_add_lookup($self->{lookup}, $sym, $sym->{sym}, $both_case);
286 94         103 for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
  106         181  
287 12         28 &_add_lookup($self->{lookup}, $sym, $sym->{aliases}->[$j], $both_case);
288             }
289             }
290 4         8 $self->{comprise_lookup} = \%comprise_lookup;
291 4         4 $self->{fully_complementable} = $fully_complementable;
292             # create regular expressions to match something that isn't a valid symbol
293 4         7 my $all_symbols = quotemeta(join('', (keys %{$self->{lookup}})));
  4         25  
294 4         95 $self->{re_not_valid} = qr/[^$all_symbols]/;
295 4         9 my $ambig_symbols = '';
296 4         8 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         105  
297 62         70 my $entry = $self->{syms}->[$i];
298 62         66 my $prime_sym = $entry->{sym};
299 62         67 $ambig_symbols .= $prime_sym;
300 62 100 66     119 if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
301 22         23 my $prime_sym_copy = $prime_sym; # copy
302 22         29 $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
303 22         27 $ambig_symbols .= $prime_sym_copy;
304             }
305 62         65 foreach my $alias_sym (@{$entry->{aliases}}) {
  62         95  
306 8         8 $ambig_symbols .= $alias_sym;
307 8 100 100     23 if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
308 2         4 my $alias_sym_copy = $alias_sym; # copy
309 2         2 $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
310 2         3 $ambig_symbols .= $alias_sym_copy;
311             }
312             }
313             }
314 4         32 $self->{re_ambig} = qr/[\Q$ambig_symbols\E]/;
315 4         45 $self->{re_ambig_only} = qr/^[\Q$ambig_symbols\E]*$/;
316             #$ambig_symbols = quotemeta($ambig_symbols);
317             #$self->{re_ambig} = qr/[$ambig_symbols]/;
318             #$self->{re_ambig_only} = qr/^[$ambig_symbols]*$/;
319              
320             # create translatation strings
321             # Strings to translate aliases of core symbols to prime representation
322 4         11 my ($tr_src_core_alias_to_prime, $tr_dst_core_alias_to_prime) = ('', '');
323 4         10 for (my $i = 0; $i < $self->{ncore}; $i++) {
324 32         45 my $entry = $self->{syms}->[$i];
325 32         35 my $prime_sym = $entry->{sym};
326             # alternate case of primary symbol?
327 32 100 66     53 if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
328 8         13 my $prime_sym_copy = $prime_sym; # copy
329 8         10 $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
330 8         9 $tr_src_core_alias_to_prime .= $prime_sym_copy;
331 8         10 $tr_dst_core_alias_to_prime .= $prime_sym;
332             }
333             # alias symbols
334 32         35 foreach my $alias_sym (@{$entry->{aliases}}) {
  32         59  
335 4         6 $tr_src_core_alias_to_prime .= $alias_sym;
336 4         5 $tr_dst_core_alias_to_prime .= $prime_sym;
337             # alternate case of alias symbol?
338 4 100 66     14 if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
339 2         2 my $alias_sym_copy = $alias_sym; # copy
340 2         4 $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
341 2         2 $tr_src_core_alias_to_prime .= $alias_sym_copy;
342 2         5 $tr_dst_core_alias_to_prime .= $prime_sym;
343             }
344             }
345             }
346 4         9 $tr_src_core_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
347 4         6 $tr_dst_core_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
348 4         7 $self->{tr_src_core_alias_to_prime} = $tr_src_core_alias_to_prime;
349 4         6 $self->{tr_dst_core_alias_to_prime} = $tr_dst_core_alias_to_prime;
350             # Strings to translate aliases of ambiguous symbols to prime representation
351 4         6 my ($tr_src_ambig_alias_to_prime, $tr_dst_ambig_alias_to_prime) = ('', '');
352 4         9 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         96  
353 62         71 my $entry = $self->{syms}->[$i];
354 62         64 my $prime_sym = $entry->{sym};
355             # alternate case of primary symbol?
356 62 100 66     399 if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
357 22         28 my $prime_sym_copy = $prime_sym; # copy
358 22         24 $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
359 22         25 $tr_src_ambig_alias_to_prime .= $prime_sym_copy;
360 22         24 $tr_dst_ambig_alias_to_prime .= $prime_sym;
361             }
362             # alias symbols
363 62         64 foreach my $alias_sym (@{$entry->{aliases}}) {
  62         95  
364 8         11 $tr_src_ambig_alias_to_prime .= $alias_sym;
365 8         8 $tr_dst_ambig_alias_to_prime .= $prime_sym;
366             # alternate case of alias symbol?
367 8 100 100     24 if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
368 2         3 my $alias_sym_copy = $alias_sym;
369 2         5 $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/;
370 2         3 $tr_src_ambig_alias_to_prime .= $alias_sym_copy;
371 2         3 $tr_dst_ambig_alias_to_prime .= $prime_sym;
372             }
373             }
374             }
375 4         8 $tr_src_ambig_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
376 4         7 $tr_dst_ambig_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
377 4         6 $self->{tr_src_ambig_alias_to_prime} = $tr_src_ambig_alias_to_prime;
378 4         5 $self->{tr_dst_ambig_alias_to_prime} = $tr_dst_ambig_alias_to_prime;
379             # Strings to translate ambiguous symbols and their aliases to the wildcard
380 4         7 my ($tr_src_ambig_to_wild, $tr_dst_ambig_to_wild) = ('', '');
381 4         6 my $wildcard = $self->{syms}->[$self->{ncore}]->{sym};
382 4         7 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         104  
383 62         69 my $entry = $self->{syms}->[$i];
384 62         66 my $prime_sym = $entry->{sym};
385             # primary symbol to wildcard (unless it is the wildcard)
386 62 100       86 if ($i != $self->{ncore}) {
387 58         62 $tr_src_ambig_to_wild .= $prime_sym;
388 58         62 $tr_dst_ambig_to_wild .= $wildcard;
389             }
390             # alternate case of primary symbol?
391 62 100 66     123 if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
392 22         28 my $prime_sym_copy = $prime_sym;
393 22         28 $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/;
394 22         23 $tr_src_ambig_to_wild .= $prime_sym_copy;
395 22         25 $tr_dst_ambig_to_wild .= $wildcard;
396             }
397             # alias symbols
398 62         65 foreach my $alias_sym (@{$entry->{aliases}}) {
  62         92  
399 8         10 $tr_src_ambig_to_wild .= $alias_sym;
400 8         9 $tr_dst_ambig_to_wild .= $wildcard;
401             # alternate case of alias symbol?
402 8 100 100     20 if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
403 2         2 my $alias_sym_copy = $alias_sym;
404 2         3 $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/;
405 2         2 $tr_src_ambig_to_wild .= $alias_sym_copy;
406 2         4 $tr_dst_ambig_to_wild .= $wildcard;
407             }
408             }
409             }
410 4         8 $tr_src_ambig_to_wild =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
411 4         4 $tr_dst_ambig_to_wild =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
412 4         8 $self->{tr_src_ambig_to_wild} = $tr_src_ambig_to_wild;
413 4         4 $self->{tr_dst_ambig_to_wild} = $tr_dst_ambig_to_wild;
414             # Strings to translate unknown symbols to the wildcard
415 4         6 my ($tr_src_unknown_to_wild, $tr_dst_unknown_to_wild) = ('', '');
416 4         8 my $range_start = 0;
417 4         8 for (my $i = 33; $i <= 126; $i++) {
418 376 100       638 if (defined($self->{lookup}->{chr($i)})) {
419 140 100       217 if ($i > ($range_start + 1)) {
    100          
420 38         78 $tr_src_unknown_to_wild .= sprintf("\\%03o-\\%03o", $range_start, $i - 1);
421             } elsif ($i == ($range_start + 1)) {
422 12         18 $tr_src_unknown_to_wild .= sprintf("\\%03o", $range_start);
423             }
424 140         207 $range_start = $i + 1;
425             }
426             }
427 4         7 $tr_src_unknown_to_wild .= sprintf("\\%03o-\\377", $range_start);
428 4 50 33     18 $tr_dst_unknown_to_wild = ($wildcard eq '-' || $wildcard eq '/' ? "\\" : "") .$wildcard; # tr will extend using the last character
429 4         8 $self->{tr_src_unknown_to_wild} = $tr_src_unknown_to_wild;
430 4         7 $self->{tr_dst_unknown_to_wild} = $tr_dst_unknown_to_wild;
431             # check 'like' a standard alphabet
432 4 50       8 if ($self->{like}) {
433 4         6 my ($req_core, $req_comp, $name);
434 4 50       9 if ($self->{like} eq 'RNA') {
    50          
    0          
435 0         0 $req_core = 'ACGU';
436 0         0 $name = 'RNA';
437             } elsif ($self->{like} eq 'DNA') {
438 4         8 $req_core = 'ACGT';
439 4         5 $req_comp = 'TGCA';
440 4         5 $name = 'DNA';
441             } elsif ($self->{like} eq 'PROTEIN') {
442 0         0 $req_core = 'ACDEFGHIKLMNPQRSTVWY';
443 0         0 $name = 'protein';
444             }
445 4 50       6 if (defined $req_core) {
446 4         9 for (my $i = 0; $i < length($req_core); $i++) {
447 16         21 my $sym = substr($req_core, $i, 1);
448 16         22 my $symobj = $self->{lookup}->{$sym};
449 16 50       23 unless (defined $symobj) {
450 0         0 $self->_error("not $name like - expected symbol '$sym'");
451             }
452 16 50       39 unless ($symobj->{index} < $self->{ncore}) {
453 0         0 $self->_error("not $name like - symbol '$sym' must be a core symbol");
454             }
455 16 50       22 if (defined $req_comp) {
456 16         26 my $comp = substr($req_comp, $i, 1);
457 16         20 my $compobj = $self->{lookup}->{$comp};
458 16 50 0     36 if (not defined $compobj || not defined $symobj->{complement} || $symobj->{complement} != $compobj) {
      33        
459 0         0 $self->_error("not $name like - symbol '$sym' complement must be '$comp'");
460             }
461             } else {
462             #if (defined $symobj->{complement}) {
463             # $self->_error("not $name like - symbol '$sym' must not have complement");
464             #}
465             }
466             }
467             }
468             }
469             # done
470 4         16 $self->{parsed} = 1; # TRUE
471             }
472              
473             sub parse_file {
474 0     0 0 0 my $self = shift;
475 0 0       0 die("Already finished parsing") if $self->{parsed};
476 0         0 my ($file) = @_;
477 0         0 $self->{file_name} = $file;
478 0         0 my ($fh, $line);
479 0         0 sysopen($fh, $file, O_RDONLY);
480 0         0 while($line = <$fh>) {
481 0         0 chomp($line);
482 0         0 $self->parse_line($line);
483             }
484 0         0 close($fh);
485 0         0 $self->parse_done();
486             }
487              
488             sub _sym_to_text {
489 0     0   0 my ($sym) = @_;
490 0         0 my $name_info = '';
491 0 0       0 if (defined($sym->{name})) {
492 0         0 $name_info = ' "' . &_encode_JSON_string($sym->{name}) . '"';
493             }
494 0         0 my $colour_info = '';
495 0 0       0 if (defined($sym->{colour})) {
496 0         0 $colour_info = sprintf(" %.06X", $sym->{colour});
497             }
498 0         0 return $sym->{sym} . $name_info . $colour_info;
499             }
500              
501             sub to_text {
502 0     0 0 0 my $self = shift;
503 0         0 my %options = @_;
504 0         0 my $out = '';
505 0         0 my $gap = undef;
506             # set option defaults
507 0 0       0 $options{print_header} = 1 unless defined $options{print_header};
508 0 0       0 $options{print_footer} = 1 unless defined $options{print_footer};
509             # output header
510 0 0       0 if ($options{print_header}) {
511             $out .= "ALPHABET " . $self->name(1) .
512 0 0       0 ($self->{like} ? ' ' . $self->{like} . '-LIKE' : '') . "\n";
513             }
514             # output all paired core symbols
515 0         0 for (my $i = 0; $i < $self->{ncore}; $i++) {
516 0         0 my $sym = $self->{syms}->[$i];
517 0         0 my $csym = $sym->{complement};
518 0 0 0     0 if (defined($csym) && $sym->{index} < $csym->{index}) {
519 0         0 $out .= &_sym_to_text($sym) . ' ~ ' . &_sym_to_text($csym) . "\n";
520             }
521             }
522             # output all unpared core symbols
523 0         0 for (my $i = 0; $i < $self->{ncore}; $i++) {
524 0         0 my $sym = $self->{syms}->[$i];
525 0 0       0 if (!defined($sym->{complement})) {
526 0         0 $out .= &_sym_to_text($sym) . "\n";
527             }
528             }
529             # output ambiguous symbols
530 0         0 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  0         0  
531 0         0 my $sym = $self->{syms}->[$i];
532 0 0       0 if (scalar(@{$sym->{comprise}}) == 0) {
  0         0  
533 0         0 $gap = $sym;
534 0         0 next;
535             }
536 0         0 my $comprise = '';
537 0         0 for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
  0         0  
538 0         0 $comprise .= $sym->{comprise}->[$j]->{sym};
539             }
540 0         0 $out .= &_sym_to_text($sym) . ' = ' . $comprise . "\n";
541 0         0 for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
  0         0  
542 0         0 $out .= $sym->{aliases}->[$j] . ' = ' . $comprise . "\n";
543             }
544             }
545             # output core symbol aliases
546 0         0 for (my $i = 0; $i < $self->{ncore}; $i++) {
547 0         0 my $sym = $self->{syms}->[$i];
548 0         0 for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
  0         0  
549 0         0 $out .= $sym->{aliases}->[$j] . ' = ' . $sym->{sym} . "\n";
550             }
551             }
552             # output gap symbol and aliases
553 0 0       0 if ($gap) {
554 0         0 $out .= &_sym_to_text($gap) . " =\n";
555 0         0 for (my $j = 0; $j < scalar(@{$gap->{aliases}}); $j++) {
  0         0  
556 0         0 $out .= $gap->{aliases}->[$j] . " =\n";
557             }
558             }
559             # output footer
560 0 0       0 if ($options{print_footer}) {
561 0         0 $out .= "END ALPHABET\n";
562             }
563 0         0 return $out;
564             }
565              
566             sub to_json() {
567 0     0 0 0 my $self = shift;
568 0         0 my ($jw) = @_;
569 0         0 $jw->start_object_value();
570 0         0 $jw->str_prop("name", $self->{name});
571 0 0       0 $jw->str_prop("like", lc($self->{like})) if ($self->{like});
572 0         0 $jw->num_prop("ncore", $self->{ncore});
573 0         0 $jw->property("symbols");
574 0         0 $jw->start_array_value();
575 0         0 for (my $i = 0; $i < scalar(@{$self->{syms}}); $i++) {
  0         0  
576 0         0 my $sym = $self->{syms}->[$i];
577 0         0 $jw->start_object_value();
578 0         0 $jw->str_prop("symbol", $sym->{sym});
579 0 0       0 $jw->str_prop("aliases", join("", @{$sym->{aliases}})) if (scalar(@{$sym->{aliases}}) > 0);
  0         0  
  0         0  
580 0 0       0 $jw->str_prop("name", $sym->{name}) if (defined($sym->{name}));
581 0 0 0     0 $jw->str_prop("colour", sprintf("%06X", $sym->{colour})) if (defined($sym->{colour}) && $sym->{colour} > 0);
582 0 0       0 if ($i < $self->{ncore}) {
583 0 0       0 $jw->str_prop("complement", $sym->{complement}->{sym}) if (defined($sym->{complement}));
584             } else {
585 0         0 my $comprise = '';
586 0         0 for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
  0         0  
587 0         0 $comprise .= $sym->{comprise}->[$j]->{sym};
588             }
589 0         0 $jw->str_prop("equals", $comprise);
590             }
591 0         0 $jw->end_object_value();
592             }
593 0         0 $jw->end_array_value();
594 0         0 $jw->end_object_value();
595             }
596              
597             sub has_errors {
598 4     4 0 6 my $self = shift;
599 4         5 return scalar(@{$self->{errors}});
  4         11  
600             }
601              
602             sub get_errors {
603 0     0 0 0 my $self = shift;
604 0         0 return @{$self->{errors}};
  0         0  
605             }
606              
607             sub name {
608 0     0 0 0 my $self = shift;
609 0         0 my ($should_encode) = @_;
610 0 0       0 if ($should_encode) {
611 0         0 return '"' . &_encode_JSON_string($self->{name}) . '"';
612             } else {
613 0         0 return $self->{name};
614             }
615             }
616              
617             sub has_complement {
618 5     5 0 7 my $self = shift;
619 5 50       10 die("Not finished parsing") unless $self->{parsed};
620 5         11 return $self->{fully_complementable};
621             }
622              
623             sub size_core {
624 8     8 0 10 my $self = shift;
625 8         23 return $self->{ncore};
626             }
627              
628             sub size_full {
629 8     8 0 9 my $self = shift;
630 8         8 return scalar(@{$self->{syms}});
  8         16  
631             }
632              
633             #
634             # Return the input list in list context
635             # and the flattened list in scalar context.
636             #
637             sub _flatten_list_in_scalar_context {
638 8 100   8   13 if (wantarray) {
639 7         27 return @_;
640             } else {
641 1         8 return join('' , @_);
642             }
643             }
644              
645             #
646             # Returns the concatenation of all the
647             # possible ambiguity codes of the alphabet.
648             # Returns a list in list context, otherwise a scalar.
649             #
650             sub get_ambig_syms {
651 0     0 0 0 my $self = shift;
652 0         0 my @ambig_syms;
653              
654 0         0 foreach my $sym ($self->get_syms()) {
655 0 0       0 unless ($self->is_core($sym)) {
656 0         0 push(@ambig_syms, $sym);
657             }
658             }
659              
660 0         0 return _flatten_list_in_scalar_context(@ambig_syms);
661             }
662              
663             #
664             # Returns the concatenation of all the possible symbols
665             # of the alphabet.
666             # Returns a list in list context, otherwise a scalar.
667             #
668             sub get_syms {
669 4     4 0 5 my $self = shift;
670 4         5 my @syms;
671              
672 4         7 foreach my $symbol (@{$self->{syms}}) {
  4         8  
673 77         105 push(@syms, $symbol->{sym});
674             }
675              
676 4         11 return _flatten_list_in_scalar_context(@syms);
677             }
678              
679             #
680             # Returns the concatenation of all the core symbols
681             # of the alphabet.
682             # Returns a list in list context, otherwise a scalar.
683             #
684             sub get_core {
685 4     4 0 3468 my $self = shift;
686 4         6 my @core_syms;
687              
688 4         11 foreach my $sym ($self->get_syms()) {
689 77 100       137 if ($self->is_core($sym)) {
690 24         32 push(@core_syms, $sym);
691             }
692             }
693              
694 4         10 return _flatten_list_in_scalar_context(@core_syms);
695             }
696              
697             #
698             # Returns a dictionary containing the core symbols that
699             # each ambiguity code of the alphabet is composed of.
700             #
701             sub get_alternates_dict {
702 0     0 0 0 my $self = shift;
703 0         0 my %alternates;
704 0         0 foreach my $ambig_base (split(//, $self->get_ambig_syms())) {
705 0         0 %alternates = (%alternates, %{$self->comprise($ambig_base)});
  0         0  
706             }
707 0         0 return %alternates;
708             }
709              
710             #
711             # Is the symbol part of the core alphabet.
712             #
713             sub is_core {
714 77     77 0 80 my $self = shift;
715 77         99 my ($letter) = @_;
716 77         95 my $sym = $self->{lookup}->{$letter};
717 77   66     229 return defined($sym) && $sym->{index} < $self->{ncore};
718             }
719              
720             #
721             # Is the symbol part of the alphabet.
722             #
723             sub is_known {
724 0     0 0 0 my $self = shift;
725 0         0 my ($letter) = @_;
726 0         0 my $sym = $self->{lookup}->{$letter};
727 0         0 return defined($sym);
728             }
729              
730             #
731             # Does the sequence only contain ambiguous symbols.
732             # This might be useful if you want to throw away something containing only
733             # ambiguous symbols as that has no real useful information.
734             #
735             sub is_ambig_seq {
736 0     0 0 0 my $self = shift;
737 0         0 my ($seq) = @_;
738 0         0 my $ambig_only_re = $self->{re_ambig_only};
739              
740 0         0 return scalar($seq =~ m/$ambig_only_re/);
741             }
742              
743             #
744             # Does the sequence contain ambiguous symbols.
745             #
746             sub is_part_ambig_seq {
747 0     0 0 0 my $self = shift;
748 0         0 my ($seq) = @_;
749 0         0 my $ambig_re = $self->{re_ambig};
750              
751 0         0 return scalar($seq =~ m/$ambig_re/);
752             }
753              
754             sub char {
755 0     0 0 0 my $self = shift;
756 0         0 my ($index) = @_;
757              
758 0 0 0     0 return if ($index < 0 || $index > scalar(@{$self->{syms}}));
  0         0  
759              
760 0         0 return $self->{syms}->[$index]->{sym};
761             }
762              
763             #
764             # Converts a symbol into the index in the alphabet.
765             #
766             sub index {
767 0     0 0 0 my $self = shift;
768 0         0 my ($letter) = @_;
769 0         0 my $sym = $self->{lookup}->{$letter};
770              
771 0 0       0 return unless defined $sym;
772              
773 0         0 return $sym->{index};
774             }
775              
776             #
777             # Converts a symbol that could be an alias into the primary symbol.
778             #
779             sub encode {
780 0     0 0 0 my $self = shift;
781 0         0 my ($letter) = @_;
782 0         0 my $sym = $self->{lookup}->{$letter};
783              
784 0 0       0 return unless defined $sym;
785 0         0 return $sym->{sym};
786             }
787              
788             #
789             # Takes a letter and returns the set of comprising core symbols.
790             #
791             sub comprise {
792 0     0 0 0 my $self = shift;
793 0         0 my ($letter) = @_;
794 0         0 my $sym = $self->{lookup}->{$letter};
795 0         0 my %set = ();
796 0 0       0 if (defined($sym->{comprise})) {
797 0         0 for (my $i = 0; $i < scalar(@{$sym->{comprise}}); $i++) {
  0         0  
798 0         0 $set{$sym->{comprise}->[$i]->{sym}} = 1;
799             }
800             } else {
801 0         0 $set{$sym->{sym}} = 1;
802             }
803 0         0 return \%set;
804             }
805              
806             #
807             # Converts a set of core symbols into either a matching ambigous symbol or
808             # a group of core symbols.
809             #
810             sub regex {
811 0     0 0 0 my $self = shift;
812 0         0 my ($set) = @_;
813 0         0 my $key = join('', sort _symbol_cmp keys %{$set});
  0         0  
814 0         0 my $sym = $self->{comprise_lookup}->{$key};
815 0 0       0 if (defined($sym)) {
816 0         0 return $sym->{sym};
817             } else {
818 0         0 return '['.$key.']';
819             }
820             }
821              
822             #
823             # Takes a set of core symbols and creates a set containing the other core symbols.
824             #
825             sub negate_set {
826 0     0 0 0 my $self = shift;
827 0         0 my ($set) = @_;
828 0         0 my %negated_set = ();
829              
830 0         0 for (my $i = 0; $i < $self->{ncore}; $i++) {
831 0         0 my $a = $self->{syms}->[$i]->{sym};
832 0 0       0 if (!($set->{$a})) {
833 0         0 $negated_set{$a} = 1;
834             }
835             }
836              
837 0         0 return \%negated_set;
838             }
839              
840             #
841             # Finds the first unknown symbol.
842             #
843             sub find_unknown {
844 0     0 0 0 my $self = shift;
845 0         0 my ($seq) = @_;
846 0         0 my $not_valid = $self->{re_not_valid};
847              
848 0 0       0 if ($seq =~ m/($not_valid)/) {
849 0         0 return ($-[1], $1);
850             } else {
851 0         0 return;
852             }
853             }
854              
855             #
856             # Reverse complements a sequence using the alphabet.
857             #
858             sub rc_seq {
859 1     1 0 2 my $self = shift;
860 1 50       3 croak("Alphabet does not allow complementing.") unless $self->has_complement();
861 1         3 my ($sequence) = @_;
862 1         6 my @seq = split //, $sequence;
863              
864 1         3 @seq = reverse @seq;
865 1         2 @seq = map { $self->{lookup}->{$_}->{complement}->{sym} } @seq;
  5         11  
866              
867 1         7 return join('', @seq);
868             }
869              
870             #
871             # Translate a sequence using the alphabet.
872             #
873             # NO_ALIASES Convert any alias symbols into the main symbol.
874             # NO_AMBIGS Convert any ambiguous symbols into the main wildcard symbol.
875             # NO_UNKNOWN Convert any unknown symbols into the main wildcard symbol.
876             #
877             sub translate_seq {
878 0     0 0 0 my $self = shift;
879 0         0 my ($sequence, %options) = @_;
880 0 0       0 my $no_aliases = (defined($options{NO_ALIASES}) ? $options{NO_ALIASES} : 0);
881 0 0       0 my $no_ambigs = (defined($options{NO_AMBIGS}) ? $options{NO_AMBIGS} : 0);
882 0 0       0 my $no_unknown = (defined($options{NO_UNKNOWN}) ? $options{NO_UNKNOWN} : 0);
883 0         0 my ($src, $dst) = ('', '');
884 0 0       0 if ($no_aliases) {
    0          
885 0         0 $src .= $self->{tr_src_core_alias_to_prime};
886 0         0 $dst .= $self->{tr_dst_core_alias_to_prime};
887 0 0       0 if ($no_ambigs) {
888 0         0 $src .= $self->{tr_src_ambig_to_wild};
889 0         0 $dst .= $self->{tr_dst_ambig_to_wild};
890             } else {
891 0         0 $src .= $self->{tr_src_ambig_alias_to_prime};
892 0         0 $dst .= $self->{tr_dst_ambig_alias_to_prime};
893             }
894             } elsif ($no_ambigs) {
895 0         0 $src .= $self->{tr_src_ambig_to_wild};
896 0         0 $dst .= $self->{tr_dst_ambig_to_wild};
897             }
898 0 0       0 if ($no_unknown) {
899 0         0 $src .= $self->{tr_src_unknown_to_wild};
900 0         0 $dst .= $self->{tr_dst_unknown_to_wild};
901             }
902              
903 0         0 eval {$sequence =~ tr/$src/$dst/};
  0         0  
904              
905 0         0 return $sequence;
906             }
907              
908             #
909             # Returns a scalar of Weblogo 3 colour arguments:
910             # --color COLOR SYMBOLS DESCRIPTION ...
911             # Currently only operates over core symbols.
912             sub get_Weblogo_colour_args {
913 2     2 0 4 my $self = shift;
914              
915 2         5 my $arg_string = "";
916              
917 2         5 foreach my $letter ($self->get_core()) {
918 16         21 my $sym = $self->{lookup}->{$letter};
919              
920             # convert back from decoded hex, adding '#' prefix
921 16         28 my $HTML_colour = sprintf("#%06X", $sym->{colour});
922              
923 16         38 $arg_string .= "--color '$HTML_colour' '$sym->{sym}' '$sym->{name}' ";
924             }
925              
926 2         11 return $arg_string;
927             }
928              
929             #
930             # Compare 2 different alphabet objects for equality
931             #
932             sub equals {
933 2     2 0 2 my $self = shift;
934 2         4 my ($other) = @_;
935 2 50       6 return 0 if ($self->size_core() != $other->size_core());
936 2 50       5 return 0 if ($self->size_full() != $other->size_full());
937 2 50       5 return 0 if ($self->has_complement() != $other->has_complement());
938 2 50       8 return 0 if (defined($self->{name}) != defined($other->{name}));
939 2 50 33     9 if (defined($self->{name}) && defined($other->{name})) {
940 2 50       5 return 0 if ($self->{name} ne $other->{name});
941             }
942 2         4 my $nsyms = $self->size_full();
943 2         6 for (my $i = 0; $i < $nsyms; $i++) {
944 47         55 my $self_sym = $self->{syms}->[$i];
945 47         54 my $other_sym = $other->{syms}->[$i];
946 47 50       69 return 0 if ($self_sym->{sym} ne $other_sym->{sym});
947 47 50       71 return 0 if (defined($self_sym->{name}) != defined($other_sym->{name}));
948 47 100 66     102 if (defined($self_sym->{name}) && defined($other_sym->{name})) {
949 46 50       67 return 0 if ($self_sym->{name} ne $other_sym->{name});
950             }
951 47 50       74 return 0 if (defined($self_sym->{colour}) != defined($other_sym->{colour}));
952 47 100 66     81 if (defined($self_sym->{colour}) && defined($other_sym->{colour})) {
953 16 50       28 return 0 if ($self_sym->{colour} != $other_sym->{colour});
954             }
955 47 50       47 return 0 if (scalar(@{$self_sym->{aliases}}) != scalar(@{$other_sym->{aliases}}));
  47         50  
  47         68  
956 47         50 my $naliases = scalar(@{$self_sym->{aliases}});
  47         60  
957 47         93 for (my $j = 0; $j < $naliases; $j++) {
958 6 50       14 return 0 if ($self_sym->{aliases}->[$j] ne $other_sym->{aliases}->[$j]);
959             }
960             }
961 2         4 $nsyms = $self->size_core();
962 2         4 for (my $i = 0; $i < $nsyms; $i++) {
963 16         18 my $self_sym = $self->{syms}->[$i];
964 16         20 my $other_sym = $other->{syms}->[$i];
965 16 50       24 return 0 if (defined($self_sym->{complement}) != defined($other_sym->{complement}));
966 16 50 33     37 if (defined($self_sym->{complement}) && defined($other_sym->{complement})) {
967 16 50       31 return 0 if ($self_sym->{complement}->{sym} ne $other_sym->{complement}->{sym});
968             }
969             }
970 2         4 $nsyms = $self->size_full();
971 2         3 for (my $i = $self->size_core(); $i < $nsyms; $i++) {
972 31         36 my $self_sym = $self->{syms}->[$i];
973 31         37 my $other_sym = $other->{syms}->[$i];
974 31 50       40 return 0 if (scalar(@{$self_sym->{comprise}}) != scalar(@{$other_sym->{comprise}}));
  31         40  
  31         41  
975 31         37 for (my $j = 0; $j < scalar(@{$self_sym->{comprise}}); $j++) {
  123         206  
976 92 50       147 return 0 if ($self_sym->{comprise}->[$j]->{sym} ne $other_sym->{comprise}->[$j]->{sym});
977             }
978             }
979 2         5 return 1;
980             }
981              
982             sub is_dna {
983 1     1 0 4 my $self = shift;
984 1 50       4 $self->{cached_is_dna} = $self->equals(&dna()) unless defined $self->{cached_is_dna};
985 1         23 return $self->{cached_is_dna};
986             }
987              
988             sub is_moddna {
989 1     1 0 6 my $self = shift;
990 1 50       5 $self->{cached_is_moddna} = $self->equals(&moddna()) unless defined $self->{cached_is_moddna};
991 1         14 return $self->{cached_is_moddna};
992             }
993              
994             sub is_rna {
995 0     0 0 0 my $self = shift;
996 0 0       0 $self->{cached_is_rna} = $self->equals(&rna()) unless defined $self->{cached_is_rna};
997 0         0 return $self->{cached_is_rna};
998             }
999              
1000             sub is_protein {
1001 0     0 0 0 my $self = shift;
1002 0 0       0 $self->{cached_is_protein} = $self->equals(&protein()) unless defined $self->{cached_is_protein};
1003 0         0 return $self->{cached_is_protein};
1004             }
1005              
1006             #
1007             # Helper function to decode a string encoded using the method defined in the JSON specification.
1008             #
1009             sub _decode_JSON_string {
1010 0     0   0 my ($text) = @_;
1011              
1012 0 0       0 return unless defined $text;
1013              
1014 0         0 $text =~ s/\\(["\\\/])/$1/g;
1015 0         0 $text =~ s/\\b/\x{8}/g;
1016 0         0 $text =~ s/\\f/\x{C}/g;
1017 0         0 $text =~ s/\\n/\x{A}/g;
1018 0         0 $text =~ s/\\r/\x{D}/g;
1019 0         0 $text =~ s/\\t/\x{9}/g;
1020 0         0 $text =~ s/\\u([0-9A-Fa-f]{4})/chr(hex($1))/ge;
  0         0  
1021              
1022 0         0 return $text;
1023             }
1024              
1025             #
1026             # Helper function to encode a string using the method defined in the JSON specification.
1027             #
1028             sub _encode_JSON_string {
1029 0     0   0 my ($text) = @_;
1030              
1031 0         0 $text =~ s/(["\\\/])/\\$1/g;
1032 0         0 $text =~ s/\x{8}/\\b/g;
1033 0         0 $text =~ s/\x{C}/\\f/g;
1034 0         0 $text =~ s/\x{A}/\\n/g;
1035 0         0 $text =~ s/\x{D}/\\r/g;
1036 0         0 $text =~ s/\x{9}/\\t/g;
1037 0         0 $text =~ s/([\x{0}-\x{1F}\x{7F}-\x{9F}\x{2028}\x{2029}])/sprintf("\\u%.04X",ord($1))/ge;
  0         0  
1038              
1039 0         0 return $text;
1040             }
1041              
1042             #
1043             # Helper function to convert the colour into a number.
1044             #
1045             sub _decode_colour {
1046 0     0   0 my ($text) = @_;
1047              
1048 0 0       0 return unless defined $text;
1049              
1050 0         0 return hex($text);
1051             }
1052              
1053             # _symbol_cmp
1054             # Sorts so letters are before numbers which are before symbols.
1055             # Otherwise sorts using normal string comparison.
1056             sub _symbol_cmp($$) { ## no critic
1057 526     526   772 my ($a, $b) = @_;
1058              
1059 526 100       755 if (length($a) == length($b)) {
    100          
1060 470         709 for (my $i = 0; $i < length($a); $i++) {
1061 504         582 my ($sym_a, $sym_b);
1062 504         610 $sym_a = ord(substr($a, $i, 1));
1063 504         529 $sym_b = ord(substr($b, $i, 1));
1064             # check to see if either is a letter
1065 504         556 my ($is_letter_a, $is_letter_b);
1066 504   100     1303 $is_letter_a = (($sym_a >= ord('A') && $sym_a <= ord('Z')) || ($sym_a >= ord('a') && $sym_a <= ord('z')));
1067 504   100     1223 $is_letter_b = (($sym_b >= ord('A') && $sym_b <= ord('Z')) || ($sym_b >= ord('a') && $sym_b <= ord('z')));
1068 504 100       671 if ($is_letter_a) {
    100          
1069 438 100       531 if ($is_letter_b) {
1070 394 100       569 if ($sym_a < $sym_b) {
    100          
1071 168         265 return -1;
1072             } elsif ($sym_b < $sym_a) {
1073 192         293 return 1;
1074             }
1075 34         61 next;
1076             } else {
1077 44         67 return -1;
1078             }
1079             } elsif ($is_letter_b) {
1080 18         25 return 1;
1081             }
1082              
1083             # check to see if either is a number
1084 48         55 my ($is_num_a, $is_num_b);
1085 48   33     97 $is_num_a = ($sym_a ge '0' && $sym_a le '9');
1086 48   33     97 $is_num_b = ($sym_b ge '0' && $sym_b le '9');
1087 48 50       63 if ($is_num_a) {
    0          
1088 48 50       61 if ($is_num_b) {
1089 48 100       74 if ($sym_a < $sym_b) {
    50          
1090 16         26 return -1;
1091             } elsif ($sym_b < $sym_a) {
1092 32         73 return 1;
1093             }
1094 0         0 next;
1095             } else {
1096 0         0 return -1;
1097             }
1098             } elsif ($is_num_b) {
1099 0         0 return 1;
1100             }
1101              
1102             # both must be symbols
1103 0 0       0 if ($sym_a < $sym_b) {
    0          
1104 0         0 return -1;
1105             } elsif ($sym_b < $sym_a) {
1106 0         0 return 1;
1107             }
1108             }
1109              
1110 0         0 return 0;
1111             } elsif (length($a) > length($b)) {
1112 14         22 return -1;
1113             } else {
1114 42         57 return 1;
1115             }
1116             }
1117              
1118             sub _symobj_cmp($$) { ## no critic
1119 216     216   289 my ($sym1, $sym2) = @_;
1120 216         220 my $ret;
1121              
1122             # compare comprise
1123 216 100 66     505 if (defined($sym1->{comprise}) && defined($sym2->{comprise})) {
    50          
    50          
1124 150         199 $ret = _symbol_cmp($sym1->{comprise}, $sym2->{comprise});
1125 150 50       248 return $ret if $ret != 0;
1126             } elsif (defined($sym1->{comprise})) {
1127 0         0 return 1; # sym2 first because it is a core symbol
1128             } elsif (defined($sym2->{comprise})) {
1129 0         0 return -1; # sym1 first because it is a core symbol
1130             }
1131              
1132             # compare symbol
1133 66         93 $ret = _symbol_cmp($sym1->{sym}, $sym2->{sym});
1134 66 50       106 return $ret if $ret != 0;
1135              
1136             # compare complement
1137 0 0 0     0 if (defined($sym1->{complement}) && defined($sym2->{complement})) {
    0          
    0          
1138 0         0 $ret = _symbol_cmp($sym1->{complement}, $sym2->{complement});
1139 0 0       0 return $ret if $ret != 0;
1140             } elsif (defined($sym1->{complement})) {
1141 0         0 return 1;
1142             } elsif (defined($sym2->{complement})) {
1143 0         0 return -1;
1144             }
1145              
1146             # compare aliases
1147 0 0       0 if (scalar(@{$sym1->{aliases}}) != scalar(@{$sym2->{aliases}})) {
  0         0  
  0         0  
1148             # sort length decending
1149 0         0 return scalar(@{$sym2->{aliases}}) <=> scalar(@{$sym1->{aliases}});
  0         0  
  0         0  
1150             }
1151 0         0 for (my $i = 0; $i < scalar(@{$sym1->{aliases}}); $i++) {
  0         0  
1152 0         0 $ret = _symbol_cmp($sym1->{aliases}->[$i], $sym2->{aliases}->[$i]);
1153 0 0       0 return $ret if $ret != 0;
1154             }
1155              
1156             # compare name
1157 0 0 0     0 if (defined($sym1->{name}) && defined($sym2->{name})) {
    0          
    0          
1158 0         0 $ret = $sym1->{name} cmp $sym2->{name};
1159 0 0       0 return $ret if $ret != 0;
1160             } elsif (defined($sym1->{name})) {
1161 0         0 return 1;
1162             } elsif (defined($sym2->{name})) {
1163 0         0 return -1;
1164             }
1165              
1166             # compare colour
1167 0 0 0     0 if (defined($sym1->{colour}) && defined($sym2->{colour})) {
    0          
    0          
1168 0         0 return $sym1->{colour} <=> $sym2->{colour};
1169             } elsif (defined($sym1->{colour})) {
1170 0         0 return 1;
1171             } elsif (defined($sym2->{colour})) {
1172 0         0 return -1;
1173             }
1174 0         0 return 0;
1175             }
1176              
1177             #
1178             # Return the RNA alphabet.
1179             #
1180             sub rna {
1181 0     0 0 0 my $alph = new __PACKAGE__;
1182              
1183 0         0 $alph->parse_header('RNA', 'RNA');
1184              
1185             # core symbols
1186 0         0 $alph->parse_symbol('A', 'Adenine', 0xCC0000);
1187 0         0 $alph->parse_symbol('C', 'Cytosine', 0x0000CC);
1188 0         0 $alph->parse_symbol('G', 'Guanine', 0xFFB300);
1189 0         0 $alph->parse_symbol('U', 'Uracil', 0x008000, undef, undef, 'T');
1190             # ambiguous symbols
1191 0         0 $alph->parse_symbol('W', 'Weak', undef, undef, 'AU');
1192 0         0 $alph->parse_symbol('S', 'Strong', undef, undef, 'CG');
1193 0         0 $alph->parse_symbol('M', 'Amino', undef, undef, 'AC');
1194 0         0 $alph->parse_symbol('K', 'Keto', undef, undef, 'GU');
1195 0         0 $alph->parse_symbol('R', 'Purine', undef, undef, 'AG');
1196 0         0 $alph->parse_symbol('Y', 'Pyrimidine', undef, undef, 'CU');
1197 0         0 $alph->parse_symbol('B', 'Not A', undef, undef, 'CGU');
1198 0         0 $alph->parse_symbol('D', 'Not C', undef, undef, 'AGU');
1199 0         0 $alph->parse_symbol('H', 'Not G', undef, undef, 'ACU');
1200 0         0 $alph->parse_symbol('V', 'Not U', undef, undef, 'ACG');
1201 0         0 $alph->parse_symbol('N', 'Any base', undef, undef, 'ACGU', 'X.');
1202             # for now treat '.' (gap) as a wildcard
1203              
1204             # process
1205 0         0 $alph->parse_done();
1206              
1207             # error check
1208 0 0       0 die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());
1209              
1210 0         0 return $alph;
1211             }
1212              
1213             #
1214             # Parse core symbols in the DNA alphabet.
1215             # This is done separately, to ease expansion.
1216             # All core symbols must be parsed before any ambiguous symbols.
1217             # Optionally accepts arguments to set colours (alphabetical order).
1218             #
1219             sub _parse_core_DNA_symbols {
1220 4     4   6 my $alph = shift;
1221              
1222 4   100     16 $$alph->parse_symbol('A', 'Adenine', shift || 0xCC0000, 'T');
1223 4   100     11 $$alph->parse_symbol('C', 'Cytosine', shift || 0x0000CC, 'G');
1224 4   100     14 $$alph->parse_symbol('G', 'Guanine', shift || 0xFFB300, 'C');
1225 4   100     12 $$alph->parse_symbol('T', 'Thymine', shift || 0x008000, 'A', undef, 'U');
1226             }
1227              
1228             #
1229             # Parse ambiguous symbols in the DNA alphabet.
1230             # This is done separately, to ease expansion.
1231             #
1232             sub _parse_ambiguous_DNA_symbols {
1233 4     4   6 my $alph = shift;
1234              
1235 4         9 $$alph->parse_symbol('W', 'Weak', undef, undef, 'AT');
1236 4         11 $$alph->parse_symbol('S', 'Strong', undef, undef, 'CG');
1237 4         9 $$alph->parse_symbol('M', 'Amino', undef, undef, 'AC');
1238 4         10 $$alph->parse_symbol('K', 'Keto', undef, undef, 'GT');
1239 4         9 $$alph->parse_symbol('R', 'Purine', undef, undef, 'AG');
1240 4         8 $$alph->parse_symbol('Y', 'Pyrimidine', undef, undef, 'CT');
1241 4         8 $$alph->parse_symbol('B', 'Not A', undef, undef, 'CGT');
1242 4         10 $$alph->parse_symbol('D', 'Not C', undef, undef, 'AGT');
1243 4         8 $$alph->parse_symbol('H', 'Not G', undef, undef, 'ACT');
1244 4         9 $$alph->parse_symbol('V', 'Not T', undef, undef, 'ACG');
1245 4         9 $$alph->parse_symbol('N', 'Any base', undef, undef, 'ACGT', 'X.');
1246             }
1247              
1248             #
1249             # Return the DNA alphabet.
1250             #
1251             sub dna {
1252 2     2 0 6 my $alph = new __PACKAGE__;
1253              
1254 2         5 $alph->parse_header('DNA', 'DNA');
1255              
1256 2         5 _parse_core_DNA_symbols(\$alph);
1257 2         4 _parse_ambiguous_DNA_symbols(\$alph);
1258              
1259             # for now treat '.' (gap) as a wildcard
1260             # process
1261 2         5 $alph->parse_done();
1262              
1263 2 50       7 die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());
1264              
1265 2         7 return $alph;
1266             }
1267              
1268             #
1269             # Return the DNA with cytosine modifications alphabet.
1270             #
1271             sub moddna {
1272 2     2 0 5 my $alph = new __PACKAGE__;
1273              
1274 2         5 $alph->parse_header('modDNA', 'DNA');
1275              
1276 2         6 _parse_core_DNA_symbols(\$alph, 0x8510A8, 0xA50026, 0x313695, 0xA89610);
1277              
1278             # additional core symbols
1279 2         5 $alph->parse_symbol('m', '5-Methylcytosine', 0xD73027, '1');
1280 2         10 $alph->parse_symbol('1', 'Guanine:5-Methylcytosine', 0x4575B4, 'm');
1281 2         6 $alph->parse_symbol('h', '5-Hydroxymethylcytosine', 0xF46D43, '2');
1282 2         4 $alph->parse_symbol('2', 'Guanine:5-Hydroxymethylcytosine', 0x74ADD1, 'h');
1283 2         5 $alph->parse_symbol('f', '5-Formylcytosine', 0xFDAE61, '3');
1284 2         4 $alph->parse_symbol('3', 'Guanine:5-Formylcytosine', 0xABD9E9, 'f');
1285 2         3 $alph->parse_symbol('c', '5-Carboxylcytosine', 0xFEE090, '4');
1286 2         5 $alph->parse_symbol('4', 'Guanine:5-Carboxylcytosine', 0xE0F3F8, 'c');
1287              
1288 2         8 _parse_ambiguous_DNA_symbols(\$alph);
1289              
1290             # additional ambiguous symbols
1291 2         5 $alph->parse_symbol('z', 'Any C mod', undef, undef, 'Cmhfc');
1292 2         5 $alph->parse_symbol('9', 'Guanine:any C mod', undef, undef, 'G1234');
1293 2         5 $alph->parse_symbol('y', 'C, not (hydroxy)methylcytosine', undef, undef, 'Cfc');
1294 2         24 $alph->parse_symbol('8', 'Guanine:C, not (hydroxy)methylcytosine', undef, undef, 'G34');
1295 2         5 $alph->parse_symbol('x', '(Hydroxy)methylcytosine', undef, undef, 'mh');
1296 2         5 $alph->parse_symbol('7', 'Guanine:(hydroxy)methylcytosine', undef, undef, '12');
1297 2         5 $alph->parse_symbol('w', '(Formyl/carboxyl)cytosine', undef, undef, 'fc');
1298 2         5 $alph->parse_symbol('6', 'Guanine:(formyl/carboxyl)cytosine', undef, undef, '34');
1299              
1300             # process
1301 2         5 $alph->parse_done();
1302              
1303 2 50       7 die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());
1304              
1305 2         9 return $alph;
1306             }
1307              
1308             #
1309             # Return the Protein alphabet.
1310             #
1311             sub protein {
1312 0     0 0   my $alph = new __PACKAGE__;
1313              
1314 0           $alph->parse_header('Protein', 'PROTEIN');
1315              
1316             # core symbols
1317 0           $alph->parse_symbol('A', 'Alanine', 0x0000CC);
1318 0           $alph->parse_symbol('R', 'Arginine', 0xCC0000);
1319 0           $alph->parse_symbol('N', 'Asparagine', 0x008000);
1320 0           $alph->parse_symbol('D', 'Aspartic acid', 0xFF00FF);
1321 0           $alph->parse_symbol('C', 'Cysteine', 0x0000CC);
1322 0           $alph->parse_symbol('E', 'Glutamic acid', 0xFF00FF);
1323 0           $alph->parse_symbol('Q', 'Glutamine', 0x008000);
1324 0           $alph->parse_symbol('G', 'Glycine', 0xFFB300);
1325 0           $alph->parse_symbol('H', 'Histidine', 0xFFCCCC);
1326 0           $alph->parse_symbol('I', 'Isoleucine', 0x0000CC);
1327 0           $alph->parse_symbol('L', 'Leucine', 0x0000CC);
1328 0           $alph->parse_symbol('K', 'Lysine', 0xCC0000);
1329 0           $alph->parse_symbol('M', 'Methionine', 0x0000CC);
1330 0           $alph->parse_symbol('F', 'Phenylalanine', 0x0000CC);
1331 0           $alph->parse_symbol('P', 'Proline', 0xFFFF00);
1332 0           $alph->parse_symbol('S', 'Serine', 0x008000);
1333 0           $alph->parse_symbol('T', 'Threonine', 0x008000);
1334 0           $alph->parse_symbol('W', 'Tryptophan', 0x0000CC);
1335 0           $alph->parse_symbol('Y', 'Tyrosine', 0x33E6CC);
1336 0           $alph->parse_symbol('V', 'Valine', 0x0000CC);
1337             # ambiguous symbols
1338 0           $alph->parse_symbol('B', 'Asparagine or Aspartic acid', undef, undef, 'ND');
1339 0           $alph->parse_symbol('Z', 'Glutamine or Glutamic acid', undef, undef, 'QE');
1340 0           $alph->parse_symbol('J', 'Leucine or Isoleucine', undef, undef, 'LI');
1341 0           $alph->parse_symbol('X', 'Any amino acid', undef, undef, 'ARNDCEQGHILKMFPSTWYV', '*.');
1342             # for now treat '*' (stop codon) and '.' (gap) as a wildcard
1343              
1344             # process
1345 0           $alph->parse_done();
1346              
1347             # error check
1348 0 0         die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());
1349              
1350 0           return $alph;
1351             }
1352              
1353             1;
1354             __END__