File Coverage

blib/lib/MEME/Alphabet.pm
Criterion Covered Total %
statement 460 812 56.6
branch 123 316 38.9
condition 60 113 53.1
subroutine 32 60 53.3
pod 0 42 0.0
total 675 1343 50.2


line stmt bran cond sub pod time code
1             =pod
2              
3             =head1 NAME
4              
5             MEME::Alphabet - Provides nucleobase alphabet capabilities for Perl code.
6              
7             =head1 AUTHORS
8              
9             James Johnson and Timothy L. Bailey.
10              
11             =head1 LICENSE
12              
13             Copyright
14             (1994 - 2017) The Regents of the University of California.
15             All Rights Reserved.
16              
17             Permission to use, copy, modify, and distribute any part of
18             this software for educational, research and non-profit purposes,
19             without fee, and without a written agreement is hereby granted,
20             provided that the above copyright notice, this paragraph and
21             the following three paragraphs appear in all copies.
22              
23             Those desiring to incorporate this software into commercial
24             products or use for commercial purposes should contact the
25             Technology Transfer Office, University of California, San Diego,
26             9500 Gilman Drive, La Jolla, California, 92093-0910,
27             Ph: (619) 534 5815.
28              
29             IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO
30             ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR
31             CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF
32             THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA
33             HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34              
35             THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE
36             UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE
37             MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
38             THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND
39             EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED,
40             INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
41             MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT
42             THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT,
43             TRADEMARK OR OTHER RIGHTS.
44              
45             =cut
46              
47             package MEME::Alphabet;
48              
49 2     2   51283 use strict;
  2         7  
  2         62  
50 2     2   26 use warnings;
  2         3  
  2         71  
51 2     2   12 use feature 'unicode_strings';
  2         10  
  2         194  
52              
53 2     2   12 use Carp;
  2         4  
  2         138  
54 2     2   13 use Fcntl qw(O_RDONLY);
  2         5  
  2         92  
55              
56             # pinned to the latest MEME Suite version (x.yy.z), and then versioned from there
57 2     2   610 use version; our $VERSION = version->declare("v4.12.0.1.2");
  2         4189  
  2         14  
58              
59             require Exporter;
60             our @ISA = qw(Exporter);
61             our @EXPORT_OK = qw(rna dna moddna protein);
62              
63             my $SYM_RE = qr/[A-Za-z0-9\?\.\*\-]/;
64             my $COLOUR_RE = qr/[0-9A-Fa-f]{6}/;
65             my $NAME_RE = qr/"((?:[^\\"]+|\\["\/bfnrt]|\\u[0-9A-Fa-f]{4})*)"/;
66             my $CORE_RE = qr/($SYM_RE)(?:\s+$NAME_RE)?(?:\s+($COLOUR_RE))?/;
67             my $COMMENT_RE = qr/^(\s*(?:[^#](?:[^"#]*|"(?:\\.|[^"\\]*)*")*)?)#.*$/;
68             my $HEADER_RE = qr/^\s*ALPHABET(?:\s+v1)?(?:\s+$NAME_RE)?(?:\s+(RNA|DNA|PROTEIN)-LIKE)?\s*$/;
69             my $CORE_SINGLE_RE = qr/^\s*$CORE_RE\s*$/;
70             my $CORE_PAIR_RE = qr/^\s*$CORE_RE\s*~\s*$CORE_RE\s*$/;
71             my $AMBIG_RE = qr/^\s*$CORE_RE\s*=\s*($SYM_RE*)\s*$/;
72              
73             sub new {
74 5     5 0 775 my $classname = shift;
75 5         15 my $self = {};
76 5         19 bless($self, $classname);
77 5         28 $self->_init(@_);
78 5         12 return $self;
79             }
80              
81             sub _init {
82 5     5   12 my $self = shift;
83 5         16 my ($file) = @_;
84 5         23 $self->{errors} = [];
85 5         16 $self->{file_name} = undef;
86 5         13 $self->{line_no} = 0;
87 5         13 $self->{seen_header} = 0; #FALSE
88 5         13 $self->{seen_symbol} = 0; #FALSE
89 5         13 $self->{seen_ambig} = 0; #FALSE
90 5         13 $self->{seen_lc} = 0; #FALSE
91 5         16 $self->{seen_uc} = 0; #FALSE
92 5         14 $self->{name} = undef;
93 5         15 $self->{sym_lookup} = {};
94 5         15 $self->{core_syms} = [];
95 5         14 $self->{ambig_syms} = [];
96 5         14 $self->{parsed} = 0; #FALSE
97 5 50       21 if (defined($file)) {
98 0         0 $self->parse_file($file);
99 0 0       0 if (@{$self->{errors}}) {
  0         0  
100 0         0 foreach my $error ( @{$self->{errors}} ) {
  0         0  
101 0         0 warn($error);
102             }
103 0         0 croak("Invalid alphabet file \"$file\"\n");
104             }
105             }
106             }
107              
108             sub _error {
109 0     0   0 my $self = shift;
110 0         0 my ($reason) = @_;
111 0         0 my $line_info = '';
112 0         0 my $file_info = '';
113 0 0       0 $line_info = 'on line ' . $self->{line_no} . ' ' if ($self->{line_no});
114 0 0       0 $file_info = 'file "' . $self->{file_name} . '"' if ($self->{file_name});
115 0         0 push(@{$self->{errors}}, "Invalid alphabet $file_info- $line_info$reason\n");
  0         0  
116             }
117              
118             sub parse_header {
119 4     4 0 12 my $self = shift;
120 4         15 my ($name, $like) = @_;
121 4 50       16 die("Already finished parsing") if $self->{parsed};
122 4 50       15 $self->_error('repeated header') if ($self->{seen_header});
123 4 50       14 $self->_error('header after symbol') if ($self->{seen_symbol});
124 4 50 33     74 $self->_error('like must be "DNA", "RNA" or "PROTEIN" (ignoring case)') if ($like && $like !~ m/^(RNA|DNA|PROTEIN)$/i);
125 4         13 $self->{name} = $name;
126 4 50       36 $self->{like} = ($like ? uc($like) : '');
127 4         13 $self->{seen_header} = 1; #TRUE
128             }
129              
130             sub uniq {
131 60     60 0 91 my %seen;
132 60         353 grep !$seen{$_}++, @_;
133             }
134              
135             sub parse_symbol {
136 92     92 0 142 my $self = shift;
137 92         197 my ($sym, $name, $colour, $complement, $comprise, $aliases) = @_;
138 92 50       185 die("Already finished parsing") if $self->{parsed};
139 92 0 33     170 $self->_error('expected header but found symbol') unless ($self->{seen_header} || $self->{seen_symbol});
140 92 50       194 if (defined($self->{sym_lookup}->{$sym})) {
141 0         0 $self->_error("symbol $sym is already used");
142 0         0 return; # can't really proceed with a duplicate
143             }
144 92 100 66     203 $self->{seen_lc} = 1 if ($sym ge 'a' && $sym le 'z');
145 92 100 100     280 $self->{seen_uc} = 1 if ($sym ge 'A' && $sym le 'Z');
146 92         312 my $symbol = {sym => $sym, aliases => [], name => $name, colour => $colour};
147 92 100       195 if (defined($aliases)) {
148 8         24 push(@{$symbol->{aliases}}, split(//, $aliases));
  8         36  
149             }
150 92 100       174 if (defined($comprise)) { # ambiguous symbol
151 60         157 my @equals = split //, $comprise;
152 60         183 @equals = sort _symbol_cmp @equals;
153 60         145 @equals = &uniq(@equals);
154 60         178 for (my $i = 0; $i < scalar(@equals); $i++) {
155 160         299 my $csym = $self->{sym_lookup}->{$equals[$i]};
156 160 50       485 if (!defined($csym)) {
    50          
157 0         0 $self->_error('referenced symbol ' . $equals[$i] . ' is unknown');
158             } elsif (defined($csym->{comprise})) {
159 0         0 $self->_error('referenced symbol ' . $equals[$i] . ' is ambiguous');
160             }
161             }
162 60         162 $symbol->{comprise} = join('', @equals);
163 60         91 push(@{$self->{ambig_syms}}, $symbol);
  60         133  
164 60         100 $self->{seen_ambig} = 1;
165 60 50 33     166 if ($sym eq '?' && length($symbol->{comprise}) < scalar(@{$self->{core_syms}})) {
  0         0  
166 0         0 $self->_error('symbol ? is reserved as a wildcard');
167             }
168             } else { # core symbol
169 32 50       64 $self->_error('symbol ? is reserved as a wildcard') if ($sym eq '?');
170 32 50       64 $self->_error('unexpected core symbol (as ambiguous symbols seen)') if ($self->{seen_ambig});
171 32         49 $symbol->{complement} = $complement;
172 32         44 push(@{$self->{core_syms}}, $symbol);
  32         70  
173             }
174 92         191 $self->{sym_lookup}->{$sym} = $symbol;
175 92         171 $self->{seen_symbol} = 1;
176             }
177              
178             sub parse_line {
179 0     0 0 0 my $self = shift;
180 0 0       0 die("Already finished parsing") if $self->{parsed};
181 0         0 my ($line) = @_;
182 0         0 $self->{line_no} += 1;
183 0         0 $line =~ s/$COMMENT_RE/$1/;
184 0 0       0 return unless ($line =~ m/\S/); # skip empty lines
185 0 0       0 if ($line =~ m/$HEADER_RE/) {
    0          
    0          
    0          
186 0         0 $self->parse_header($1, $2);
187             } elsif ($line =~ m/$CORE_PAIR_RE/) {
188 0         0 $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3), $4);
189 0         0 $self->parse_symbol($4, &_decode_JSON_string($5), &_decode_colour($6), $1);
190             } elsif ($line =~ m/$CORE_SINGLE_RE/) {
191 0         0 $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3));
192             } elsif ($line =~ m/$AMBIG_RE/) {
193 0         0 $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3), undef, $4);
194             } else {
195 0         0 $self->_error('unrecognised pattern');
196             }
197             }
198              
199             sub _add_lookup {
200 106     106   208 my ($lookup, $target, $character, $both_case) = @_;
201 106         241 $lookup->{$character} = $target;
202 106 100       203 if ($both_case) {
203 36 100 66     176 if ($character ge 'A' && $character le 'Z') {
    50 33        
204 34         143 $lookup->{chr(ord($character) + 32)} = $target;
205             } elsif ($character ge 'a' && $character le 'z') {
206 0         0 $lookup->{chr(ord($character) - 32)} = $target;
207             }
208             }
209             }
210              
211             sub parse_done {
212 4     4 0 10 my $self = shift;
213 4 50       17 die("Already finished parsing") if $self->{parsed};
214             # now sort the symbols
215 4         9 @{$self->{core_syms}} = sort _symobj_cmp @{$self->{core_syms}};
  4         20  
  4         23  
216 4         11 @{$self->{ambig_syms}} = sort _symobj_cmp @{$self->{ambig_syms}};
  4         18  
  4         17  
217             # check if there is a wildcard
218 4 100 66     11 if (scalar(@{$self->{ambig_syms}}) == 0 || length($self->{ambig_syms}->[0]->{comprise}) != scalar(@{$self->{core_syms}})) {
  4         23  
  4         28  
219             #warn("Warning: wildcard symbol automatically generated\n");
220 2         5 my $wildcard_comprise = '';
221 2         4 for (my $i = 0; $i < scalar(@{$self->{core_syms}}); $i++) {
  26         43  
222 24         36 $wildcard_comprise .= $self->{core_syms}->[$i]->{sym};
223             }
224 2         4 unshift(@{$self->{ambig_syms}}, {sym => '?', aliases => [], comprise => $wildcard_comprise});
  2         13  
225             }
226             # create the final list of symbols and determine aliases
227 4         14 my %comprise_lookup = ();
228 4         6 $self->{ncore} = scalar(@{$self->{core_syms}});
  4         16  
229 4         19 $self->{syms} = [];
230 4         21 for (my $i = 0; $i < $self->{ncore}; $i++) {
231 32         53 my $sym = $self->{core_syms}->[$i];
232 32         63 $comprise_lookup{$sym->{sym}} = $sym;
233 32         44 $sym->{index} = scalar(@{$self->{syms}});
  32         55  
234 32         43 push(@{$self->{syms}}, $sym);
  32         87  
235             }
236 4         9 for (my $i = 0; $i < scalar(@{$self->{ambig_syms}}); $i++) {
  66         144  
237 62         96 my $sym = $self->{ambig_syms}->[$i];
238 62         98 my $real = $comprise_lookup{$sym->{comprise}};
239 62 50       105 if (defined($real)) {
240 0         0 push(@{$real->{aliases}}, $sym->{sym});
  0         0  
241             } else {
242 62         155 $comprise_lookup{$sym->{comprise}} = $sym;
243 62         81 $sym->{index} = scalar(@{$self->{syms}});
  62         112  
244 62         87 push(@{$self->{syms}}, $sym);
  62         133  
245             }
246             }
247             # cleanup some entries we don't need
248 4         27 delete $self->{sym_lookup};
249 4         11 delete $self->{core_syms};
250 4         10 delete $self->{ambig_syms};
251             # map comprising characters to objects
252 4         11 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         145  
253 62         99 my $sym = $self->{syms}->[$i];
254 62         145 my @list = map { $comprise_lookup{$_} } split(//, $sym->{comprise});
  184         335  
255 62         152 $sym->{comprise} = \@list;
256             }
257             #determine complements
258 4         11 my $fully_complementable = 1;
259 4         18 for (my $i = 0; $i < $self->{ncore}; $i++) {
260 32         48 my $sym = $self->{syms}->[$i];
261 32 50       60 unless (defined($sym->{complement})) {
262 0         0 $fully_complementable = 0;
263 0         0 next;
264             }
265 32         77 $sym->{complement} = $comprise_lookup{$sym->{complement}};
266             }
267 4         15 AMBIG: for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         173  
268 62         111 my $sym = $self->{syms}->[$i];
269 62         153 my @complements = ();
270 62         99 for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
  246         556  
271 184         303 my $target = $sym->{comprise}->[$j]->{complement};
272 184 50       305 next AMBIG unless defined $target;
273 184         356 push(@complements, $target->{sym});
274             }
275 62         167 @complements = sort _symbol_cmp @complements;
276 62         154 my $complement = $comprise_lookup{join('', @complements)};
277 62 50       121 next AMBIG unless defined $complement;
278 62         157 $sym->{complement} = $complement;
279             }
280             # create lookup
281 4         14 my $both_case = ($self->{seen_lc} != $self->{seen_uc});
282 4         12 $self->{lookup} = {};
283 4         13 for (my $i = 0; $i < scalar(@{$self->{syms}}); $i++) {
  98         226  
284 94         163 my $sym = $self->{syms}->[$i];
285 94         205 &_add_lookup($self->{lookup}, $sym, $sym->{sym}, $both_case);
286 94         141 for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
  106         265  
287 12         37 &_add_lookup($self->{lookup}, $sym, $sym->{aliases}->[$j], $both_case);
288             }
289             }
290 4         14 $self->{comprise_lookup} = \%comprise_lookup;
291 4         11 $self->{fully_complementable} = $fully_complementable;
292             # create regular expressions to match something that isn't a valid symbol
293 4         9 my $all_symbols = quotemeta(join('', (keys %{$self->{lookup}})));
  4         48  
294 4         127 $self->{re_not_valid} = qr/[^$all_symbols]/;
295 4         16 my $ambig_symbols = '';
296 4         12 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         160  
297 62         102 my $entry = $self->{syms}->[$i];
298 62         91 my $prime_sym = $entry->{sym};
299 62         92 $ambig_symbols .= $prime_sym;
300 62 100 66     175 if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
301 22         53 my $prime_sym_copy = $prime_sym; # copy
302 22         42 $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
303 22         53 $ambig_symbols .= $prime_sym_copy;
304             }
305 62         84 foreach my $alias_sym (@{$entry->{aliases}}) {
  62         164  
306 8         14 $ambig_symbols .= $alias_sym;
307 8 100 100     50 if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
308 2         7 my $alias_sym_copy = $alias_sym; # copy
309 2         7 $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
310 2         8 $ambig_symbols .= $alias_sym_copy;
311             }
312             }
313             }
314 4         60 $self->{re_ambig} = qr/[\Q$ambig_symbols\E]/;
315 4         68 $self->{re_ambig_only} = qr/^[\Q$ambig_symbols\E]*$/;
316             #$ambig_symbols = quotemeta($ambig_symbols);
317             #$self->{re_ambig} = qr/[$ambig_symbols]/;
318             #$self->{re_ambig_only} = qr/^[$ambig_symbols]*$/;
319              
320             # create translatation strings
321             # Strings to translate aliases of core symbols to prime representation
322 4         17 my ($tr_src_core_alias_to_prime, $tr_dst_core_alias_to_prime) = ('', '');
323 4         19 for (my $i = 0; $i < $self->{ncore}; $i++) {
324 32         66 my $entry = $self->{syms}->[$i];
325 32         44 my $prime_sym = $entry->{sym};
326             # alternate case of primary symbol?
327 32 100 66     84 if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
328 8         20 my $prime_sym_copy = $prime_sym; # copy
329 8         20 $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
330 8         18 $tr_src_core_alias_to_prime .= $prime_sym_copy;
331 8         20 $tr_dst_core_alias_to_prime .= $prime_sym;
332             }
333             # alias symbols
334 32         42 foreach my $alias_sym (@{$entry->{aliases}}) {
  32         91  
335 4         8 $tr_src_core_alias_to_prime .= $alias_sym;
336 4         8 $tr_dst_core_alias_to_prime .= $prime_sym;
337             # alternate case of alias symbol?
338 4 100 66     26 if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
339 2         7 my $alias_sym_copy = $alias_sym; # copy
340 2         6 $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
341 2         6 $tr_src_core_alias_to_prime .= $alias_sym_copy;
342 2         11 $tr_dst_core_alias_to_prime .= $prime_sym;
343             }
344             }
345             }
346 4         19 $tr_src_core_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
347 4         11 $tr_dst_core_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
348 4         13 $self->{tr_src_core_alias_to_prime} = $tr_src_core_alias_to_prime;
349 4         10 $self->{tr_dst_core_alias_to_prime} = $tr_dst_core_alias_to_prime;
350             # Strings to translate aliases of ambiguous symbols to prime representation
351 4         16 my ($tr_src_ambig_alias_to_prime, $tr_dst_ambig_alias_to_prime) = ('', '');
352 4         12 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         153  
353 62         93 my $entry = $self->{syms}->[$i];
354 62         100 my $prime_sym = $entry->{sym};
355             # alternate case of primary symbol?
356 62 100 66     183 if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
357 22         49 my $prime_sym_copy = $prime_sym; # copy
358 22         46 $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
359 22         49 $tr_src_ambig_alias_to_prime .= $prime_sym_copy;
360 22         41 $tr_dst_ambig_alias_to_prime .= $prime_sym;
361             }
362             # alias symbols
363 62         87 foreach my $alias_sym (@{$entry->{aliases}}) {
  62         148  
364 8         16 $tr_src_ambig_alias_to_prime .= $alias_sym;
365 8         15 $tr_dst_ambig_alias_to_prime .= $prime_sym;
366             # alternate case of alias symbol?
367 8 100 100     51 if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
368 2         9 my $alias_sym_copy = $alias_sym;
369 2         7 $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/;
370 2         5 $tr_src_ambig_alias_to_prime .= $alias_sym_copy;
371 2         7 $tr_dst_ambig_alias_to_prime .= $prime_sym;
372             }
373             }
374             }
375 4         17 $tr_src_ambig_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
376 4         9 $tr_dst_ambig_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
377 4         16 $self->{tr_src_ambig_alias_to_prime} = $tr_src_ambig_alias_to_prime;
378 4         11 $self->{tr_dst_ambig_alias_to_prime} = $tr_dst_ambig_alias_to_prime;
379             # Strings to translate ambiguous symbols and their aliases to the wildcard
380 4         12 my ($tr_src_ambig_to_wild, $tr_dst_ambig_to_wild) = ('', '');
381 4         15 my $wildcard = $self->{syms}->[$self->{ncore}]->{sym};
382 4         14 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  66         149  
383 62         100 my $entry = $self->{syms}->[$i];
384 62         92 my $prime_sym = $entry->{sym};
385             # primary symbol to wildcard (unless it is the wildcard)
386 62 100       125 if ($i != $self->{ncore}) {
387 58         91 $tr_src_ambig_to_wild .= $prime_sym;
388 58         81 $tr_dst_ambig_to_wild .= $wildcard;
389             }
390             # alternate case of primary symbol?
391 62 100 66     181 if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
392 22         54 my $prime_sym_copy = $prime_sym;
393 22         41 $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/;
394 22         47 $tr_src_ambig_to_wild .= $prime_sym_copy;
395 22         53 $tr_dst_ambig_to_wild .= $wildcard;
396             }
397             # alias symbols
398 62         86 foreach my $alias_sym (@{$entry->{aliases}}) {
  62         153  
399 8         13 $tr_src_ambig_to_wild .= $alias_sym;
400 8         15 $tr_dst_ambig_to_wild .= $wildcard;
401             # alternate case of alias symbol?
402 8 100 100     43 if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
403 2         8 my $alias_sym_copy = $alias_sym;
404 2         6 $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/;
405 2         6 $tr_src_ambig_to_wild .= $alias_sym_copy;
406 2         7 $tr_dst_ambig_to_wild .= $wildcard;
407             }
408             }
409             }
410 4         14 $tr_src_ambig_to_wild =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
411 4         12 $tr_dst_ambig_to_wild =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
412 4         12 $self->{tr_src_ambig_to_wild} = $tr_src_ambig_to_wild;
413 4         11 $self->{tr_dst_ambig_to_wild} = $tr_dst_ambig_to_wild;
414             # Strings to translate unknown symbols to the wildcard
415 4         12 my ($tr_src_unknown_to_wild, $tr_dst_unknown_to_wild) = ('', '');
416 4         9 my $range_start = 0;
417 4         24 for (my $i = 33; $i <= 126; $i++) {
418 376 100       1037 if (defined($self->{lookup}->{chr($i)})) {
419 140 100       346 if ($i > ($range_start + 1)) {
    100          
420 38         125 $tr_src_unknown_to_wild .= sprintf("\\%03o-\\%03o", $range_start, $i - 1);
421             } elsif ($i == ($range_start + 1)) {
422 12         32 $tr_src_unknown_to_wild .= sprintf("\\%03o", $range_start);
423             }
424 140         299 $range_start = $i + 1;
425             }
426             }
427 4         17 $tr_src_unknown_to_wild .= sprintf("\\%03o-\\377", $range_start);
428 4 50 33     28 $tr_dst_unknown_to_wild = ($wildcard eq '-' || $wildcard eq '/' ? "\\" : "") .$wildcard; # tr will extend using the last character
429 4         14 $self->{tr_src_unknown_to_wild} = $tr_src_unknown_to_wild;
430 4         11 $self->{tr_dst_unknown_to_wild} = $tr_dst_unknown_to_wild;
431             # check 'like' a standard alphabet
432 4 50       26 if ($self->{like}) {
433 4         14 my ($req_core, $req_comp, $name);
434 4 50       25 if ($self->{like} eq 'RNA') {
    50          
    0          
435 0         0 $req_core = 'ACGU';
436 0         0 $name = 'RNA';
437             } elsif ($self->{like} eq 'DNA') {
438 4         10 $req_core = 'ACGT';
439 4         7 $req_comp = 'TGCA';
440 4         8 $name = 'DNA';
441             } elsif ($self->{like} eq 'PROTEIN') {
442 0         0 $req_core = 'ACDEFGHIKLMNPQRSTVWY';
443 0         0 $name = 'protein';
444             }
445 4 50       11 if (defined $req_core) {
446 4         20 for (my $i = 0; $i < length($req_core); $i++) {
447 16         39 my $sym = substr($req_core, $i, 1);
448 16         28 my $symobj = $self->{lookup}->{$sym};
449 16 50       34 unless (defined $symobj) {
450 0         0 $self->_error("not $name like - expected symbol '$sym'");
451             }
452 16 50       44 unless ($symobj->{index} < $self->{ncore}) {
453 0         0 $self->_error("not $name like - symbol '$sym' must be a core symbol");
454             }
455 16 50       33 if (defined $req_comp) {
456 16         31 my $comp = substr($req_comp, $i, 1);
457 16         30 my $compobj = $self->{lookup}->{$comp};
458 16 50 0     57 if (not defined $compobj || not defined $symobj->{complement} || $symobj->{complement} != $compobj) {
      33        
459 0         0 $self->_error("not $name like - symbol '$sym' complement must be '$comp'");
460             }
461             } else {
462             #if (defined $symobj->{complement}) {
463             # $self->_error("not $name like - symbol '$sym' must not have complement");
464             #}
465             }
466             }
467             }
468             }
469             # done
470 4         22 $self->{parsed} = 1; # TRUE
471             }
472              
473             sub parse_file {
474 0     0 0 0 my $self = shift;
475 0 0       0 die("Already finished parsing") if $self->{parsed};
476 0         0 my ($file) = @_;
477 0         0 $self->{file_name} = $file;
478 0         0 my ($fh, $line);
479 0         0 sysopen($fh, $file, O_RDONLY);
480 0         0 while($line = <$fh>) {
481 0         0 chomp($line);
482 0         0 $self->parse_line($line);
483             }
484 0         0 close($fh);
485 0         0 $self->parse_done();
486             }
487              
488             sub _sym_to_text {
489 0     0   0 my ($sym) = @_;
490 0         0 my $name_info = '';
491 0 0       0 if (defined($sym->{name})) {
492 0         0 $name_info = ' "' . &_encode_JSON_string($sym->{name}) . '"';
493             }
494 0         0 my $colour_info = '';
495 0 0       0 if (defined($sym->{colour})) {
496 0         0 $colour_info = sprintf(" %.06X", $sym->{colour});
497             }
498 0         0 return $sym->{sym} . $name_info . $colour_info;
499             }
500              
501             sub to_text {
502 0     0 0 0 my $self = shift;
503 0         0 my %options = @_;
504 0         0 my $out = '';
505 0         0 my $gap = undef;
506             # set option defaults
507 0 0       0 $options{print_header} = 1 unless defined $options{print_header};
508 0 0       0 $options{print_footer} = 1 unless defined $options{print_footer};
509             # output header
510 0 0       0 if ($options{print_header}) {
511             $out .= "ALPHABET " . $self->name(1) .
512 0 0       0 ($self->{like} ? ' ' . $self->{like} . '-LIKE' : '') . "\n";
513             }
514             # output all paired core symbols
515 0         0 for (my $i = 0; $i < $self->{ncore}; $i++) {
516 0         0 my $sym = $self->{syms}->[$i];
517 0         0 my $csym = $sym->{complement};
518 0 0 0     0 if (defined($csym) && $sym->{index} < $csym->{index}) {
519 0         0 $out .= &_sym_to_text($sym) . ' ~ ' . &_sym_to_text($csym) . "\n";
520             }
521             }
522             # output all unpared core symbols
523 0         0 for (my $i = 0; $i < $self->{ncore}; $i++) {
524 0         0 my $sym = $self->{syms}->[$i];
525 0 0       0 if (!defined($sym->{complement})) {
526 0         0 $out .= &_sym_to_text($sym) . "\n";
527             }
528             }
529             # output ambiguous symbols
530 0         0 for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
  0         0  
531 0         0 my $sym = $self->{syms}->[$i];
532 0 0       0 if (scalar(@{$sym->{comprise}}) == 0) {
  0         0  
533 0         0 $gap = $sym;
534 0         0 next;
535             }
536 0         0 my $comprise = '';
537 0         0 for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
  0         0  
538 0         0 $comprise .= $sym->{comprise}->[$j]->{sym};
539             }
540 0         0 $out .= &_sym_to_text($sym) . ' = ' . $comprise . "\n";
541 0         0 for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
  0         0  
542 0         0 $out .= $sym->{aliases}->[$j] . ' = ' . $comprise . "\n";
543             }
544             }
545             # output core symbol aliases
546 0         0 for (my $i = 0; $i < $self->{ncore}; $i++) {
547 0         0 my $sym = $self->{syms}->[$i];
548 0         0 for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
  0         0  
549 0         0 $out .= $sym->{aliases}->[$j] . ' = ' . $sym->{sym} . "\n";
550             }
551             }
552             # output gap symbol and aliases
553 0 0       0 if ($gap) {
554 0         0 $out .= &_sym_to_text($gap) . " =\n";
555 0         0 for (my $j = 0; $j < scalar(@{$gap->{aliases}}); $j++) {
  0         0  
556 0         0 $out .= $gap->{aliases}->[$j] . " =\n";
557             }
558             }
559             # output footer
560 0 0       0 if ($options{print_footer}) {
561 0         0 $out .= "END ALPHABET\n";
562             }
563 0         0 return $out;
564             }
565              
566             sub to_json() {
567 0     0 0 0 my $self = shift;
568 0         0 my ($jw) = @_;
569 0         0 $jw->start_object_value();
570 0         0 $jw->str_prop("name", $self->{name});
571 0 0       0 $jw->str_prop("like", lc($self->{like})) if ($self->{like});
572 0         0 $jw->num_prop("ncore", $self->{ncore});
573 0         0 $jw->property("symbols");
574 0         0 $jw->start_array_value();
575 0         0 for (my $i = 0; $i < scalar(@{$self->{syms}}); $i++) {
  0         0  
576 0         0 my $sym = $self->{syms}->[$i];
577 0         0 $jw->start_object_value();
578 0         0 $jw->str_prop("symbol", $sym->{sym});
579 0 0       0 $jw->str_prop("aliases", join("", @{$sym->{aliases}})) if (scalar(@{$sym->{aliases}}) > 0);
  0         0  
  0         0  
580 0 0       0 $jw->str_prop("name", $sym->{name}) if (defined($sym->{name}));
581 0 0 0     0 $jw->str_prop("colour", sprintf("%06X", $sym->{colour})) if (defined($sym->{colour}) && $sym->{colour} > 0);
582 0 0       0 if ($i < $self->{ncore}) {
583 0 0       0 $jw->str_prop("complement", $sym->{complement}->{sym}) if (defined($sym->{complement}));
584             } else {
585 0         0 my $comprise = '';
586 0         0 for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
  0         0  
587 0         0 $comprise .= $sym->{comprise}->[$j]->{sym};
588             }
589 0         0 $jw->str_prop("equals", $comprise);
590             }
591 0         0 $jw->end_object_value();
592             }
593 0         0 $jw->end_array_value();
594 0         0 $jw->end_object_value();
595             }
596              
597             sub has_errors {
598 4     4 0 11 my $self = shift;
599 4         7 return scalar(@{$self->{errors}});
  4         23  
600             }
601              
602             sub get_errors {
603 0     0 0 0 my $self = shift;
604 0         0 return @{$self->{errors}};
  0         0  
605             }
606              
607             sub name {
608 0     0 0 0 my $self = shift;
609 0         0 my ($should_encode) = @_;
610 0 0       0 if ($should_encode) {
611 0         0 return '"' . &_encode_JSON_string($self->{name}) . '"';
612             } else {
613 0         0 return $self->{name};
614             }
615             }
616              
617             sub has_complement {
618 5     5 0 7 my $self = shift;
619 5 50       14 die("Not finished parsing") unless $self->{parsed};
620 5         16 return $self->{fully_complementable};
621             }
622              
623             sub size_core {
624 8     8 0 13 my $self = shift;
625 8         28 return $self->{ncore};
626             }
627              
628             sub size_full {
629 8     8 0 15 my $self = shift;
630 8         20 return scalar(@{$self->{syms}});
  8         23  
631             }
632              
633             #
634             # Return the input list in list context
635             # and the flattened list in scalar context.
636             #
637             sub _flatten_list_in_scalar_context {
638 6 100   6   13 if (wantarray) {
639 5         25 return @_;
640             } else {
641 1         9 return join('' , @_);
642             }
643             }
644              
645             #
646             # Returns the concatenation of all the
647             # possible ambiguity codes of the alphabet.
648             # Returns a list in list context, otherwise a scalar.
649             #
650             sub get_ambig_syms {
651 0     0 0 0 my $self = shift;
652 0         0 my @ambig_syms;
653              
654 0         0 foreach my $sym ($self->get_syms()) {
655 0 0       0 unless ($self->is_core($sym)) {
656 0         0 push(@ambig_syms, $sym);
657             }
658             }
659              
660 0         0 return _flatten_list_in_scalar_context(@ambig_syms);
661             }
662              
663             #
664             # Returns the concatenation of all the possible symbols
665             # of the alphabet.
666             # Returns a list in list context, otherwise a scalar.
667             #
668             sub get_syms {
669 3     3 0 6 my $self = shift;
670 3         6 my @syms;
671              
672 3         4 foreach my $symbol (@{$self->{syms}}) {
  3         10  
673 62         92 push(@syms, $symbol->{sym});
674             }
675              
676 3         12 return _flatten_list_in_scalar_context(@syms);
677             }
678              
679             #
680             # Returns the concatenation of all the core symbols
681             # of the alphabet.
682             # Returns a list in list context, otherwise a scalar.
683             #
684             sub get_core {
685 3     3 0 3862 my $self = shift;
686 3         6 my @core_syms;
687              
688 3         11 foreach my $sym ($self->get_syms()) {
689 62 100       94 if ($self->is_core($sym)) {
690 20         36 push(@core_syms, $sym);
691             }
692             }
693              
694 3         8 return _flatten_list_in_scalar_context(@core_syms);
695             }
696              
697             #
698             # Returns a dictionary containing the core symbols that
699             # each ambiguity code of the alphabet is composed of.
700             #
701             sub get_alternates_dict {
702 0     0 0 0 my $self = shift;
703 0         0 my %alternates;
704 0         0 foreach my $ambig_base (split(//, $self->get_ambig_syms())) {
705 0         0 %alternates = (%alternates, %{$self->comprise($ambig_base)});
  0         0  
706             }
707 0         0 return %alternates;
708             }
709              
710             #
711             # Is the symbol part of the core alphabet.
712             #
713             sub is_core {
714 62     62 0 66 my $self = shift;
715 62         83 my ($letter) = @_;
716 62         76 my $sym = $self->{lookup}->{$letter};
717 62   66     189 return defined($sym) && $sym->{index} < $self->{ncore};
718             }
719              
720             #
721             # Is the symbol part of the alphabet.
722             #
723             sub is_known {
724 0     0 0 0 my $self = shift;
725 0         0 my ($letter) = @_;
726 0         0 my $sym = $self->{lookup}->{$letter};
727 0         0 return defined($sym);
728             }
729              
730             #
731             # Does the sequence only contain ambiguous symbols.
732             # This might be useful if you want to throw away something containing only
733             # ambiguous symbols as that has no real useful information.
734             #
735             sub is_ambig_seq {
736 0     0 0 0 my $self = shift;
737 0         0 my ($seq) = @_;
738 0         0 my $ambig_only_re = $self->{re_ambig_only};
739              
740 0         0 return scalar($seq =~ m/$ambig_only_re/);
741             }
742              
743             #
744             # Does the sequence contain ambiguous symbols.
745             #
746             sub is_part_ambig_seq {
747 0     0 0 0 my $self = shift;
748 0         0 my ($seq) = @_;
749 0         0 my $ambig_re = $self->{re_ambig};
750              
751 0         0 return scalar($seq =~ m/$ambig_re/);
752             }
753              
754             sub char {
755 0     0 0 0 my $self = shift;
756 0         0 my ($index) = @_;
757              
758 0 0 0     0 return if ($index < 0 || $index > scalar(@{$self->{syms}}));
  0         0  
759              
760 0         0 return $self->{syms}->[$index]->{sym};
761             }
762              
763             #
764             # Converts a symbol into the index in the alphabet.
765             #
766             sub index {
767 0     0 0 0 my $self = shift;
768 0         0 my ($letter) = @_;
769 0         0 my $sym = $self->{lookup}->{$letter};
770              
771 0 0       0 return unless defined $sym;
772              
773 0         0 return $sym->{index};
774             }
775              
776             #
777             # Converts a symbol that could be an alias into the primary symbol.
778             #
779             sub encode {
780 0     0 0 0 my $self = shift;
781 0         0 my ($letter) = @_;
782 0         0 my $sym = $self->{lookup}->{$letter};
783              
784 0 0       0 return unless defined $sym;
785 0         0 return $sym->{sym};
786             }
787              
788             #
789             # Takes a letter and returns the set of comprising core symbols.
790             #
791             sub comprise {
792 0     0 0 0 my $self = shift;
793 0         0 my ($letter) = @_;
794 0         0 my $sym = $self->{lookup}->{$letter};
795 0         0 my %set = ();
796 0 0       0 if (defined($sym->{comprise})) {
797 0         0 for (my $i = 0; $i < scalar(@{$sym->{comprise}}); $i++) {
  0         0  
798 0         0 $set{$sym->{comprise}->[$i]->{sym}} = 1;
799             }
800             } else {
801 0         0 $set{$sym->{sym}} = 1;
802             }
803 0         0 return \%set;
804             }
805              
806             #
807             # Converts a set of core symbols into either a matching ambigous symbol or
808             # a group of core symbols.
809             #
810             sub regex {
811 0     0 0 0 my $self = shift;
812 0         0 my ($set) = @_;
813 0         0 my $key = join('', sort _symbol_cmp keys %{$set});
  0         0  
814 0         0 my $sym = $self->{comprise_lookup}->{$key};
815 0 0       0 if (defined($sym)) {
816 0         0 return $sym->{sym};
817             } else {
818 0         0 return '['.$key.']';
819             }
820             }
821              
822             #
823             # Takes a set of core symbols and creates a set containing the other core symbols.
824             #
825             sub negate_set {
826 0     0 0 0 my $self = shift;
827 0         0 my ($set) = @_;
828 0         0 my %negated_set = ();
829              
830 0         0 for (my $i = 0; $i < $self->{ncore}; $i++) {
831 0         0 my $a = $self->{syms}->[$i]->{sym};
832 0 0       0 if (!($set->{$a})) {
833 0         0 $negated_set{$a} = 1;
834             }
835             }
836              
837 0         0 return \%negated_set;
838             }
839              
840             #
841             # Finds the first unknown symbol.
842             #
843             sub find_unknown {
844 0     0 0 0 my $self = shift;
845 0         0 my ($seq) = @_;
846 0         0 my $not_valid = $self->{re_not_valid};
847              
848 0 0       0 if ($seq =~ m/($not_valid)/) {
849 0         0 return ($-[1], $1);
850             } else {
851 0         0 return;
852             }
853             }
854              
855             #
856             # Reverse complements a sequence using the alphabet.
857             #
858             sub rc_seq {
859 1     1 0 4 my $self = shift;
860 1 50       3 croak("Alphabet does not allow complementing.") unless $self->has_complement();
861 1         4 my ($sequence) = @_;
862 1         6 my @seq = split //, $sequence;
863              
864 1         3 @seq = reverse @seq;
865 1         2 @seq = map { $self->{lookup}->{$_}->{complement}->{sym} } @seq;
  5         13  
866              
867 1         8 return join('', @seq);
868             }
869              
870             #
871             # Translate a sequence using the alphabet.
872             #
873             # NO_ALIASES Convert any alias symbols into the main symbol.
874             # NO_AMBIGS Convert any ambiguous symbols into the main wildcard symbol.
875             # NO_UNKNOWN Convert any unknown symbols into the main wildcard symbol.
876             #
877             sub translate_seq {
878 0     0 0 0 my $self = shift;
879 0         0 my ($sequence, %options) = @_;
880 0 0       0 my $no_aliases = (defined($options{NO_ALIASES}) ? $options{NO_ALIASES} : 0);
881 0 0       0 my $no_ambigs = (defined($options{NO_AMBIGS}) ? $options{NO_AMBIGS} : 0);
882 0 0       0 my $no_unknown = (defined($options{NO_UNKNOWN}) ? $options{NO_UNKNOWN} : 0);
883 0         0 my ($src, $dst) = ('', '');
884 0 0       0 if ($no_aliases) {
    0          
885 0         0 $src .= $self->{tr_src_core_alias_to_prime};
886 0         0 $dst .= $self->{tr_dst_core_alias_to_prime};
887 0 0       0 if ($no_ambigs) {
888 0         0 $src .= $self->{tr_src_ambig_to_wild};
889 0         0 $dst .= $self->{tr_dst_ambig_to_wild};
890             } else {
891 0         0 $src .= $self->{tr_src_ambig_alias_to_prime};
892 0         0 $dst .= $self->{tr_dst_ambig_alias_to_prime};
893             }
894             } elsif ($no_ambigs) {
895 0         0 $src .= $self->{tr_src_ambig_to_wild};
896 0         0 $dst .= $self->{tr_dst_ambig_to_wild};
897             }
898 0 0       0 if ($no_unknown) {
899 0         0 $src .= $self->{tr_src_unknown_to_wild};
900 0         0 $dst .= $self->{tr_dst_unknown_to_wild};
901             }
902              
903 0         0 eval {$sequence =~ tr/$src/$dst/};
  0         0  
904              
905 0         0 return $sequence;
906             }
907              
908             #
909             # Returns a scalar of Weblogo 3 colour arguments:
910             # --color COLOR SYMBOLS DESCRIPTION ...
911             # Currently only operates over core symbols.
912             sub get_Weblogo_colour_args {
913 1     1 0 2 my $self = shift;
914              
915 1         3 my $arg_string = "";
916              
917 1         4 foreach my $letter ($self->get_core()) {
918 12         18 my $sym = $self->{lookup}->{$letter};
919              
920             # convert back from decoded hex, adding '#' prefix
921 12         20 my $HTML_colour = sprintf("#%X", $sym->{colour});
922              
923 12         27 $arg_string .= "--color '$HTML_colour' '$sym->{sym}' '$sym->{name}' ";
924             }
925              
926 1         8 return $arg_string;
927             }
928              
929             #
930             # Compare 2 different alphabet objects for equality
931             #
932             sub equals {
933 2     2 0 5 my $self = shift;
934 2         7 my ($other) = @_;
935 2 50       9 return 0 if ($self->size_core() != $other->size_core());
936 2 50       9 return 0 if ($self->size_full() != $other->size_full());
937 2 50       10 return 0 if ($self->has_complement() != $other->has_complement());
938 2 50       11 return 0 if (defined($self->{name}) != defined($other->{name}));
939 2 50 33     18 if (defined($self->{name}) && defined($other->{name})) {
940 2 50       12 return 0 if ($self->{name} ne $other->{name});
941             }
942 2         6 my $nsyms = $self->size_full();
943 2         9 for (my $i = 0; $i < $nsyms; $i++) {
944 47         91 my $self_sym = $self->{syms}->[$i];
945 47         63 my $other_sym = $other->{syms}->[$i];
946 47 50       94 return 0 if ($self_sym->{sym} ne $other_sym->{sym});
947 47 50       89 return 0 if (defined($self_sym->{name}) != defined($other_sym->{name}));
948 47 100 66     127 if (defined($self_sym->{name}) && defined($other_sym->{name})) {
949 46 50       86 return 0 if ($self_sym->{name} ne $other_sym->{name});
950             }
951 47 50       89 return 0 if (defined($self_sym->{colour}) != defined($other_sym->{colour}));
952 47 100 66     96 if (defined($self_sym->{colour}) && defined($other_sym->{colour})) {
953 16 50       31 return 0 if ($self_sym->{colour} != $other_sym->{colour});
954             }
955 47 50       58 return 0 if (scalar(@{$self_sym->{aliases}}) != scalar(@{$other_sym->{aliases}}));
  47         72  
  47         86  
956 47         63 my $naliases = scalar(@{$self_sym->{aliases}});
  47         68  
957 47         117 for (my $j = 0; $j < $naliases; $j++) {
958 6 50       25 return 0 if ($self_sym->{aliases}->[$j] ne $other_sym->{aliases}->[$j]);
959             }
960             }
961 2         9 $nsyms = $self->size_core();
962 2         9 for (my $i = 0; $i < $nsyms; $i++) {
963 16         24 my $self_sym = $self->{syms}->[$i];
964 16         22 my $other_sym = $other->{syms}->[$i];
965 16 50       28 return 0 if (defined($self_sym->{complement}) != defined($other_sym->{complement}));
966 16 50 33     52 if (defined($self_sym->{complement}) && defined($other_sym->{complement})) {
967 16 50       42 return 0 if ($self_sym->{complement}->{sym} ne $other_sym->{complement}->{sym});
968             }
969             }
970 2         6 $nsyms = $self->size_full();
971 2         5 for (my $i = $self->size_core(); $i < $nsyms; $i++) {
972 31         45 my $self_sym = $self->{syms}->[$i];
973 31         38 my $other_sym = $other->{syms}->[$i];
974 31 50       34 return 0 if (scalar(@{$self_sym->{comprise}}) != scalar(@{$other_sym->{comprise}}));
  31         43  
  31         63  
975 31         47 for (my $j = 0; $j < scalar(@{$self_sym->{comprise}}); $j++) {
  123         225  
976 92 50       173 return 0 if ($self_sym->{comprise}->[$j]->{sym} ne $other_sym->{comprise}->[$j]->{sym});
977             }
978             }
979 2         5 return 1;
980             }
981              
982             sub is_dna {
983 1     1 0 10 my $self = shift;
984 1 50       8 $self->{cached_is_dna} = $self->equals(&dna()) unless defined $self->{cached_is_dna};
985 1         38 return $self->{cached_is_dna};
986             }
987              
988             sub is_moddna {
989 1     1 0 7 my $self = shift;
990 1 50       6 $self->{cached_is_moddna} = $self->equals(&moddna()) unless defined $self->{cached_is_moddna};
991 1         20 return $self->{cached_is_moddna};
992             }
993              
994             sub is_rna {
995 0     0 0 0 my $self = shift;
996 0 0       0 $self->{cached_is_rna} = $self->equals(&rna()) unless defined $self->{cached_is_rna};
997 0         0 return $self->{cached_is_rna};
998             }
999              
1000             sub is_protein {
1001 0     0 0 0 my $self = shift;
1002 0 0       0 $self->{cached_is_protein} = $self->equals(&protein()) unless defined $self->{cached_is_protein};
1003 0         0 return $self->{cached_is_protein};
1004             }
1005              
1006             #
1007             # Helper function to decode a string encoded using the method defined in the JSON specification.
1008             #
1009             sub _decode_JSON_string {
1010 0     0   0 my ($text) = @_;
1011              
1012 0 0       0 return unless defined $text;
1013              
1014 0         0 $text =~ s/\\(["\\\/])/$1/g;
1015 0         0 $text =~ s/\\b/\x{8}/g;
1016 0         0 $text =~ s/\\f/\x{C}/g;
1017 0         0 $text =~ s/\\n/\x{A}/g;
1018 0         0 $text =~ s/\\r/\x{D}/g;
1019 0         0 $text =~ s/\\t/\x{9}/g;
1020 0         0 $text =~ s/\\u([0-9A-Fa-f]{4})/chr(hex($1))/ge;
  0         0  
1021              
1022 0         0 return $text;
1023             }
1024              
1025             #
1026             # Helper function to encode a string using the method defined in the JSON specification.
1027             #
1028             sub _encode_JSON_string {
1029 0     0   0 my ($text) = @_;
1030              
1031 0         0 $text =~ s/(["\\\/])/\\$1/g;
1032 0         0 $text =~ s/\x{8}/\\b/g;
1033 0         0 $text =~ s/\x{C}/\\f/g;
1034 0         0 $text =~ s/\x{A}/\\n/g;
1035 0         0 $text =~ s/\x{D}/\\r/g;
1036 0         0 $text =~ s/\x{9}/\\t/g;
1037 0         0 $text =~ s/([\x{0}-\x{1F}\x{7F}-\x{9F}\x{2028}\x{2029}])/sprintf("\\u%.04X",ord($1))/ge;
  0         0  
1038              
1039 0         0 return $text;
1040             }
1041              
1042             #
1043             # Helper function to convert the colour into a number.
1044             #
1045             sub _decode_colour {
1046 0     0   0 my ($text) = @_;
1047              
1048 0 0       0 return unless defined $text;
1049              
1050 0         0 return hex($text);
1051             }
1052              
1053             # _symbol_cmp
1054             # Sorts so letters are before numbers which are before symbols.
1055             # Otherwise sorts using normal string comparison.
1056             sub _symbol_cmp($$) { ## no critic
1057 526     526   887 my ($a, $b) = @_;
1058              
1059 526 100       968 if (length($a) == length($b)) {
    100          
1060 470         893 for (my $i = 0; $i < length($a); $i++) {
1061 504         729 my ($sym_a, $sym_b);
1062 504         788 $sym_a = ord(substr($a, $i, 1));
1063 504         675 $sym_b = ord(substr($b, $i, 1));
1064             # check to see if either is a letter
1065 504         655 my ($is_letter_a, $is_letter_b);
1066 504   100     1566 $is_letter_a = (($sym_a >= ord('A') && $sym_a <= ord('Z')) || ($sym_a >= ord('a') && $sym_a <= ord('z')));
1067 504   100     1485 $is_letter_b = (($sym_b >= ord('A') && $sym_b <= ord('Z')) || ($sym_b >= ord('a') && $sym_b <= ord('z')));
1068 504 100       828 if ($is_letter_a) {
    100          
1069 438 100       762 if ($is_letter_b) {
1070 394 100       764 if ($sym_a < $sym_b) {
    100          
1071 168         396 return -1;
1072             } elsif ($sym_b < $sym_a) {
1073 192         425 return 1;
1074             }
1075 34         84 next;
1076             } else {
1077 44         68 return -1;
1078             }
1079             } elsif ($is_letter_b) {
1080 18         23 return 1;
1081             }
1082              
1083             # check to see if either is a number
1084 48         63 my ($is_num_a, $is_num_b);
1085 48   33     99 $is_num_a = ($sym_a ge '0' && $sym_a le '9');
1086 48   33     98 $is_num_b = ($sym_b ge '0' && $sym_b le '9');
1087 48 50       67 if ($is_num_a) {
    0          
1088 48 50       61 if ($is_num_b) {
1089 48 100       79 if ($sym_a < $sym_b) {
    50          
1090 16         29 return -1;
1091             } elsif ($sym_b < $sym_a) {
1092 32         86 return 1;
1093             }
1094 0         0 next;
1095             } else {
1096 0         0 return -1;
1097             }
1098             } elsif ($is_num_b) {
1099 0         0 return 1;
1100             }
1101              
1102             # both must be symbols
1103 0 0       0 if ($sym_a < $sym_b) {
    0          
1104 0         0 return -1;
1105             } elsif ($sym_b < $sym_a) {
1106 0         0 return 1;
1107             }
1108             }
1109              
1110 0         0 return 0;
1111             } elsif (length($a) > length($b)) {
1112 14         21 return -1;
1113             } else {
1114 42         71 return 1;
1115             }
1116             }
1117              
1118             sub _symobj_cmp($$) { ## no critic
1119 216     216   361 my ($sym1, $sym2) = @_;
1120 216         254 my $ret;
1121              
1122             # compare comprise
1123 216 100 66     615 if (defined($sym1->{comprise}) && defined($sym2->{comprise})) {
    50          
    50          
1124 150         263 $ret = _symbol_cmp($sym1->{comprise}, $sym2->{comprise});
1125 150 50       341 return $ret if $ret != 0;
1126             } elsif (defined($sym1->{comprise})) {
1127 0         0 return 1; # sym2 first because it is a core symbol
1128             } elsif (defined($sym2->{comprise})) {
1129 0         0 return -1; # sym1 first because it is a core symbol
1130             }
1131              
1132             # compare symbol
1133 66         108 $ret = _symbol_cmp($sym1->{sym}, $sym2->{sym});
1134 66 50       133 return $ret if $ret != 0;
1135              
1136             # compare complement
1137 0 0 0     0 if (defined($sym1->{complement}) && defined($sym2->{complement})) {
    0          
    0          
1138 0         0 $ret = _symbol_cmp($sym1->{complement}, $sym2->{complement});
1139 0 0       0 return $ret if $ret != 0;
1140             } elsif (defined($sym1->{complement})) {
1141 0         0 return 1;
1142             } elsif (defined($sym2->{complement})) {
1143 0         0 return -1;
1144             }
1145              
1146             # compare aliases
1147 0 0       0 if (scalar(@{$sym1->{aliases}}) != scalar(@{$sym2->{aliases}})) {
  0         0  
  0         0  
1148             # sort length decending
1149 0         0 return scalar(@{$sym2->{aliases}}) <=> scalar(@{$sym1->{aliases}});
  0         0  
  0         0  
1150             }
1151 0         0 for (my $i = 0; $i < scalar(@{$sym1->{aliases}}); $i++) {
  0         0  
1152 0         0 $ret = _symbol_cmp($sym1->{aliases}->[$i], $sym2->{aliases}->[$i]);
1153 0 0       0 return $ret if $ret != 0;
1154             }
1155              
1156             # compare name
1157 0 0 0     0 if (defined($sym1->{name}) && defined($sym2->{name})) {
    0          
    0          
1158 0         0 $ret = $sym1->{name} cmp $sym2->{name};
1159 0 0       0 return $ret if $ret != 0;
1160             } elsif (defined($sym1->{name})) {
1161 0         0 return 1;
1162             } elsif (defined($sym2->{name})) {
1163 0         0 return -1;
1164             }
1165              
1166             # compare colour
1167 0 0 0     0 if (defined($sym1->{colour}) && defined($sym2->{colour})) {
    0          
    0          
1168 0         0 return $sym1->{colour} <=> $sym2->{colour};
1169             } elsif (defined($sym1->{colour})) {
1170 0         0 return 1;
1171             } elsif (defined($sym2->{colour})) {
1172 0         0 return -1;
1173             }
1174 0         0 return 0;
1175             }
1176              
1177             #
1178             # Return the RNA alphabet.
1179             #
1180             sub rna {
1181 0     0 0 0 my $alph = new __PACKAGE__;
1182              
1183 0         0 $alph->parse_header('RNA', 'RNA');
1184              
1185             # core symbols
1186 0         0 $alph->parse_symbol('A', 'Adenine', 0xCC0000);
1187 0         0 $alph->parse_symbol('C', 'Cytosine', 0x0000CC);
1188 0         0 $alph->parse_symbol('G', 'Guanine', 0xFFB300);
1189 0         0 $alph->parse_symbol('U', 'Uracil', 0x008000, undef, undef, 'T');
1190             # ambiguous symbols
1191 0         0 $alph->parse_symbol('W', 'Weak', undef, undef, 'AU');
1192 0         0 $alph->parse_symbol('S', 'Strong', undef, undef, 'CG');
1193 0         0 $alph->parse_symbol('M', 'Amino', undef, undef, 'AC');
1194 0         0 $alph->parse_symbol('K', 'Keto', undef, undef, 'GU');
1195 0         0 $alph->parse_symbol('R', 'Purine', undef, undef, 'AG');
1196 0         0 $alph->parse_symbol('Y', 'Pyrimidine', undef, undef, 'CU');
1197 0         0 $alph->parse_symbol('B', 'Not A', undef, undef, 'CGU');
1198 0         0 $alph->parse_symbol('D', 'Not C', undef, undef, 'AGU');
1199 0         0 $alph->parse_symbol('H', 'Not G', undef, undef, 'ACU');
1200 0         0 $alph->parse_symbol('V', 'Not U', undef, undef, 'ACG');
1201 0         0 $alph->parse_symbol('N', 'Any base', undef, undef, 'ACGU', 'X.');
1202             # for now treat '.' (gap) as a wildcard
1203              
1204             # process
1205 0         0 $alph->parse_done();
1206              
1207             # error check
1208 0 0       0 die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());
1209              
1210 0         0 return $alph;
1211             }
1212              
1213             #
1214             # Parse core symbols in the DNA alphabet.
1215             # This is done separately, to ease expansion.
1216             # All core symbols must be parsed before any ambiguous symbols.
1217             # Optionally accepts arguments to set colours (alphabetical order).
1218             #
1219             sub _parse_core_DNA_symbols {
1220 4     4   11 my $alph = shift;
1221              
1222 4   100     35 $$alph->parse_symbol('A', 'Adenine', shift || 0xCC0000, 'T');
1223 4   100     25 $$alph->parse_symbol('C', 'Cytosine', shift || 0x0000CC, 'G');
1224 4   100     26 $$alph->parse_symbol('G', 'Guanine', shift || 0xFFB300, 'C');
1225 4   100     25 $$alph->parse_symbol('T', 'Thymine', shift || 0x008000, 'A', undef, 'U');
1226             }
1227              
1228             #
1229             # Parse ambiguous symbols in the DNA alphabet.
1230             # This is done separately, to ease expansion.
1231             #
1232             sub _parse_ambiguous_DNA_symbols {
1233 4     4   7 my $alph = shift;
1234              
1235 4         18 $$alph->parse_symbol('W', 'Weak', undef, undef, 'AT');
1236 4         18 $$alph->parse_symbol('S', 'Strong', undef, undef, 'CG');
1237 4         18 $$alph->parse_symbol('M', 'Amino', undef, undef, 'AC');
1238 4         16 $$alph->parse_symbol('K', 'Keto', undef, undef, 'GT');
1239 4         17 $$alph->parse_symbol('R', 'Purine', undef, undef, 'AG');
1240 4         15 $$alph->parse_symbol('Y', 'Pyrimidine', undef, undef, 'CT');
1241 4         17 $$alph->parse_symbol('B', 'Not A', undef, undef, 'CGT');
1242 4         17 $$alph->parse_symbol('D', 'Not C', undef, undef, 'AGT');
1243 4         17 $$alph->parse_symbol('H', 'Not G', undef, undef, 'ACT');
1244 4         18 $$alph->parse_symbol('V', 'Not T', undef, undef, 'ACG');
1245 4         17 $$alph->parse_symbol('N', 'Any base', undef, undef, 'ACGT', 'X.');
1246             }
1247              
1248             #
1249             # Return the DNA alphabet.
1250             #
1251             sub dna {
1252 2     2 0 15 my $alph = new __PACKAGE__;
1253              
1254 2         13 $alph->parse_header('DNA', 'DNA');
1255              
1256 2         12 _parse_core_DNA_symbols(\$alph);
1257 2         18 _parse_ambiguous_DNA_symbols(\$alph);
1258              
1259             # for now treat '.' (gap) as a wildcard
1260             # process
1261 2         11 $alph->parse_done();
1262              
1263 2 50       15 die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());
1264              
1265 2         16 return $alph;
1266             }
1267              
1268             #
1269             # Return the DNA with cytosine modifications alphabet.
1270             #
1271             sub moddna {
1272 2     2 0 8 my $alph = new __PACKAGE__;
1273              
1274 2         10 $alph->parse_header('modDNA', 'DNA');
1275              
1276 2         9 _parse_core_DNA_symbols(\$alph, 0x8510A8, 0xA50026, 0x313695, 0xA89610);
1277              
1278             # additional core symbols
1279 2         6 $alph->parse_symbol('m', '5-Methylcytosine', 0xD73027, '1');
1280 2         14 $alph->parse_symbol('1', 'Guanine:5-Methylcytosine', 0x4575B4, 'm');
1281 2         5 $alph->parse_symbol('h', '5-Hydroxymethylcytosine', 0xF46D43, '2');
1282 2         5 $alph->parse_symbol('2', 'Guanine:5-Hydroxymethylcytosine', 0x74ADD1, 'h');
1283 2         5 $alph->parse_symbol('f', '5-Formylcytosine', 0xFDAE61, '3');
1284 2         5 $alph->parse_symbol('3', 'Guanine:5-Formylcytosine', 0xABD9E9, 'f');
1285 2         5 $alph->parse_symbol('c', '5-Carboxylcytosine', 0xFEE090, '4');
1286 2         4 $alph->parse_symbol('4', 'Guanine:5-Carboxylcytosine', 0xE0F3F8, 'c');
1287              
1288 2         6 _parse_ambiguous_DNA_symbols(\$alph);
1289              
1290             # additional ambiguous symbols
1291 2         8 $alph->parse_symbol('z', 'Any C mod', undef, undef, 'Cmhfc');
1292 2         7 $alph->parse_symbol('9', 'Guanine:any C mod', undef, undef, 'G1234');
1293 2         7 $alph->parse_symbol('y', 'C, not (hydroxy)methylcytosine', undef, undef, 'Cfc');
1294 2         30 $alph->parse_symbol('8', 'Guanine:C, not (hydroxy)methylcytosine', undef, undef, 'G34');
1295 2         7 $alph->parse_symbol('x', '(Hydroxy)methylcytosine', undef, undef, 'mh');
1296 2         7 $alph->parse_symbol('7', 'Guanine:(hydroxy)methylcytosine', undef, undef, '12');
1297 2         6 $alph->parse_symbol('w', '(Formyl/carboxyl)cytosine', undef, undef, 'fc');
1298 2         5 $alph->parse_symbol('6', 'Guanine:(formyl/carboxyl)cytosine', undef, undef, '34');
1299              
1300             # process
1301 2         7 $alph->parse_done();
1302              
1303 2 50       12 die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());
1304              
1305 2         29 return $alph;
1306             }
1307              
1308             #
1309             # Return the Protein alphabet.
1310             #
1311             sub protein {
1312 0     0 0   my $alph = new __PACKAGE__;
1313              
1314 0           $alph->parse_header('Protein', 'PROTEIN');
1315              
1316             # core symbols
1317 0           $alph->parse_symbol('A', 'Alanine', 0x0000CC);
1318 0           $alph->parse_symbol('R', 'Arginine', 0xCC0000);
1319 0           $alph->parse_symbol('N', 'Asparagine', 0x008000);
1320 0           $alph->parse_symbol('D', 'Aspartic acid', 0xFF00FF);
1321 0           $alph->parse_symbol('C', 'Cysteine', 0x0000CC);
1322 0           $alph->parse_symbol('E', 'Glutamic acid', 0xFF00FF);
1323 0           $alph->parse_symbol('Q', 'Glutamine', 0x008000);
1324 0           $alph->parse_symbol('G', 'Glycine', 0xFFB300);
1325 0           $alph->parse_symbol('H', 'Histidine', 0xFFCCCC);
1326 0           $alph->parse_symbol('I', 'Isoleucine', 0x0000CC);
1327 0           $alph->parse_symbol('L', 'Leucine', 0x0000CC);
1328 0           $alph->parse_symbol('K', 'Lysine', 0xCC0000);
1329 0           $alph->parse_symbol('M', 'Methionine', 0x0000CC);
1330 0           $alph->parse_symbol('F', 'Phenylalanine', 0x0000CC);
1331 0           $alph->parse_symbol('P', 'Proline', 0xFFFF00);
1332 0           $alph->parse_symbol('S', 'Serine', 0x008000);
1333 0           $alph->parse_symbol('T', 'Threonine', 0x008000);
1334 0           $alph->parse_symbol('W', 'Tryptophan', 0x0000CC);
1335 0           $alph->parse_symbol('Y', 'Tyrosine', 0x33E6CC);
1336 0           $alph->parse_symbol('V', 'Valine', 0x0000CC);
1337             # ambiguous symbols
1338 0           $alph->parse_symbol('B', 'Asparagine or Aspartic acid', undef, undef, 'ND');
1339 0           $alph->parse_symbol('Z', 'Glutamine or Glutamic acid', undef, undef, 'QE');
1340 0           $alph->parse_symbol('J', 'Leucine or Isoleucine', undef, undef, 'LI');
1341 0           $alph->parse_symbol('X', 'Any amino acid', undef, undef, 'ARNDCEQGHILKMFPSTWYV', '*.');
1342             # for now treat '*' (stop codon) and '.' (gap) as a wildcard
1343              
1344             # process
1345 0           $alph->parse_done();
1346              
1347             # error check
1348 0 0         die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());
1349              
1350 0           return $alph;
1351             }
1352              
1353             1;
1354             __END__