File Coverage

blib/lib/Bio/GeneDesign/Basic.pm
Criterion Covered Total %
statement 145 147 98.6
branch 37 48 77.0
condition 13 30 43.3
subroutine 17 17 100.0
pod n/a
total 212 242 87.6


line stmt bran cond sub pod time code
1             #
2             # Basic GeneDesign libraries
3             #
4              
5             =head1 NAME
6              
7             Bio::GeneDesign::Basic
8              
9             =head1 VERSION
10              
11             Version 5.52
12              
13             =head1 DESCRIPTION
14              
15             GeneDesign is a library for the computer-assisted design of synthetic genes
16              
17             =head1 AUTHOR
18              
19             Sarah Richardson <SMRichardson@lbl.gov>
20              
21             =cut
22              
23             package Bio::GeneDesign::Basic;
24             require Exporter;
25              
26 11     11   9575 use POSIX qw(log10);
  11         84596  
  11         74  
27 11     11   13139 use Carp;
  11         25  
  11         610  
28              
29 11     11   59 use strict;
  11         19  
  11         360  
30 11     11   163 use warnings;
  11         21  
  11         540  
31              
32             our $VERSION = 5.52;
33              
34 11     11   55 use base qw(Exporter);
  11         18  
  11         25763  
35             our @EXPORT_OK = qw(
36             _sanitize
37             _count
38             _ntherm
39             _complement
40             _melt
41             _regres
42             _regarr
43             _positions
44             _is_ambiguous
45             _compare_sequences
46             _amb_transcription
47             _add_arr
48             $ambnt
49             @BASES
50             %NTIDES
51             %AA_NAMES
52             %AACIDS
53             );
54              
55             our %EXPORT_TAGS = (GD=> \@EXPORT_OK);
56              
57             =head1 Definitions
58              
59             =cut
60              
61             our @BASES = qw(A T C G);
62              
63             our %NTIDES = (
64             A => [qw(A)], T => [qw(T)], C => [qw(C)], G => [qw(G)],
65             R => [qw(A G)], Y => [qw(C T)], W => [qw(A T)], S => [qw(C G)],
66             K => [qw(G T)], M => [qw(A C)], B => [qw(C G T)], D => [qw(A G T)],
67             H => [qw(A C T)], V => [qw(A C G)], N => [qw(A C G T)]
68             );
69              
70             my %REGHSH = (
71             A => "A", T => "T",
72             C => "C", G => "G",
73             R => "[AGR]", Y => "[CTY]",
74             W => "[ATW]", S => "[CGS]",
75             K => "[GKT]", M => "[ACM]",
76             B => "[BCGKSTY]", D => "[ADGKRTW]",
77             H => "[ACHMTWY]", V => "[ACGMRSV]",
78             N => "[ABCDGHKMNRSTVWY]"
79             );
80              
81             our $ambnt = qr/[R Y W S K M B D H V N]+/x;
82              
83             our %AACIDS = map { $_ => $_ } qw(A C D E F G H I K L M N P Q R S T V W Y);
84             $AACIDS{"*"} = "[\*]";
85              
86             our %AA_NAMES = (
87             A => "Ala", B => "Unk", C => "Cys", D => "Asp", E => "Glu", F => "Phe",
88             G => "Gly", H => "His", I => "Ile", J => "Unk", K => "Lys", L => "Leu",
89             M => "Met", N => "Asn", O => "Unk", P => "Pro", Q => "Gln", R => "Arg",
90             S => "Ser", T => "Thr", U => "Unk", V => "Val", W => "Trp", X => "Unk",
91             Y => "Tyr", Z => "Unk","*" => "Stp");
92              
93             # entropy, enthalpy, and free energy of paired bases
94             my %EEG = (
95             "TC" => ([ 8.8, 23.5, 1.5]), "GA" => ([ 8.8, 23.5, 1.5]),
96             "CT" => ([ 6.6, 16.4, 1.5]), "AG" => ([ 6.6, 16.4, 1.5]),
97             "GG" => ([10.9, 28.4, 2.1]), "CC" => ([10.9, 28.4, 2.1]),
98             "AA" => ([ 8.0, 21.9, 1.2]), "TT" => ([ 8.0, 21.9, 1.2]),
99             "AT" => ([ 5.6, 15.2, 0.9]), "TA" => ([ 6.6, 18.4, 0.9]),
100             "CG" => ([11.8, 29.0, 2.8]), "GC" => ([10.5, 26.4, 2.3]),
101             "CA" => ([ 8.2, 21.0, 1.7]), "TG" => ([ 8.2, 21.0, 1.7]),
102             "GT" => ([ 9.4, 25.5, 1.5]), "AC" => ([ 9.4, 25.5, 1.5])
103             );
104              
105             =head1 Functions
106              
107             =head2 _sanitize()
108              
109             remove non nucleotide characters from nucleotide sequences. remove non amino
110             acid characters from peptide sequences. return sanitized strand and list of bad
111             characters.
112              
113             =cut
114              
115             sub _sanitize
116             {
117 20001     20001   30252 my ($strand, $swit) = @_;
118 20001   50     46224 $swit = $swit || 'nuc';
119 20001 50 33     101265 die ('bad sanitize switch') if ($swit ne 'nuc' && $swit ne 'pep');
120 20001         31868 my %noncount = ();
121 20001         24869 my $newstrand = q{};
122 20001         70251 my @arr = split q{}, $strand;
123 20001         37275 foreach my $char (@arr)
124             {
125 100199         119889 my $CH = uc $char;
126 100199 50 33     577710 if ($swit eq 'nuc' && exists $NTIDES{$CH}
      33        
      33        
127             || ($swit eq 'pep' && exists $AACIDS{$CH}))
128             {
129 100199         185934 $newstrand .= $char;
130             }
131             else
132             {
133 0         0 $noncount{$char}++;
134             }
135             }
136 20001         45909 my @baddies = keys %noncount;
137 20001         89678 return ($newstrand, @baddies);
138             }
139              
140             =head2 _count()
141              
142             takes a nucleotide sequence and returns a base count. Looks for total length,
143             purines, pyrimidines, and degenerate bases. If degenerate bases are present
144             assumes their substitution for non degenerate bases is totally random for
145             percentage estimation.
146              
147             in: nucleotide sequence (string),
148             out: base count (hash)
149              
150             =cut
151              
152             sub _count
153             {
154 5     5   8 my ($strand) = @_;
155 5         8 my $len = length $strand;
156 5         305 my @arr = split q{}, uc $strand;
157 5         55 my %C = map {$_ => 0} keys %NTIDES;
  75         117  
158            
159             #Count bases
160 5         370 $C{$_}++ foreach @arr;
161            
162 5         31 $C{d} += $C{$_} foreach (qw(A T C G));
163 5         39 $C{n} += $C{$_} foreach (qw(B D H K M N R S V W Y));
164 5         12 $C{'?'} = ($C{d} + $C{n}) - $len;
165            
166             #Estimate how many of each degenerate base would be a G or C
167 5         20 my $split = .5*$C{R} + .5*$C{Y} + .5*$C{K} + .5*$C{M} + .5*$C{N};
168 5         14 my $trip = (2*$C{B} / 3) + (2*$C{V} / 3) + ($C{D} / 3) + ($C{H} / 3);
169            
170             #Calculate GC/AT percentage
171 5         11 my $gcc = $C{S} + $C{G} + $C{C} + $split + $trip;
172 5         43 my $gcp = sprintf "%.1f", ($gcc / $len) * 100;
173 5         9 $C{GCp} = $gcp;
174 5         16 $C{ATp} = 100 - $gcp;
175 5         7 $C{len} = $len;
176            
177 5         77 return \%C;
178             }
179              
180             =head2 _melt()
181              
182             takes a nucleotide sequence and returns a melting temperature
183              
184             in: nucleotide sequence (string)
185             salt concentration (string, opt, def =.05),
186             oligo concentration (string, opt, def = .0000001)
187             out: temperature (float)
188              
189             =cut
190              
191             sub _melt
192             {
193 3     3   6 my ($strand, $salt, $conc) = @_;
194 3         7 my $C = _count($strand);
195 3         94 my $len = length($strand);
196 3   50     42 $salt = $salt || .05;
197 3   50     8 $conc = $conc || .0000001;
198 3         21 my $Naj = 16.6 * log10($salt);
199              
200 3         3 my $Tm;
201              
202 3 100 33     18 if ($len < 14)
    50          
    0          
203             {
204 1         4 $Tm = ((4 * ($C->{C} + $C->{G})) + (2 * ($C->{A} + $C->{T})));
205             }
206             elsif ($len >= 14 && $len <= 50)
207             {
208 2         7 $Tm = 100.5 + (41 * ($C->{G} + $C->{C}) / $len) - (820/$len) + $Naj;
209             }
210             elsif ($len > 50)
211             {
212 0         0 $Tm = 81.5 + (41 * ($C->{G} + $C->{C}) / $len) - (500/$len) + $Naj - .62;
213             }
214 3         27 return sprintf "%.1f", $Tm;
215             }
216              
217             =head2 _ntherm()
218              
219             takes a nucleotide sequence and returns a nearest neighbor melting temp.
220              
221             in: nucleotide sequence (string)
222             out: temperature (float)
223            
224             =cut
225              
226             sub _ntherm
227             {
228 1     1   3 my ($strand, $salt, $conc) = @_;
229 1         3 my ($dH, $dS, $dG) = (0, 0, 0);
230 1         7 foreach my $w (keys %EEG)
231             {
232 16         220 while ($strand =~ /(?=$w)/ixg)
233             {
234 32         60 $dH += $EEG{$w}->[0];
235 32         158 $dS += $EEG{$w}->[1];
236             #$dG += $EEG{$w}->[2];
237             }
238             }
239 1   50     5 $salt = $salt || .05;
240 1   50     5 $conc = $conc || .0000001;
241 1         8 my $Naj = 16.6 * log10($salt);
242 1         5 my $lc = 1.987 * abs( log( $conc / 2 ) );
243 1         4 my $Tm = ( ( ($dH - 3.4) / ( ($dS + $lc) / 1000) ) - 273.15) + $Naj;
244 1         15 return sprintf "%.1f", $Tm;
245             }
246              
247             =head2 _complement()
248              
249             takes a nucleotide sequence and returns its complement or reverse complement.
250              
251             in: nucleotide sequence (string),
252             switch for reverse complement (1 or null)
253             out: nucleotide sequence (string)
254              
255             =cut
256              
257             sub _complement
258             {
259 1185     1185   2576 my ($strand, $swit) = @_;
260 1185 100       3533 $strand = scalar reverse($strand) if ($swit);
261 1185         2472 $strand =~ tr/AaCcGgTtRrYyKkMmBbDdHhVv/TtGgCcAaYyRrMmKkVvHhDdBb/;
262 1185         5192 return $strand;
263             }
264              
265             =head2 _regres()
266              
267             takes a sequence that may be degenerate and returns a string that is prepped
268             for use in a regular expression.
269            
270             in: sequence (string),
271             switch for aa or nt sequence (1 or null)
272             out: regexp string (string)
273              
274             =cut
275              
276             sub _regres
277             {
278 1052     1052   1964 my ($sequence, $swit) = @_;
279 1052   100     2350 $swit = $swit || 1;
280 1052         1258 my $comp = q{};
281 1052         6632 my @arr = split('', uc $sequence);
282 1052         2755 foreach my $char (@arr)
283             {
284 6521 100       9811 if ($swit == 1)
    50          
285             {
286 6497         9523 my $ntide = $REGHSH{$char};
287 6497 50       11376 croak ("$char is not a nucleotide\n") unless $ntide;
288 6497         11787 $comp .= $ntide;
289             }
290             elsif ($swit == 2)
291             {
292 24         31 my $aa = $AACIDS{$char};
293 24 50       38 croak ("$char is not an amino acid \n") unless $aa;
294 24         28 $comp .= $aa;
295             }
296             }
297 1052         17809 return qr/$comp/x;
298             }
299              
300             =head2 regarr()
301              
302             =cut
303              
304             sub _regarr
305             {
306 772     772   1635 my ($sequence) = @_;
307 772         1505 my $regex = [_regres($sequence, 1)];
308 772         1895 my $comp = _complement($sequence, 1);
309 772 100       2431 if ($comp ne $sequence)
310             {
311 201         1496 push @$regex, _regres($comp, 1);
312             }
313 772         3135 return $regex;
314             }
315              
316             =head2 _positions()
317              
318             =cut
319              
320             sub _positions
321             {
322 5     5   11 my ($seq, $regarr) = @_;
323 5         15 my $temphash = {};
324 5         12 foreach my $sit (@$regarr)
325             {
326 8         203 while ($seq =~ /(?=($sit))/ixg)
327             {
328 7         110 $temphash->{pos $seq} = $1;
329             }
330             }
331 5         26 return $temphash;
332             }
333              
334             =head2 _is_ambiguous
335              
336             =cut
337              
338             sub _is_ambiguous
339             {
340 5     5   9 my ($sequence) = @_;
341 5 100       34 return 1 if ($sequence =~ $ambnt);
342 4         14 return 0;
343             }
344              
345             =head2 _compare_sequences()
346              
347             takes two nucleotide sequences that are assumed to be perfectly aligned and
348             roughly equivalent and returns similarity metrics. should be twweeaakkeedd
349            
350             in: 2x nucleotide sequence (string)
351             out: similarity metrics (hash)
352              
353             =cut
354              
355             sub _compare_sequences
356             {
357 362     362   646 my ($topseq, $botseq) = @_;
358 362 50 33     1743 return if (!$botseq || length($botseq) != length($topseq));
359 362         661 my ($tsit, $tver, $len) = (0, 0, length($topseq));
360 362         828 while (length($topseq) > 0)
361             {
362 1794         3447 my ($topbit, $botbit) = (chop($topseq), chop ($botseq));
363 1794 100       5029 if ($topbit ne $botbit)
364             {
365 586 100       2086 $topbit = $topbit =~ $REGHSH{R} ? 1 : 0;
366 586 100       1789 $botbit = $botbit =~ $REGHSH{R} ? 1 : 0;
367 586 100       1836 $tsit++ if ($topbit == $botbit);
368 586 100       2022 $tver++ if ($topbit != $botbit);
369             }
370             }
371 362         420 my %A;
372 362         752 $A{D} = $tsit + $tver; #changes
373 362         677 $A{I} = $len - $A{D}; #identities
374 362         507 $A{T} = $tsit; #transitions
375 362         556 $A{V} = $tver; #transversions
376 362         2369 $A{P} = sprintf "%.1f", 100 - (($A{D} / $len) * 100); #percent identity
377 362         2385 return \%A;
378             }
379              
380             =head2 _amb_transcription
381              
382             =cut
383              
384             sub _amb_transcription
385             {
386 10     10   24 my ($ntseq) = @_;
387 10         28 my $prseq = uc $ntseq;
388 10         25 my (@SEED, @NEW) = ((), ());
389 10         16 my $offset = 0;
390 10         20 my $ntlen = length($prseq);
391 10         37 while ($offset < $ntlen)
392             {
393 47         121 my $template = substr($prseq, $offset, 1);
394 47         111 my $ntides = $NTIDES{$template};
395 47         63 my $possibilities = scalar(@$ntides);
396 47 100       94 if ($possibilities == 1)
397             {
398 44 100       111 @SEED = ( $template ) if ($offset == 0);
399 44 100       148 @NEW = ( $template ) if ($offset > 0);
400             }
401             else
402             {
403 3 50       10 @SEED = @$ntides if ($offset == 0);
404 3 50       21 @NEW = @$ntides if ($offset > 0);
405             }
406 47 100       161 unless ($offset == 0)
407             {
408 37         84 @SEED = _add_arr(\@SEED, \@NEW);
409             }
410 47         185 $offset ++;
411             }
412 10         23 my %hsh = map {$_ => 1 } @SEED;
  19         73  
413 10         52 my @keys = sort keys %hsh;
414 10         63 return \@keys;
415             }
416              
417              
418             =head2 _add_arr
419              
420             Basically builds a list of tree nodes for the amb_trans* functions.
421             in: 2 x peptide lists (array reference) out: combined list of peptides
422            
423             =cut
424              
425             sub _add_arr
426             {
427 75     75   123 my ($arr1ref, $arr2ref) = @_;
428 75         221 my @arr1 = @$arr1ref;
429 75         142 my @arr2 = @$arr2ref;
430 75         107 my @arr3 = ();
431 75         129 foreach my $do (@arr1)
432             {
433 465         585 foreach my $re (@arr2)
434             {
435 1456         2859 push @arr3, $do . $re
436             }
437             }
438 75         1085 return @arr3;
439             }
440              
441             1;
442              
443             __END__
444              
445             =head1 COPYRIGHT AND LICENSE
446              
447             Copyright (c) 2013, GeneDesign developers
448             All rights reserved.
449              
450             Redistribution and use in source and binary forms, with or without modification,
451             are permitted provided that the following conditions are met:
452              
453             * Redistributions of source code must retain the above copyright notice, this
454             list of conditions and the following disclaimer.
455              
456             * Redistributions in binary form must reproduce the above copyright notice, this
457             list of conditions and the following disclaimer in the documentation and/or
458             other materials provided with the distribution.
459              
460             * The names of Johns Hopkins, the Joint Genome Institute, the Lawrence Berkeley
461             National Laboratory, the Department of Energy, and the GeneDesign developers may
462             not be used to endorse or promote products derived from this software without
463             specific prior written permission.
464              
465             THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
466             ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
467             WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
468             DISCLAIMED. IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT,
469             INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
470             LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
471             PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
472             LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
473             OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
474             ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
475              
476             =cut