File Coverage

blib/lib/Bio/GeneDesign/Basic.pm

Criterion	Covered	Total	%
statement	145	147	98.6
branch	37	48	77.0
condition	13	30	43.3
subroutine	17	17	100.0
pod			n/a
total	212	242	87.6

line	stmt	bran	cond	sub	time	code
1						#
2						# Basic GeneDesign libraries
3						#
4
5						=head1 NAME
6
7						Bio::GeneDesign::Basic
8
9						=head1 VERSION
10
11						Version 5.52
12
13						=head1 DESCRIPTION
14
15						GeneDesign is a library for the computer-assisted design of synthetic genes
16
17						=head1 AUTHOR
18
19						Sarah Richardson <SMRichardson@lbl.gov>
20
21						=cut
22
23						package Bio::GeneDesign::Basic;
24						require Exporter;
25
26	11			11	9575	use POSIX qw(log10);
	11				84596
	11				74
27	11			11	13139	use Carp;
	11				25
	11				610
28
29	11			11	59	use strict;
	11				19
	11				360
30	11			11	163	use warnings;
	11				21
	11				540
31
32						our $VERSION = 5.52;
33
34	11			11	55	use base qw(Exporter);
	11				18
	11				25763
35						our @EXPORT_OK = qw(
36						_sanitize
37						_count
38						_ntherm
39						_complement
40						_melt
41						_regres
42						_regarr
43						_positions
44						_is_ambiguous
45						_compare_sequences
46						_amb_transcription
47						_add_arr
48						$ambnt
49						@BASES
50						%NTIDES
51						%AA_NAMES
52						%AACIDS
53						);
54
55						our %EXPORT_TAGS = (GD=> \@EXPORT_OK);
56
57						=head1 Definitions
58
59						=cut
60
61						our @BASES = qw(A T C G);
62
63						our %NTIDES = (
64						A => [qw(A)], T => [qw(T)], C => [qw(C)], G => [qw(G)],
65						R => [qw(A G)], Y => [qw(C T)], W => [qw(A T)], S => [qw(C G)],
66						K => [qw(G T)], M => [qw(A C)], B => [qw(C G T)], D => [qw(A G T)],
67						H => [qw(A C T)], V => [qw(A C G)], N => [qw(A C G T)]
68						);
69
70						my %REGHSH = (
71						A => "A", T => "T",
72						C => "C", G => "G",
73						R => "[AGR]", Y => "[CTY]",
74						W => "[ATW]", S => "[CGS]",
75						K => "[GKT]", M => "[ACM]",
76						B => "[BCGKSTY]", D => "[ADGKRTW]",
77						H => "[ACHMTWY]", V => "[ACGMRSV]",
78						N => "[ABCDGHKMNRSTVWY]"
79						);
80
81						our $ambnt = qr/[R Y W S K M B D H V N]+/x;
82
83						our %AACIDS = map { $_ => $_ } qw(A C D E F G H I K L M N P Q R S T V W Y);
84						$AACIDS{""} = "[\]";
85
86						our %AA_NAMES = (
87						A => "Ala", B => "Unk", C => "Cys", D => "Asp", E => "Glu", F => "Phe",
88						G => "Gly", H => "His", I => "Ile", J => "Unk", K => "Lys", L => "Leu",
89						M => "Met", N => "Asn", O => "Unk", P => "Pro", Q => "Gln", R => "Arg",
90						S => "Ser", T => "Thr", U => "Unk", V => "Val", W => "Trp", X => "Unk",
91						Y => "Tyr", Z => "Unk","*" => "Stp");
92
93						# entropy, enthalpy, and free energy of paired bases
94						my %EEG = (
95						"TC" => ([ 8.8, 23.5, 1.5]), "GA" => ([ 8.8, 23.5, 1.5]),
96						"CT" => ([ 6.6, 16.4, 1.5]), "AG" => ([ 6.6, 16.4, 1.5]),
97						"GG" => ([10.9, 28.4, 2.1]), "CC" => ([10.9, 28.4, 2.1]),
98						"AA" => ([ 8.0, 21.9, 1.2]), "TT" => ([ 8.0, 21.9, 1.2]),
99						"AT" => ([ 5.6, 15.2, 0.9]), "TA" => ([ 6.6, 18.4, 0.9]),
100						"CG" => ([11.8, 29.0, 2.8]), "GC" => ([10.5, 26.4, 2.3]),
101						"CA" => ([ 8.2, 21.0, 1.7]), "TG" => ([ 8.2, 21.0, 1.7]),
102						"GT" => ([ 9.4, 25.5, 1.5]), "AC" => ([ 9.4, 25.5, 1.5])
103						);
104
105						=head1 Functions
106
107						=head2 _sanitize()
108
109						remove non nucleotide characters from nucleotide sequences. remove non amino
110						acid characters from peptide sequences. return sanitized strand and list of bad
111						characters.
112
113						=cut
114
115						sub _sanitize
116						{
117	20001			20001	30252	my ($strand, $swit) = @_;
118	20001		50		46224	$swit = $swit \|\| 'nuc';
119	20001	50	33		101265	die ('bad sanitize switch') if ($swit ne 'nuc' && $swit ne 'pep');
120	20001				31868	my %noncount = ();
121	20001				24869	my $newstrand = q{};
122	20001				70251	my @arr = split q{}, $strand;
123	20001				37275	foreach my $char (@arr)
124						{
125	100199				119889	my $CH = uc $char;
126	100199	50	33		577710	if ($swit eq 'nuc' && exists $NTIDES{$CH}
			33
			33
127						\|\| ($swit eq 'pep' && exists $AACIDS{$CH}))
128						{
129	100199				185934	$newstrand .= $char;
130						}
131						else
132						{
133	0				0	$noncount{$char}++;
134						}
135						}
136	20001				45909	my @baddies = keys %noncount;
137	20001				89678	return ($newstrand, @baddies);
138						}
139
140						=head2 _count()
141
142						takes a nucleotide sequence and returns a base count. Looks for total length,
143						purines, pyrimidines, and degenerate bases. If degenerate bases are present
144						assumes their substitution for non degenerate bases is totally random for
145						percentage estimation.
146
147						in: nucleotide sequence (string),
148						out: base count (hash)
149
150						=cut
151
152						sub _count
153						{
154	5			5	8	my ($strand) = @_;
155	5				8	my $len = length $strand;
156	5				305	my @arr = split q{}, uc $strand;
157	5				55	my %C = map {$_ => 0} keys %NTIDES;
	75				117
158
159						#Count bases
160	5				370	$C{$_}++ foreach @arr;
161
162	5				31	$C{d} += $C{$_} foreach (qw(A T C G));
163	5				39	$C{n} += $C{$_} foreach (qw(B D H K M N R S V W Y));
164	5				12	$C{'?'} = ($C{d} + $C{n}) - $len;
165
166						#Estimate how many of each degenerate base would be a G or C
167	5				20	my $split = .5$C{R} + .5$C{Y} + .5$C{K} + .5$C{M} + .5*$C{N};
168	5				14	my $trip = (2$C{B} / 3) + (2$C{V} / 3) + ($C{D} / 3) + ($C{H} / 3);
169
170						#Calculate GC/AT percentage
171	5				11	my $gcc = $C{S} + $C{G} + $C{C} + $split + $trip;
172	5				43	my $gcp = sprintf "%.1f", ($gcc / $len) * 100;
173	5				9	$C{GCp} = $gcp;
174	5				16	$C{ATp} = 100 - $gcp;
175	5				7	$C{len} = $len;
176
177	5				77	return \%C;
178						}
179
180						=head2 _melt()
181
182						takes a nucleotide sequence and returns a melting temperature
183
184						in: nucleotide sequence (string)
185						salt concentration (string, opt, def =.05),
186						oligo concentration (string, opt, def = .0000001)
187						out: temperature (float)
188
189						=cut
190
191						sub _melt
192						{
193	3			3	6	my ($strand, $salt, $conc) = @_;
194	3				7	my $C = _count($strand);
195	3				94	my $len = length($strand);
196	3		50		42	$salt = $salt \|\| .05;
197	3		50		8	$conc = $conc \|\| .0000001;
198	3				21	my $Naj = 16.6 * log10($salt);
199
200	3				3	my $Tm;
201
202	3	100	33		18	if ($len < 14)
		50
		0
203						{
204	1				4	$Tm = ((4 * ($C->{C} + $C->{G})) + (2 * ($C->{A} + $C->{T})));
205						}
206						elsif ($len >= 14 && $len <= 50)
207						{
208	2				7	$Tm = 100.5 + (41 * ($C->{G} + $C->{C}) / $len) - (820/$len) + $Naj;
209						}
210						elsif ($len > 50)
211						{
212	0				0	$Tm = 81.5 + (41 * ($C->{G} + $C->{C}) / $len) - (500/$len) + $Naj - .62;
213						}
214	3				27	return sprintf "%.1f", $Tm;
215						}
216
217						=head2 _ntherm()
218
219						takes a nucleotide sequence and returns a nearest neighbor melting temp.
220
221						in: nucleotide sequence (string)
222						out: temperature (float)
223
224						=cut
225
226						sub _ntherm
227						{
228	1			1	3	my ($strand, $salt, $conc) = @_;
229	1				3	my ($dH, $dS, $dG) = (0, 0, 0);
230	1				7	foreach my $w (keys %EEG)
231						{
232	16				220	while ($strand =~ /(?=$w)/ixg)
233						{
234	32				60	$dH += $EEG{$w}->[0];
235	32				158	$dS += $EEG{$w}->[1];
236						#$dG += $EEG{$w}->[2];
237						}
238						}
239	1		50		5	$salt = $salt \|\| .05;
240	1		50		5	$conc = $conc \|\| .0000001;
241	1				8	my $Naj = 16.6 * log10($salt);
242	1				5	my $lc = 1.987 * abs( log( $conc / 2 ) );
243	1				4	my $Tm = ( ( ($dH - 3.4) / ( ($dS + $lc) / 1000) ) - 273.15) + $Naj;
244	1				15	return sprintf "%.1f", $Tm;
245						}
246
247						=head2 _complement()
248
249						takes a nucleotide sequence and returns its complement or reverse complement.
250
251						in: nucleotide sequence (string),
252						switch for reverse complement (1 or null)
253						out: nucleotide sequence (string)
254
255						=cut
256
257						sub _complement
258						{
259	1185			1185	2576	my ($strand, $swit) = @_;
260	1185	100			3533	$strand = scalar reverse($strand) if ($swit);
261	1185				2472	$strand =~ tr/AaCcGgTtRrYyKkMmBbDdHhVv/TtGgCcAaYyRrMmKkVvHhDdBb/;
262	1185				5192	return $strand;
263						}
264
265						=head2 _regres()
266
267						takes a sequence that may be degenerate and returns a string that is prepped
268						for use in a regular expression.
269
270						in: sequence (string),
271						switch for aa or nt sequence (1 or null)
272						out: regexp string (string)
273
274						=cut
275
276						sub _regres
277						{
278	1052			1052	1964	my ($sequence, $swit) = @_;
279	1052		100		2350	$swit = $swit \|\| 1;
280	1052				1258	my $comp = q{};
281	1052				6632	my @arr = split('', uc $sequence);
282	1052				2755	foreach my $char (@arr)
283						{
284	6521	100			9811	if ($swit == 1)
		50
285						{
286	6497				9523	my $ntide = $REGHSH{$char};
287	6497	50			11376	croak ("$char is not a nucleotide\n") unless $ntide;
288	6497				11787	$comp .= $ntide;
289						}
290						elsif ($swit == 2)
291						{
292	24				31	my $aa = $AACIDS{$char};
293	24	50			38	croak ("$char is not an amino acid \n") unless $aa;
294	24				28	$comp .= $aa;
295						}
296						}
297	1052				17809	return qr/$comp/x;
298						}
299
300						=head2 regarr()
301
302						=cut
303
304						sub _regarr
305						{
306	772			772	1635	my ($sequence) = @_;
307	772				1505	my $regex = [_regres($sequence, 1)];
308	772				1895	my $comp = _complement($sequence, 1);
309	772	100			2431	if ($comp ne $sequence)
310						{
311	201				1496	push @$regex, _regres($comp, 1);
312						}
313	772				3135	return $regex;
314						}
315
316						=head2 _positions()
317
318						=cut
319
320						sub _positions
321						{
322	5			5	11	my ($seq, $regarr) = @_;
323	5				15	my $temphash = {};
324	5				12	foreach my $sit (@$regarr)
325						{
326	8				203	while ($seq =~ /(?=($sit))/ixg)
327						{
328	7				110	$temphash->{pos $seq} = $1;
329						}
330						}
331	5				26	return $temphash;
332						}
333
334						=head2 _is_ambiguous
335
336						=cut
337
338						sub _is_ambiguous
339						{
340	5			5	9	my ($sequence) = @_;
341	5	100			34	return 1 if ($sequence =~ $ambnt);
342	4				14	return 0;
343						}
344
345						=head2 _compare_sequences()
346
347						takes two nucleotide sequences that are assumed to be perfectly aligned and
348						roughly equivalent and returns similarity metrics. should be twweeaakkeedd
349
350						in: 2x nucleotide sequence (string)
351						out: similarity metrics (hash)
352
353						=cut
354
355						sub _compare_sequences
356						{
357	362			362	646	my ($topseq, $botseq) = @_;
358	362	50	33		1743	return if (!$botseq \|\| length($botseq) != length($topseq));
359	362				661	my ($tsit, $tver, $len) = (0, 0, length($topseq));
360	362				828	while (length($topseq) > 0)
361						{
362	1794				3447	my ($topbit, $botbit) = (chop($topseq), chop ($botseq));
363	1794	100			5029	if ($topbit ne $botbit)
364						{
365	586	100			2086	$topbit = $topbit =~ $REGHSH{R} ? 1 : 0;
366	586	100			1789	$botbit = $botbit =~ $REGHSH{R} ? 1 : 0;
367	586	100			1836	$tsit++ if ($topbit == $botbit);
368	586	100			2022	$tver++ if ($topbit != $botbit);
369						}
370						}
371	362				420	my %A;
372	362				752	$A{D} = $tsit + $tver; #changes
373	362				677	$A{I} = $len - $A{D}; #identities
374	362				507	$A{T} = $tsit; #transitions
375	362				556	$A{V} = $tver; #transversions
376	362				2369	$A{P} = sprintf "%.1f", 100 - (($A{D} / $len) * 100); #percent identity
377	362				2385	return \%A;
378						}
379
380						=head2 _amb_transcription
381
382						=cut
383
384						sub _amb_transcription
385						{
386	10			10	24	my ($ntseq) = @_;
387	10				28	my $prseq = uc $ntseq;
388	10				25	my (@SEED, @NEW) = ((), ());
389	10				16	my $offset = 0;
390	10				20	my $ntlen = length($prseq);
391	10				37	while ($offset < $ntlen)
392						{
393	47				121	my $template = substr($prseq, $offset, 1);
394	47				111	my $ntides = $NTIDES{$template};
395	47				63	my $possibilities = scalar(@$ntides);
396	47	100			94	if ($possibilities == 1)
397						{
398	44	100			111	@SEED = ( $template ) if ($offset == 0);
399	44	100			148	@NEW = ( $template ) if ($offset > 0);
400						}
401						else
402						{
403	3	50			10	@SEED = @$ntides if ($offset == 0);
404	3	50			21	@NEW = @$ntides if ($offset > 0);
405						}
406	47	100			161	unless ($offset == 0)
407						{
408	37				84	@SEED = _add_arr(\@SEED, \@NEW);
409						}
410	47				185	$offset ++;
411						}
412	10				23	my %hsh = map {$_ => 1 } @SEED;
	19				73
413	10				52	my @keys = sort keys %hsh;
414	10				63	return \@keys;
415						}
416
417
418						=head2 _add_arr
419
420						Basically builds a list of tree nodes for the amb_trans* functions.
421						in: 2 x peptide lists (array reference) out: combined list of peptides
422
423						=cut
424
425						sub _add_arr
426						{
427	75			75	123	my ($arr1ref, $arr2ref) = @_;
428	75				221	my @arr1 = @$arr1ref;
429	75				142	my @arr2 = @$arr2ref;
430	75				107	my @arr3 = ();
431	75				129	foreach my $do (@arr1)
432						{
433	465				585	foreach my $re (@arr2)
434						{
435	1456				2859	push @arr3, $do . $re
436						}
437						}
438	75				1085	return @arr3;
439						}
440
441						1;
442
443						__END__
444
445						=head1 COPYRIGHT AND LICENSE
446
447						Copyright (c) 2013, GeneDesign developers
448						All rights reserved.
449
450						Redistribution and use in source and binary forms, with or without modification,
451						are permitted provided that the following conditions are met:
452
453						* Redistributions of source code must retain the above copyright notice, this
454						list of conditions and the following disclaimer.
455
456						* Redistributions in binary form must reproduce the above copyright notice, this
457						list of conditions and the following disclaimer in the documentation and/or
458						other materials provided with the distribution.
459
460						* The names of Johns Hopkins, the Joint Genome Institute, the Lawrence Berkeley
461						National Laboratory, the Department of Energy, and the GeneDesign developers may
462						not be used to endorse or promote products derived from this software without
463						specific prior written permission.
464
465						THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
466						ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
467						WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
468						DISCLAIMED. IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT,
469						INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
470						LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
471						PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
472						LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
473						OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
474						ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
475
476						=cut