File Coverage

blib/lib/Bio/GeneDesign/Basic.pm

Criterion	Covered	Total	%
statement	145	191	75.9
branch	37	54	68.5
condition	13	32	40.6
subroutine	17	21	80.9
pod			n/a
total	212	298	71.1

line	stmt	bran	cond	sub	time	code
1						#
2						# Basic GeneDesign libraries
3						#
4
5						=head1 NAME
6
7						Bio::GeneDesign::Basic
8
9						=head1 VERSION
10
11						Version 5.56
12
13						=head1 DESCRIPTION
14
15						GeneDesign is a library for the computer-assisted design of synthetic genes
16
17						=head1 AUTHOR
18
19						Sarah Richardson
20
21						=cut
22
23						package Bio::GeneDesign::Basic;
24						require Exporter;
25
26	11			11	73	use POSIX qw(log10);
	11				21
	11				92
27	11			11	16223	use Carp;
	11				20
	11				632
28
29	11			11	64	use strict;
	11				29
	11				286
30	11			11	61	use warnings;
	11				17
	11				499
31
32						our $VERSION = 5.56;
33
34	11			11	60	use base qw(Exporter);
	11				32
	11				25658
35						our @EXPORT_OK = qw(
36						_generate_dimers
37						_sanitize
38						_count
39						_gcwindows
40						_ntherm
41						_complement
42						_toRNA
43						_melt
44						_regres
45						_regarr
46						_positions
47						_find_runs
48						_is_ambiguous
49						_compare_sequences
50						_amb_transcription
51						_add_arr
52						$ambnt
53						@BASES
54						%NTIDES
55						%AA_NAMES
56						%AACIDS
57						);
58
59						our %EXPORT_TAGS = (GD=> \@EXPORT_OK);
60
61						=head1 Definitions
62
63						=cut
64
65						our @BASES = qw(A T C G);
66
67						our %NTIDES = (
68						A => [qw(A)], T => [qw(T)],
69						C => [qw(C)], G => [qw(G)],
70						R => [qw(A G)], Y => [qw(C T)],
71						W => [qw(A T)], S => [qw(C G)],
72						K => [qw(G T)], M => [qw(A C)],
73						B => [qw(C G T)], D => [qw(A G T)],
74						H => [qw(A C T)], V => [qw(A C G)],
75						N => [qw(A C G T)],
76						);
77
78						my %REGHSH = (
79						A => 'A', T => 'T',
80						C => 'C', G => 'G',
81						R => '[AGR]', Y => '[CTY]',
82						W => '[ATW]', S => '[CGS]',
83						K => '[GKT]', M => '[ACM]',
84						B => '[BCGKSTY]', D => '[ADGKRTW]',
85						H => '[ACHMTWY]', V => '[ACGMRSV]',
86						N => '[ABCDGHKMNRSTVWY]',
87						);
88
89						our $ambnt = qr/[R Y W S K M B D H V N]+/x;
90
91						our %AACIDS = map { $_ => $_ } qw(A C D E F G H I K L M N P Q R S T V W Y);
92						$AACIDS{""} = "[\]";
93
94						our %AA_NAMES = (
95						A => 'Ala', B => 'Unk', C => 'Cys', D => 'Asp', E => 'Glu', F => 'Phe',
96						G => 'Gly', H => 'His', I => 'Ile', J => 'Unk', K => 'Lys', L => 'Leu',
97						M => 'Met', N => 'Asn', O => 'Unk', P => 'Pro', Q => 'Gln', R => 'Arg',
98						S => 'Ser', T => 'Thr', U => 'Unk', V => 'Val', W => 'Trp', X => 'Unk',
99						Y => 'Tyr', Z => 'Unk', '*' => 'Stp');
100
101						# entropy, enthalpy, and free energy of paired bases
102						my %EEG = (
103						"TC" => ([ 8.8, 23.5, 1.5]), "GA" => ([ 8.8, 23.5, 1.5]),
104						"CT" => ([ 6.6, 16.4, 1.5]), "AG" => ([ 6.6, 16.4, 1.5]),
105						"GG" => ([10.9, 28.4, 2.1]), "CC" => ([10.9, 28.4, 2.1]),
106						"AA" => ([ 8.0, 21.9, 1.2]), "TT" => ([ 8.0, 21.9, 1.2]),
107						"AT" => ([ 5.6, 15.2, 0.9]), "TA" => ([ 6.6, 18.4, 0.9]),
108						"CG" => ([11.8, 29.0, 2.8]), "GC" => ([10.5, 26.4, 2.3]),
109						"CA" => ([ 8.2, 21.0, 1.7]), "TG" => ([ 8.2, 21.0, 1.7]),
110						"GT" => ([ 9.4, 25.5, 1.5]), "AC" => ([ 9.4, 25.5, 1.5])
111						);
112
113						=head1 Functions
114
115						=head2 _generate_dimers
116
117						=cut
118
119						sub _generate_dimers
120						{
121	0			0	0	my @dimers;
122	0				0	foreach my $a (@BASES)
123						{
124	0				0	foreach my $b (@BASES)
125						{
126	0				0	push @dimers, $a . $b;
127						}
128						}
129	0				0	return @dimers;
130						}
131
132						=head2 _sanitize()
133
134						remove non nucleotide characters from nucleotide sequences. remove non amino
135						acid characters from peptide sequences. return sanitized strand and list of bad
136						characters.
137
138						=cut
139
140						sub _sanitize
141						{
142	20001			20001	37359	my ($strand, $swit) = @_;
143	20001		50		40468	$swit = $swit \|\| 'nuc';
144	20001	50	33		72571	die ('bad sanitize switch') if ($swit ne 'nuc' && $swit ne 'pep');
145	20001				30732	my %noncount = ();
146	20001				27676	my $newstrand = q{};
147	20001				69752	my @arr = split q{}, $strand;
148	20001				32444	foreach my $char (@arr)
149						{
150	100199				131202	my $CH = uc $char;
151	100199	50	33		386967	if ($swit eq 'nuc' && exists $NTIDES{$CH}
			33
			33
152						\|\| ($swit eq 'pep' && exists $AACIDS{$CH}))
153						{
154	100199				157276	$newstrand .= $char;
155						}
156						else
157						{
158	0				0	$noncount{$char}++;
159						}
160						}
161	20001				36307	my @baddies = keys %noncount;
162	20001				68088	return ($newstrand, @baddies);
163						}
164
165						=head2 _count()
166
167						takes a nucleotide sequence and returns a base count. Looks for total length,
168						purines, pyrimidines, and degenerate bases. If degenerate bases are present
169						assumes their substitution for non degenerate bases is totally random for
170						percentage estimation.
171
172						in: nucleotide sequence (string),
173						out: base count (hash)
174
175						=cut
176
177						sub _count
178						{
179	5			5	8	my ($strand) = @_;
180	5				9	my $len = length $strand;
181	5				137	my @arr = split q{}, uc $strand;
182	5				24	my %C = map {$_ => 0} keys %NTIDES;
	75				116
183
184						#Count bases
185	5				196	$C{$_}++ foreach @arr;
186
187	5				29	$C{d} += $C{$_} foreach (qw(A T C G));
188	5				25	$C{n} += $C{$_} foreach (qw(B D H K M N R S V W Y));
189	5				10	$C{'?'} = ($C{d} + $C{n}) - $len;
190
191						#Estimate how many of each degenerate base would be a G or C
192	5				19	my $split = .5$C{R} + .5$C{Y} + .5$C{K} + .5$C{M} + .5*$C{N};
193	5				14	my $trip = (2$C{B} / 3) + (2$C{V} / 3) + ($C{D} / 3) + ($C{H} / 3);
194
195						#Calculate GC/AT percentage
196	5				10	my $gcc = $C{S} + $C{G} + $C{C} + $split + $trip;
197	5				39	my $gcp = sprintf "%.1f", ($gcc / $len) * 100;
198	5				9	$C{GCp} = $gcp;
199	5				16	$C{ATp} = 100 - $gcp;
200	5				7	$C{len} = $len;
201
202	5				56	return \%C;
203						}
204
205						=head2 _gcwindows()
206
207						takes a nucleotide sequence, a window size, and minimum and maximum values.
208						returns lists of real coordinates of subsequences that violate mimimum or
209						maximum GC percentages.
210
211						Values are returned inside an array reference such that the first value is an
212						array ref of minimum violators (as array refs of left/right coordinates), and
213						the second value is an array ref of maximum violators.
214
215						$return_value = [
216						[[left, right], [left, right]], #minimum violators
217						[[left, right], [left, right]] #maximum violators
218						];
219
220						=cut
221
222						sub _gcwindows
223						{
224	0			0	0	my ($seq, $winsize, $min, $max) = @_;
225	0				0	my @minflags = ();
226	0				0	my @maxflags = ();
227	0				0	my $seqlen = length $seq;
228	0				0	my $lefpos = 0;
229	0				0	my $rigpos = $winsize - 1;
230	0				0	my $win = substr $seq, $lefpos, $winsize;
231	0				0	my %hsh = (G => 0, C => 0, A => 0, T => 0);
232	0				0	$hsh{$_}++ foreach (split q{}, $win);
233	0				0	my $initbase = substr $seq, $lefpos, 1;
234	0				0	my $lastbase = substr $seq, $rigpos, 1;
235	0				0	while ($rigpos < $seqlen)
236						{
237	0				0	my $GCperc = 100 * (($hsh{G} + $hsh{C}) / $winsize);
238	0				0	$lefpos++;
239	0				0	$rigpos++;
240	0	0			0	if ($GCperc < $min)
241						{
242	0				0	push @minflags, [$lefpos, $rigpos];
243						}
244	0	0			0	if ($GCperc > $max)
245						{
246	0				0	push @maxflags, [$lefpos, $rigpos];
247						}
248	0				0	$hsh{$initbase}--;
249	0				0	$initbase = substr $seq, $lefpos, 1;
250	0				0	$lastbase = substr $seq, $rigpos, 1;
251	0				0	$hsh{$lastbase}++;
252						}
253
254	0				0	return [\@minflags, \@maxflags];
255						}
256
257						=head2 _melt()
258
259						takes a nucleotide sequence and returns a melting temperature
260
261						in: nucleotide sequence (string)
262						salt concentration (string, opt, def =.05),
263						oligo concentration (string, opt, def = .0000001)
264						out: temperature (float)
265
266						=cut
267
268						sub _melt
269						{
270	3			3	7	my ($strand, $salt, $conc) = @_;
271	3				5	my $C = _count($strand);
272	3				29	my $len = length($strand);
273	3		50		8	$salt = $salt \|\| .05;
274	3		50		6	$conc = $conc \|\| .0000001;
275	3				16	my $Naj = 16.6 * log10($salt);
276
277	3				5	my $Tm;
278
279	3	100	33		13	if ($len < 14)
		50
		0
280						{
281	1				4	$Tm = ((4 * ($C->{C} + $C->{G})) + (2 * ($C->{A} + $C->{T})));
282						}
283						elsif ($len >= 14 && $len <= 50)
284						{
285	2				7	$Tm = 100.5 + (41 * ($C->{G} + $C->{C}) / $len) - (820/$len) + $Naj;
286						}
287						elsif ($len > 50)
288						{
289	0				0	$Tm = 81.5 + (41 * ($C->{G} + $C->{C}) / $len) - (500/$len) + $Naj - .62;
290						}
291	3				24	return sprintf "%.1f", $Tm;
292						}
293
294						=head2 _ntherm()
295
296						takes a nucleotide sequence and returns a nearest neighbor melting temp.
297
298						in: nucleotide sequence (string)
299						out: temperature (float)
300
301						=cut
302
303						sub _ntherm
304						{
305	1			1	3	my ($strand, $salt, $conc) = @_;
306	1				3	my ($dH, $dS, $dG) = (0, 0, 0);
307	1				6	foreach my $w (keys %EEG)
308						{
309	16				104	while ($strand =~ /(?=$w)/ixg)
310						{
311	32				44	$dH += $EEG{$w}->[0];
312	32				90	$dS += $EEG{$w}->[1];
313						#$dG += $EEG{$w}->[2];
314						}
315						}
316	1		50		6	$salt = $salt \|\| .05;
317	1		50		4	$conc = $conc \|\| .0000001;
318	1				5	my $Naj = 16.6 * log10($salt);
319	1				4	my $lc = 1.987 * abs( log( $conc / 2 ) );
320	1				4	my $Tm = ( ( ($dH - 3.4) / ( ($dS + $lc) / 1000) ) - 273.15) + $Naj;
321	1				10	return sprintf "%.1f", $Tm;
322						}
323
324						=head2 _complement()
325
326						takes a nucleotide sequence and returns its complement or reverse complement.
327
328						in: nucleotide sequence (string),
329						switch for reverse complement (1 or null)
330						out: nucleotide sequence (string)
331
332						=cut
333
334						sub _complement
335						{
336	1190			1190	2692	my ($strand, $swit) = @_;
337	1190	100			2964	$strand = scalar reverse($strand) if ($swit);
338	1190				2261	$strand =~ tr/AaCcGgTtRrYyKkMmBbDdHhVv/TtGgCcAaYyRrMmKkVvHhDdBb/;
339	1190				3671	return $strand;
340						}
341
342						=head2 _toRNA()
343
344						=cut
345
346						sub _toRNA
347						{
348	0			0	0	my ($strand) = @_;
349	0				0	$strand =~ tr/Tt/Uu/;
350	0				0	return $strand;
351						}
352
353						=head2 _regres()
354
355						takes a sequence that may be degenerate and returns a string that is prepped
356						for use in a regular expression.
357
358						in: sequence (string),
359						switch for aa or nt sequence (1 or null)
360						out: regexp string (string)
361
362						=cut
363
364						sub _regres
365						{
366	1052			1052	1734	my ($sequence, $swit) = @_;
367	1052		100		1889	$swit = $swit \|\| 1;
368	1052				1308	my $comp = q{};
369	1052				3809	my @arr = split('', uc $sequence);
370	1052				1899	foreach my $char (@arr)
371						{
372	6529	100			8635	if ($swit == 1)
		50
373						{
374	6505				8194	my $ntide = $REGHSH{$char};
375	6505	50			9145	croak ("$char is not a nucleotide\n") unless $ntide;
376	6505				8927	$comp .= $ntide;
377						}
378						elsif ($swit == 2)
379						{
380	24				31	my $aa = $AACIDS{$char};
381	24	50			31	croak ("$char is not an amino acid \n") unless $aa;
382	24				27	$comp .= $aa;
383						}
384						}
385	1052				19268	return qr/$comp/ix;
386						}
387
388						=head2 regarr()
389
390						=cut
391
392						sub _regarr
393						{
394	773			773	1349	my ($sequence) = @_;
395	773				1567	my $regex = [_regres($sequence, 1)];
396	773				1968	my $comp = _complement($sequence, 1);
397	773	100			1942	if ($comp ne $sequence)
398						{
399	202				379	push @$regex, _regres($comp, 1);
400						}
401	773				2272	return $regex;
402						}
403
404						=head2 _positions()
405
406						=cut
407
408						sub _positions
409						{
410	6			6	14	my ($seq, $regarr) = @_;
411	6				14	my $temphash = {};
412	6				11	foreach my $sit (@$regarr)
413						{
414	10				156	while ($seq =~ /(?=($sit))/ixg)
415						{
416	8				69	$temphash->{pos $seq} = $1;
417						}
418						}
419	6				25	return $temphash;
420						}
421
422						=head2 _find_runs
423
424						=cut
425
426						sub _find_runs
427						{
428	0			0	0	my ($seq, $pattern, $min) = @_;
429	0				0	my @arr = ();
430	0		0		0	$min = $min \|\| 5;
431	0	0			0	if (_is_ambiguous($pattern))
432						{
433	0				0	die "Ambiguous pattern passed to _find_runs";
434						}
435	0				0	my $reg = '[' . $pattern . ']';
436	0				0	while ($seq =~ m{(($pattern) {$min,})}msixg)
437						{
438	0				0	my $len = length $1;
439	0				0	my $rig = pos $seq;
440	0				0	my $lef = $rig - $len + 1;
441	0				0	push @arr, [$lef, $rig];
442						}
443	0				0	return \@arr;
444						}
445
446						=head2 _is_ambiguous
447
448						=cut
449
450						sub _is_ambiguous
451						{
452	5			5	10	my ($sequence) = @_;
453	5	100			29	return 1 if ($sequence =~ $ambnt);
454	4				10	return 0;
455						}
456
457						=head2 _compare_sequences()
458
459						takes two nucleotide sequences that are assumed to be perfectly aligned and
460						roughly equivalent and returns similarity metrics. should be twweeaakkeedd
461
462						in: 2x nucleotide sequence (string)
463						out: similarity metrics (hash)
464
465						=cut
466
467						sub _compare_sequences
468						{
469	433			433	630	my ($topseq, $botseq) = @_;
470	433	50	33		1094	return if (!$botseq \|\| length($botseq) != length($topseq));
471	433				590	my ($tsit, $tver, $len) = (0, 0, length($topseq));
472	433				618	while (length($topseq) > 0)
473						{
474	2504				3694	my ($topbit, $botbit) = (chop($topseq), chop ($botseq));
475	2504	100			4036	if ($topbit ne $botbit)
476						{
477	814	100			1734	$topbit = $topbit =~ $REGHSH{R} ? 1 : 0;
478	814	100			1477	$botbit = $botbit =~ $REGHSH{R} ? 1 : 0;
479	814	100			1127	$tsit++ if ($topbit == $botbit);
480	814	100			1476	$tver++ if ($topbit != $botbit);
481						}
482						}
483	433				463	my %A;
484	433				602	$A{D} = $tsit + $tver; #changes
485	433				565	$A{I} = $len - $A{D}; #identities
486	433				454	$A{T} = $tsit; #transitions
487	433				456	$A{V} = $tver; #transversions
488	433				1668	$A{P} = sprintf "%.1f", 100 - (($A{D} / $len) * 100); #percent identity
489	433				908	return \%A;
490						}
491
492						=head2 _amb_transcription
493
494						=cut
495
496						sub _amb_transcription
497						{
498	10			10	17	my ($ntseq) = @_;
499	10				21	my $prseq = uc $ntseq;
500	10				16	my (@SEED, @NEW) = ((), ());
501	10				14	my $offset = 0;
502	10				17	my $ntlen = length $prseq;
503	10				25	while ($offset < $ntlen)
504						{
505	47				93	my $template = substr $prseq, $offset, 1;
506	47				67	my $ntides = $NTIDES{$template};
507	47				56	my $possibilities = scalar @$ntides;
508	47	100			65	if ($possibilities == 1)
509						{
510	44	100			80	@SEED = ( $template ) if ($offset == 0);
511	44	100			96	@NEW = ( $template ) if ($offset > 0);
512						}
513						else
514						{
515	3	50			10	@SEED = @$ntides if ($offset == 0);
516	3	50			10	@NEW = @$ntides if ($offset > 0);
517						}
518	47	100			84	unless ($offset == 0)
519						{
520	37				53	@SEED = _add_arr(\@SEED, \@NEW);
521						}
522	47				111	$offset ++;
523						}
524	10				16	my %hsh = map {$_ => 1 } @SEED;
	19				50
525	10				38	my @keys = sort keys %hsh;
526	10				37	return \@keys;
527						}
528
529
530						=head2 _add_arr
531
532						Basically builds a list of tree nodes for the amb_trans* functions.
533						in: 2 x peptide lists (array reference) out: combined list of peptides
534
535						=cut
536
537						sub _add_arr
538						{
539	75			75	117	my ($arr1ref, $arr2ref) = @_;
540	75				156	my @arr1 = @$arr1ref;
541	75				122	my @arr2 = @$arr2ref;
542	75				90	my @arr3 = ();
543	75				107	foreach my $do (@arr1)
544						{
545	465				605	foreach my $re (@arr2)
546						{
547	1456				2331	push @arr3, $do . $re
548						}
549						}
550	75				573	return @arr3;
551						}
552
553						1;
554
555						__END__