File Coverage

blib/lib/Bio/Util/DNA.pm

Criterion	Covered	Total	%
statement	12	46	26.0
branch	0	4	0.0
condition	0	2	0.0
subroutine	4	8	50.0
pod	4	4	100.0
total	20	64	31.2

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Bio::Util::DNA - Basic DNA utilities
4
5							=head1 SYNOPSES
6
7							use Bio::Util::DNA qw(:all);
8
9							my $clean_ref = cleanDNA($seq_ref);
10							my $seq_ref = randomDNA(100);
11							my $rev_ref = reverse_complement($seq_ref);
12
13							=head1 DESCRIPTION
14
15							Provides a set of functions and predefined variables which are handy when
16							working with DNA.
17
18							=cut
19
20							package Bio::Util::DNA;
21
22	1			1		31499	use strict;
	1					2
	1					36
23	1			1		5	use warnings;
	1					1
	1					26
24
25	1			1		1335	use version; our $VERSION = qv('0.2.1');
	1					2445
	1					7
26
27	1			1		88	use Exporter 'import';
	1					2
	1					1109
28
29							our %EXPORT_TAGS;
30							$EXPORT_TAGS{funcs} = [
31							qw(
32							cleanDNA
33							randomDNA
34							unrollDNA
35							reverse_complement
36							)
37							];
38							$EXPORT_TAGS{all} = [
39							@{ $EXPORT_TAGS{funcs} },
40							qw(
41							$DNAs
42							@DNAs
43							$DNA_match
44							$DNA_fail
45
46							$RNAs
47							@RNAs
48							$RNA_match
49							$RNA_fail
50
51							$degenerates
52							@degenerates
53							$degenerate_match
54							$degenerate_fail
55
56							$all_nucleotides
57							@all_nucleotides
58							$all_nucleotide_match
59							$all_nucleotide_fail
60
61							%degenerate2nucleotides
62							%nucleotides2degenerate
63							%degenerate_hierarchy
64							)
65							];
66
67							our @EXPORT_OK = @{ $EXPORT_TAGS{all} };
68
69							=head1 VARIABLES
70
71							=head2 BASIC VARIABLES
72
73							Basic nucleotide variables that could be useful. All of the variables have a
74							prefix and a suffix;
75
76							=head3 Prefixes
77
78							=over
79
80							=item DNA [ACGT]
81
82							=item RNA [ACGU]
83
84							=item degenerate
85
86							=item all_nucleotide
87
88							=back
89
90							=head3 Suffixes
91
92							=over
93
94							=item ${prefix}s
95
96							String of the different nucleotides
97
98							=item @{prefix}s
99
100							Array of the different nucleotides
101
102							=item ${prefix}_match
103
104							Precompiled regular expression which matches nucleotide characters
105
106							=item ${prefix}_fail
107
108							Precompiled regular expression which matches non-nucleotide characters
109
110							=back
111
112							=cut
113
114							our $DNAs = 'ACGT';
115							our @DNAs = split //, $DNAs;
116							our $DNA_match = qr/[$DNAs]/i;
117							our $DNA_fail = qr/[^$DNAs]/i;
118
119							our $RNAs = 'ACGU';
120							our @RNAs = split //, $RNAs;
121							our $RNA_match = qr/[$RNAs]/i;
122							our $RNA_fail = qr/[^$RNAs]/i;
123
124							our $degenerates = 'BDHKMNRSVWY';
125							our @degenerates = split //, $degenerates;
126							our $degenerate_match = qr/[$degenerates]/i;
127							our $degenerate_fail = qr/[^$degenerates]/i;
128
129							our $all_nucleotides = 'ACGTUBDHKMNRSVWY';
130							our @all_nucleotides = split //, $all_nucleotides;
131							our $all_nucleotide_match = qr/[$all_nucleotides]/i;
132							our $all_nucleotide_fail = qr/[^$all_nucleotides]/i;
133
134							=head2 %degenerate2nucleotides
135
136							Hash of degenerate nucleotide definitions. Each entry contains a reference to
137							an array of DNA nucleotides that each degenerate nucleotide stands for.
138
139							=cut
140
141							our %degenerate2nucleotides = (
142							N => [qw( A C G T )],
143							B => [qw( C G T )], # !A
144							D => [qw( A G T )], # !C
145							H => [qw( A C T )], # !G
146							V => [qw( A C G )], # !T
147							M => [qw( A C )], # aroMatic
148							R => [qw( A G )], # puRine
149							W => [qw( A T )],
150							S => [qw( C G )],
151							Y => [qw( C T )], # pYrimidine
152							K => [qw( G T )] # Keto
153							);
154
155							=head2 %nucleotides2degenerate
156
157							Reverse of %degenerate2nucleotides. Keys are alphabetically-sorted DNA
158							nucleotides and values are the degenerate nucleotide that can represent those
159							nucleotides.
160
161							=cut
162
163							our %nucleotides2degenerate = (
164							ACGT => 'N',
165							CGT => 'B',
166							AGT => 'D',
167							ACT => 'H',
168							ACG => 'V',
169							AC => 'M',
170							AG => 'R',
171							AT => 'W',
172							CG => 'S',
173							CT => 'Y',
174							GT => 'K'
175							);
176
177							=head2 %degenerate_hierarchy
178
179							Contains the heirarchy of degenerate nucleotides; N of course contains all the
180							other degenerates, and the four degenerates that can stand for three different
181							bases contain three of the two-base degenerates.
182
183							=cut
184
185							our %degenerate_hierarchy = (
186							N => [qw( M R W S Y K V H D B )],
187							B => [qw( S Y K )], # !A = [CG],[CT],[GT]
188							D => [qw( R W K )], # !C = [AT],[AG],[GT]
189							H => [qw( M W Y )], # !G = [AC],[AT],[CT]
190							V => [qw( M R S )] # !T = [AC],[AG],[CG]
191							);
192
193							=head1 FUNCTIONS
194
195							=head2 cleanDNA
196
197							my $clean_ref = cleanDNA($seq_ref);
198
199							Cleans the sequence for use. Strips out comments (lines starting with '>') and
200							whitespace, converts uracil to thymine, and capitalizes all characters.
201
202							Examples:
203
204							my $clean_ref = cleanDNA($seq_ref);
205
206							my $seq_ref = cleanDNA(\'actg');
207							my $seq_ref = cleanDNA(\'act tag cta');
208							my $seq_ref = cleanDNA(\'>some mRNA
209							acugauauagau
210							uauagacgaucc');
211
212							=cut
213
214							sub cleanDNA {
215	0			0	1		my $seq_ref = shift;
216
217	0						my $clean = uc $$seq_ref;
218	0						$clean =~ s/^>.*//m;
219	0						$clean =~ s/$all_nucleotide_fail+//g;
220	0						$clean =~ tr/U/T/;
221
222	0						return \$clean;
223							}
224
225							=head2 randomDNA
226
227							my $seq_ref = randomDNA($length);
228
229							Generate random DNA for testing this module or your own scripts. Default length
230							is 100 nucleotides.
231
232							Example:
233
234							my $seq_ref = randomDNA();
235							my $seq_ref = randomDNA(600);
236
237							=cut
238
239							sub randomDNA {
240	0			0	1		my $length = shift;
241	0		0				$length = $length \|\| 100;
242
243	0						my $seq;
244	0						$seq .= int rand 4 while ( $length-- > 0 );
245	0						$seq =~ tr/0123/ACGT/;
246
247	0						return \$seq;
248							}
249
250							=head2 reverse_complement
251
252							=head2 rev_comp
253
254							my $reverse_ref = reverse_complement($seq_ref);
255
256							Finds the reverse complement of the sequence and handles degenerate
257							nucleotides.
258
259							Example:
260
261							$reverse_ref = reverse_complement(\'act');
262
263							=cut
264
265							sub reverse_complement {
266	0			0	1		my $seq_ref = shift;
267
268	0						my $reverse = reverse $$seq_ref;
269	0						$reverse =~ tr/acgtmrykvhdbnACGTMRYKVHDBN/tgcakyrmbdhvnTGCAKYRMBDHVN/;
270
271	0						return \$reverse;
272							}
273
274							=head2 unrollDNA
275
276							my $seq_arrayref = unrollDNA( $seq_ref );
277
278							Unroll a DNA string containing degenerate nucleotides. The first entry of the
279							arrayref will be the actual sequence.
280
281							Example:
282
283							my $seq_arrayref = unrollDNA( \'ACSTAD' ) =
284							[
285							'ACSTAD', 'ACCTAD', 'ACGTAD',
286							'ACSTAR', 'ACCTAR', 'ACGTAR',
287							'ACSTAW', 'ACCTAW', 'ACGTAW',
288							'ACSTAK', 'ACCTAK', 'ACGTAK',
289							'ACSTAA', 'ACCTAA', 'ACGTAA',
290							'ACSTAG', 'ACCTAG', 'ACGTAG',
291							'ACSTAT', 'ACCTAT', 'ACGTAT'
292							];
293
294							=cut
295
296							sub unrollDNA {
297	0			0	1		my ($seq_ref) = @_;
298
299	0						my @nucleotides = map {
300	0						[
301							$_,
302							(
303	0						$degenerate_hierarchy{$_} ? @{ $degenerate_hierarchy{$_} }
304							: ()
305							),
306							(
307	0	0					$degenerate2nucleotides{$_} ? @{ $degenerate2nucleotides{$_} }
		0
308							: ()
309							),
310							]
311							} split //, uc $$seq_ref;
312
313	0						my @possibilities = ( [ (undef) x @nucleotides ] );
314
315	0						for ( my $i = 0 ; $i < @nucleotides ; $i++ ) {
316	0						my $variants = $nucleotides[$i];
317
318	0						push @possibilities, map {
319	0						map { [@$_] }
	0
320							@possibilities
321							} (undef) x $#$variants;
322
323	0						my $block_size = @possibilities / @$variants;
324	0						for ( my $j = 0 ; $j < @$variants ; $j++ ) {
325	0						my $variant = $variants->[$j];
326	0						for ( my $k = $block_size * $j; $k < $block_size * ($j + 1); $k++ ) {
327	0						$possibilities[$k][$i] = $variant;
328							}
329							}
330							}
331
332	0						return [ map { join( '', @$_ ) } @possibilities ];
	0
333							}
334
335							1;
336
337							=head1 AUTHOR
338
339							Kevin Galinsky,
340
341							=head1 COPYRIGHT AND LICENSE
342
343							Copyright (c) 2010-2011, Broad Institute.
344
345							Copyright (c) 2008-2009, J. Craig Venter Institute.
346
347							This program is free software; you can redistribute it and/or modify it
348							under the same terms as Perl itself.
349
350							=cut