File Coverage

blib/lib/BioUtil/Seq.pm

Criterion	Covered	Total	%
statement	73	187	39.0
branch	19	82	23.1
condition	2	6	33.3
subroutine	14	24	58.3
pod	15	17	88.2
total	123	316	38.9

line	stmt	bran	cond	sub	pod	time	code
1							package BioUtil::Seq;
2
3							require Exporter;
4							@ISA = (Exporter);
5							@EXPORT = qw(
6							FastaReader
7							read_sequence_from_fasta_file
8							write_sequence_to_fasta_file
9							format_seq
10
11							validate_sequence
12							complement
13							revcom
14							base_content
15							degenerate_seq_to_regexp
16							degenerate_seq_match_sites
17							dna2peptide
18							codon2aa
19							generate_random_seqence
20
21							shuffle_sequences
22							rename_fasta_header
23							clean_fasta_header
24							);
25
26	1			1		22828	use vars qw($VERSION);
	1					3
	1					73
27
28	1			1		33	use 5.010_000;
	1					3
	1					42
29	1			1		15	use strict;
	1					7
	1					58
30	1			1		6	use warnings FATAL => 'all';
	1					1
	1					69
31
32	1			1		6	use List::Util qw(shuffle);
	1					1
	1					3889
33
34							=head1 NAME
35
36							BioUtil::Seq - Utilities for sequence
37
38							Some great modules like BioPerl provide many robust solutions.
39							However, it is not easy to install for someone in some platforms.
40							And for some simple task scripts, a lite module may be a good choice.
41							So I reinvented some wheels and added some useful utilities into this module,
42							hoping it would be helpful.
43
44							=head1 VERSION
45
46							Version 2015.0309
47
48							=cut
49
50							our $VERSION = 2015.0309;
51
52							=head1 EXPORT
53
54							FastaReader
55							read_sequence_from_fasta_file
56							write_sequence_to_fasta_file
57							format_seq
58
59							validate_sequence
60							complement
61							revcom
62							base_content
63							degenerate_seq_to_regexp
64							degenerate_seq_match_sites
65							dna2peptide
66							codon2aa
67							generate_random_seqence
68
69							shuffle_sequences
70							rename_fasta_header
71							clean_fasta_header
72
73							=head1 SYNOPSIS
74
75							use BioUtil::Seq;
76
77
78							=head1 SUBROUTINES/METHODS
79
80
81							=head2 FastaReader
82
83							FastaReader is a fasta file parser using closure.
84							FastaReader returns an anonymous subroutine, when called, it
85							return a fasta record which is reference of an array
86							containing fasta header and sequence.
87
88							FastaReader could also read from STDIN when the file name is "STDIN" or "stdin".
89
90							A boolean argument is optional. If set as "true", spaces including blank, tab,
91							"return" ("\r") and "new line" ("\n") symbols in sequence will not be trimed.
92
93							FastaReader speeds up by utilizing the special Perl variable $/ (set to "\n>"),
94							with kind help of Mario Roy, author of MCE
95							(https://code.google.com/p/many-core-engine-perl/). A lot of optimizations were
96							also done by him.
97
98							Example:
99
100							# do not trim the spaces and \n
101							# $not_trim = 1;
102							# my $next_seq = FastaReader("test.fa", $not_trim);
103
104							# read from STDIN
105							# my $next_seq = FastaReader('STDIN');
106
107							# read from file
108							my $next_seq = FastaReader("test.fa");
109
110							while ( my $fa = &$next_seq() ) {
111							my ( $header, $seq ) = @$fa;
112
113							print ">$header\n$seq\n";
114							}
115
116							=cut
117
118							sub FastaReader {
119	2			2	1	3	my ( $file, $not_trim ) = @_;
120
121	2					4	my ( $open_flg, $finished ) = ( 0, 0 );
122	2					5	my ( $fh, $pos, $head ) = (undef) x 3;
123
124	2	50	33			21	if ( $file =~ /^STDIN$/i ) { # from stdin
		50
125	0					0	$fh = *STDIN;
126							}
127							elsif ( ref $file eq '' or ref $file eq 'SCALAR' ) { # from file
128	2	50				69	open $fh, '<', $file or die "fail to open file: $file!\n";
129	2					5	$open_flg = 1;
130							}
131							else { # glob, i.e. given file handler
132	0					0	$fh = $file;
133							}
134
135	2					13	local $/ = \1; ## read one byte
136	2					23	while (<$fh>) { ## until reaching ">"
137	2	50				7	last if $_ eq '>';
138							}
139							return sub {
140	6	50		6		12	return if $finished;
141
142	6					19	local $/ = "\n>"; ## set input record separator
143	6					31	while (<$fh>) {
144							## trim trailing ">", part of $/. faster than s/\r?\n>$//
145	4	100				15	substr( $_, -1, 1, '' ) if substr( $_, -1, 1 ) eq '>';
146
147							## extract header and sequence
148							# faster than ( $head, $seq ) = split( /\n/, $_, 2 );
149	4					10	$pos = index( $_, "\n" ) + 1;
150	4					9	$head = substr( $_, 0, $pos - 1 );
151
152							# $_ becomes sequence, to save memory
153							# $seq = substr( $_, $pos );
154	4					7	substr( $_, 0, $pos, '' );
155
156							## trim trailing "\r" in header
157	4	50				13	chop $head if substr( $head, -1, 1 ) eq "\r";
158
159	4	50				11	if ( length $head > 0 ) {
160
161							# faster than $seq =~ s/\s//g unless $not_trim;
162							# $seq =~ tr/\t\r\n //d unless $not_trim;
163	4	50				15	$_ =~ tr/\t\r\n //d unless $not_trim;
164	4					26	return [ $head, $_ ];
165							}
166							}
167
168	2	50				20	close $fh if $open_flg;
169	2					4	$finished = 1;
170	2					11	return;
171	2					21	};
172							}
173
174							sub FastaReader_old {
175	0			0	0	0	my ( $file, $not_trim ) = @_;
176
177	0					0	my ( $last_header, $seq_buffer ) = ( '', '' ); # buffer for header and seq
178	0					0	my ( $header, $seq ) = ( '', '' ); # current header and seq
179	0					0	my $finished = 0;
180
181	0					0	my ( $fh, $is_stdin ) = ( undef, 0 );
182	0	0				0	if ( $file =~ /^STDIN$/i ) {
183	0					0	( $fh, $is_stdin ) = ( *STDIN, 1 );
184							}
185							else {
186	0	0				0	open $fh, "<", $file or die "fail to open file: $file!\n";
187							}
188
189							return sub {
190	0	0		0		0	return undef if $finished; # end of file
191
192	0					0	while (<$fh>) {
193	0					0	s/^\s+//; # remove the space at the front of line
194
195	0	0				0	if (/^>(.*)/) { # header line
196	0					0	( $header, $last_header ) = ( $last_header, $1 );
197	0					0	( $seq, $seq_buffer ) = ( $seq_buffer, '' );
198
199							# only output fasta records with non-blank header
200	0	0				0	if ( $header ne '' ) {
201	0	0				0	$seq =~ s/\s+//g unless $not_trim;
202	0					0	return [ $header, $seq ];
203							}
204							}
205							else {
206	0					0	$seq_buffer .= $_; # append seq
207							}
208							}
209	0	0				0	close $fh unless $is_stdin;
210	0					0	$finished = 1;
211
212							# last record. only output fasta records with non-blank header
213	0	0				0	if ( $last_header ne '' ) {
214	0	0				0	$seq_buffer =~ s/\s+//g unless $not_trim;
215	0					0	return [ $last_header, $seq_buffer ];
216							}
217	0					0	};
218							}
219
220							=head2 read_sequence_from_fasta_file
221
222							Read all sequences from fasta file.
223
224							Example:
225
226							my $seqs = read_sequence_from_fasta_file($file);
227							for my $header (keys %$seqs) {
228							my $seq = $$seqs{$header};
229							print ">$header\n$seq\n";
230							}
231
232							=cut
233
234							sub read_sequence_from_fasta_file {
235	2			2	1	16	my ( $file, $not_trim ) = @_;
236	2					5	my $seqs = {};
237
238	2					10	my $next_seq = FastaReader( $file, $not_trim );
239	2					5	while ( my $fa = &$next_seq() ) {
240							# my ( $header, $seq ) = @$fa;
241							# $$seqs{$header} = $seq;
242	4					20	$$seqs{ $fa->[0] } = $fa->[1];
243							}
244
245	2					21	return $seqs;
246							}
247
248							=head2 write_sequence_to_fasta_file
249
250							Example:
251
252							my $seq = {"seq1" => "acgagaggag"};
253							write_sequence_to_fasta_file($seq, "seq.fa");
254
255							=cut
256
257							sub write_sequence_to_fasta_file {
258	1			1	1	464	my ( $seqs, $file, $n ) = @_;
259	1	50				6	unless ( ref $seqs eq 'HASH' ) {
260	0					0	warn "seqs should be reference of hash\n";
261	0					0	return 0;
262							}
263	1	50				4	$n = 70 unless defined $n;
264
265	1	50				118	open my $fh2, ">$file" or die "failed to write to $file\n";
266	1					6	for ( keys %$seqs ) {
267	2					11	print $fh2 ">$_\n", format_seq( $$seqs{$_}, $n ), "\n";
268							}
269	1					58	close $fh2;
270							}
271
272							=head2 format_seq
273
274							Format sequence to readable text
275
276							Example:
277
278							printf ">%s\n%s", $head, format_seq($seq, 60);
279
280							=cut
281
282							sub format_seq {
283	2			2	1	5	my ( $s, $n ) = @_;
284	2	50				7	$n = 70 unless defined $n;
285	2	50	33			25	unless ( $n =~ /^\d+$/ and $n > 0 ) {
286	0					0	warn "n should be positive integer\n";
287	0					0	return $s;
288							}
289
290	2					4	my $s2 = '';
291	2					3	my ( $j, $int );
292	2					8	$int = int( ( length $s ) / $n );
293	2					7	for ( $j = 0; $j <= $int; $j++ ) {
294	2					74	$s2 .= substr( $s, $j * $n, $n ) . "\n";
295							}
296	2					15	return $s2;
297							}
298
299							=head2 validate_sequence
300
301							Validate a sequence.
302
303							Legale symbols:
304
305							DNA: ACGTRYSWKMBDHV
306							RNA: ACGURYSWKMBDHV
307							Protein: ACDEFGHIKLMNPQRSTVWY
308							gap and space: - *.
309
310							Example:
311
312							if (validate_sequence($seq)) {
313							# do some thing
314							}
315
316							=cut
317
318							sub validate_sequence {
319	2			2	1	332	my ($seq) = @_;
320	2	100				13	return 0 if $seq =~ /[^\.\-\s_*ABCDEFGHIKLMNPQRSTUVWY]/i;
321	1					3	return 1;
322							}
323
324							=head2 complement
325
326							Complement sequence
327
328							IUPAC nucleotide code: ACGTURYSWKMBDHVN
329
330							http://droog.gs.washington.edu/parc/images/iupac.html
331
332							code base Complement
333							A A T
334							C C G
335							G G C
336							T/U T A
337
338							R A/G Y
339							Y C/T R
340							S C/G S
341							W A/T W
342							K G/T M
343							M A/C K
344
345							B C/G/T V
346							D A/G/T H
347							H A/C/T D
348							V A/C/G B
349
350							X/N A/C/G/T X
351							. not A/C/G/T
352							or- gap
353
354							my $comp = complement($seq);
355
356							=cut
357
358							sub complement {
359	1			1	1	23	my ($s) = @_;
360	1					3	$s
361							=~ tr/ACGTURYMKSWBDHVNacgturymkswbdhvn/TGCAAYRKMSWVHDBNtgcaayrkmswvhdbn/;
362	1					7	return $s;
363							}
364
365							=head2 revcom
366
367							Reverse complement sequence
368
369							my $recom = revcom($seq);
370
371							=cut
372
373							sub revcom {
374	1			1	1	5	return reverse complement( $_[0] );
375							}
376
377							=head2 base_content
378
379							Example:
380
381							my $gc_cotent = base_content('gc', $seq);
382
383							=cut
384
385							sub base_content {
386	1			1	1	3	my ( $bases, $seq ) = @_;
387	1	50				5	if ( $seq eq '' ) {
388	0					0	return 0;
389							}
390
391	1					2	my $sum = 0;
392	1					47	$sum += $seq =~ s/$_/$_/ig for split "", $bases;
393	1					32	return sprintf "%.4f", $sum / length $seq;
394							}
395
396							=head2 degenerate_seq_to_regexp
397
398							Translate degenerate sequence to regular expression
399
400							=cut
401
402							sub degenerate_seq_to_regexp {
403	0			0	1		my ($seq) = @_;
404	0						my %bases = (
405							'A' => 'A',
406							'T' => 'T',
407							'U' => 'U',
408							'C' => 'C',
409							'G' => 'G',
410							'R' => '[AG]',
411							'Y' => '[CT]',
412							'M' => '[AC]',
413							'K' => '[GT]',
414							'S' => '[CG]',
415							'W' => '[AT]',
416							'H' => '[ACT]',
417							'B' => '[CGT]',
418							'V' => '[ACG]',
419							'D' => '[AGT]',
420							'N' => '[ACGT]',
421							);
422	0						return join '', map { $bases{$_} } split //, $seq;
	0
423							}
424
425							=head2 degenerate_seq_match_sites
426
427							Find all sites matching degenerat subseq
428
429							=cut
430
431							sub degenerate_seq_match_sites {
432	0			0	1		my ( $r, $s ) = @_;
433
434							# original regexp length
435	0						my $r2 = $r;
436	0						$r2 =~ s/\[[^\[\]]+?\]/_/g;
437	0						my $len = length $r2;
438
439	0						my @sites = ();
440	0						my $pos = -1;
441	0						while ( $s =~ /($r)/ig ) {
442	0						$pos = pos $s;
443	0						push @sites, [ $pos - $len + 1, $pos, $1];
444	0						pos $s = $pos - $len + 1;
445							}
446	0						return \@sites;
447							}
448
449							=head2 dna2peptide
450
451							Translate DNA sequence into a peptide
452
453							=cut
454
455							sub dna2peptide {
456	0			0	1		my ($dna) = @_;
457	0						my $protein = '';
458
459							# Translate each three-base codon to an amino acid, and append to a protein
460	0						for ( my $i = 0; $i < ( length($dna) - 2 ); $i += 3 ) {
461	0						$protein .= codon2aa( substr( $dna, $i, 3 ) );
462							}
463	0						return $protein;
464							}
465
466							=head2 codon2aa
467
468							Translate a DNA 3-character codon to an amino acid
469
470							=cut
471
472							sub codon2aa {
473	0			0	1		my ($codon) = @_;
474	0						$codon = uc $codon;
475	0						my %genetic_code = (
476							'TCA' => 'S', # Serine
477							'TCC' => 'S', # Serine
478							'TCG' => 'S', # Serine
479							'TCT' => 'S', # Serine
480							'TTC' => 'F', # Phenylalanine
481							'TTT' => 'F', # Phenylalanine
482							'TTA' => 'L', # Leucine
483							'TTG' => 'L', # Leucine
484							'TAC' => 'Y', # Tyrosine
485							'TAT' => 'Y', # Tyrosine
486							'TAA' => '_', # Stop
487							'TAG' => '_', # Stop
488							'TGC' => 'C', # Cysteine
489							'TGT' => 'C', # Cysteine
490							'TGA' => '_', # Stop
491							'TGG' => 'W', # Tryptophan
492							'CTA' => 'L', # Leucine
493							'CTC' => 'L', # Leucine
494							'CTG' => 'L', # Leucine
495							'CTT' => 'L', # Leucine
496							'CCA' => 'P', # Proline
497							'CCC' => 'P', # Proline
498							'CCG' => 'P', # Proline
499							'CCT' => 'P', # Proline
500							'CAC' => 'H', # Histidine
501							'CAT' => 'H', # Histidine
502							'CAA' => 'Q', # Glutamine
503							'CAG' => 'Q', # Glutamine
504							'CGA' => 'R', # Arginine
505							'CGC' => 'R', # Arginine
506							'CGG' => 'R', # Arginine
507							'CGT' => 'R', # Arginine
508							'ATA' => 'I', # Isoleucine
509							'ATC' => 'I', # Isoleucine
510							'ATT' => 'I', # Isoleucine
511							'ATG' => 'M', # Methionine
512							'ACA' => 'T', # Threonine
513							'ACC' => 'T', # Threonine
514							'ACG' => 'T', # Threonine
515							'ACT' => 'T', # Threonine
516							'AAC' => 'N', # Asparagine
517							'AAT' => 'N', # Asparagine
518							'AAA' => 'K', # Lysine
519							'AAG' => 'K', # Lysine
520							'AGC' => 'S', # Serine
521							'AGT' => 'S', # Serine
522							'AGA' => 'R', # Arginine
523							'AGG' => 'R', # Arginine
524							'GTA' => 'V', # Valine
525							'GTC' => 'V', # Valine
526							'GTG' => 'V', # Valine
527							'GTT' => 'V', # Valine
528							'GCA' => 'A', # Alanine
529							'GCC' => 'A', # Alanine
530							'GCG' => 'A', # Alanine
531							'GCT' => 'A', # Alanine
532							'GAC' => 'D', # Aspartic Acid
533							'GAT' => 'D', # Aspartic Acid
534							'GAA' => 'E', # Glutamic Acid
535							'GAG' => 'E', # Glutamic Acid
536							'GGA' => 'G', # Glycine
537							'GGC' => 'G', # Glycine
538							'GGG' => 'G', # Glycine
539							'GGT' => 'G', # Glycine
540							);
541
542	0	0					if ( exists $genetic_code{$codon} ) {
543	0						return $genetic_code{$codon};
544							}
545							else {
546	0						print STDERR "Bad codon \"$codon\"!!\n";
547	0						exit;
548							}
549							}
550
551							=head2 generate_random_seqence
552
553							Example:
554
555							my @alphabet = qw/a c g t/;
556							my $seq = generate_random_seqence( \@alphabet, 50 );
557
558							=cut
559
560							sub generate_random_seqence {
561	0			0	1		my ( $alphabet, $length ) = @_;
562	0	0					unless ( ref $alphabet eq 'ARRAY' ) {
563	0						warn "alphabet should be ref of array\n";
564	0						return 0;
565							}
566
567	0						my $n = @$alphabet;
568	0						my $seq;
569	0						$seq .= $$alphabet[ int rand($n) ] for ( 1 .. $length );
570	0						return $seq;
571							}
572
573							=head2 shuffle sequences
574
575							Example:
576
577							shuffle_sequences($file, "$file.shuf.fa");
578
579							=cut
580
581							sub shuffle_sequences {
582	0			0	0		my ( $file, $file_out, $not_trim ) = @_;
583	0						my $seqs = read_sequence_from_fasta_file( $file, $not_trim );
584	0						my @keys = shuffle( keys %$seqs );
585
586	0	0					$file_out = "$file.shuffled.fa" unless defined $file_out;
587	0	0					open my $fh2, ">$file_out" or die "fail to write file $file_out\n";
588	0						print $fh2 ">$_\n$$seqs{$_}\n" for @keys;
589	0						close $fh2;
590
591	0						return $file_out;
592							}
593
594							=head2 rename_fasta_header
595
596							Rename fasta header with regexp.
597
598							Example:
599
600							# delete some symbols
601							my $n = rename_fasta_header('[^a-z\d\s\-\_\[\]\\|]', '', $file, "$file.rename.fa");
602							print "$n records renamed\n";
603
604							=cut
605
606							sub rename_fasta_header {
607	0			0	1		my ( $regex, $repalcement, $file, $outfile ) = @_;
608
609	0	0					open my $fh, "<", $file or die "fail to open file: $file\n";
610	0	0					open my $fh2, ">", $outfile or die "fail to wirte file: $outfile\n";
611
612	0						my $head = '';
613	0						my $n = 0;
614	0						while (<$fh>) {
615	0	0					if (/^\s>(.)\r?\n/) {
616	0						$head = $1;
617	0	0					if ( $head =~ /$regex/ ) {
618	0						$head =~ s/$regex/$repalcement/g;
619	0						$n++;
620							}
621	0						print $fh2 ">$head\n";
622							}
623							else {
624	0						print $fh2 $_;
625							}
626							}
627	0						close $fh;
628	0						close $fh2;
629
630	0						return $n;
631							}
632
633							=head2 clean_fasta_header
634
635							Rename given symbols to repalcement string.
636							Because, some symbols in fasta header will cause unexpected result.
637
638							Example:
639
640							my $file = "test.fa";
641							my $n = clean_fasta_header($file, "$file.rename.fa");
642							# replace any symbol in (\/:*?"<>\|) with '', i.e. deleting.
643							# my $n = clean_fasta_header($file, "$file.rename.fa", '', '\/:*?"<>\|');
644							print "$n records renamed\n";
645
646							=cut
647
648							sub clean_fasta_header {
649	0			0	1		my ( $file, $outfile, $replacement, $symbols ) = @_;
650	0	0					$replacement = "_" unless defined $replacement;
651
652	0						my @default = split //, '\/:*?"<>\|';
653	0	0					$symbols = \@default unless defined $symbols;
654	0	0					unless ( ref $symbols eq 'ARRAY' ) {
655	0						warn "symbols should be ref of array\n";
656	0						return 0;
657							}
658	0						my $re = join '', map { quotemeta $_ } @$symbols;
	0
659	0	0					open my $fh, "<", $file or die "fail to open file: $file\n";
660	0	0					open my $fh2, ">", $outfile or die "fail to wirte file: $outfile\n";
661
662	0						my $head = '';
663	0						my $n = 0;
664	0						while (<$fh>) {
665	0	0					if (/^\s>(.)\r?\n/) {
666	0						$head = $1;
667	0	0					if ( $head =~ /[$re]/ ) {
668	0						$head =~ s/[$re]/$replacement/g;
669	0						$n++;
670							}
671	0						print $fh2 ">$head\n";
672							}
673							else {
674	0						print $fh2 $_;
675							}
676							}
677	0						close $fh;
678	0						close $fh2;
679
680	0						return $n;
681							}
682
683							1;