File Coverage

blib/lib/BioUtil/Seq.pm

Criterion	Covered	Total	%
statement	73	184	39.6
branch	19	84	22.6
condition	2	6	33.3
subroutine	14	24	58.3
pod	15	17	88.2
total	123	315	39.0

line	stmt	bran	cond	sub	pod	time	code
1							package BioUtil::Seq;
2
3							require Exporter;
4							@ISA = (Exporter);
5							@EXPORT = qw(
6							FastaReader
7							read_sequence_from_fasta_file
8							write_sequence_to_fasta_file
9							format_seq
10
11							validate_sequence
12							complement
13							revcom
14							base_content
15							degenerate_seq_to_regexp
16							match_regexp
17							dna2peptide
18							codon2aa
19							generate_random_seqence
20
21							shuffle_sequences
22							rename_fasta_header
23							clean_fasta_header
24							);
25
26	1			1		15889	use vars qw($VERSION);
	1					2
	1					46
27
28	1			1		16	use 5.010_000;
	1					4
29	1			1		4	use strict;
	1					10
	1					21
30	1			1		4	use warnings FATAL => 'all';
	1					1
	1					49
31
32	1			1		5	use List::Util qw(shuffle);
	1					1
	1					2638
33
34							=head1 NAME
35
36							BioUtil::Seq - Utilities for sequence
37
38							Some great modules like BioPerl provide many robust solutions.
39							However, it is not easy to install for someone in some platforms.
40							And for some simple task scripts, a lite module may be a good choice.
41							So I reinvented some wheels and added some useful utilities into this module,
42							hoping it would be helpful.
43
44							=head1 VERSION
45
46							Version 2015.0309
47
48							=cut
49
50							our $VERSION = 2015.0728;
51
52							=head1 EXPORT
53
54							FastaReader
55							read_sequence_from_fasta_file
56							write_sequence_to_fasta_file
57							format_seq
58
59							validate_sequence
60							complement
61							revcom
62							base_content
63							degenerate_seq_to_regexp
64							match_regexp
65							dna2peptide
66							codon2aa
67							generate_random_seqence
68
69							shuffle_sequences
70							rename_fasta_header
71							clean_fasta_header
72
73							=head1 SYNOPSIS
74
75							use BioUtil::Seq;
76
77
78							=head1 SUBROUTINES/METHODS
79
80
81							=head2 FastaReader
82
83							FastaReader is a fasta file parser using closure.
84							FastaReader returns an anonymous subroutine, when called, it
85							return a fasta record which is reference of an array
86							containing fasta header and sequence.
87
88							FastaReader could also read from STDIN when the file name is "STDIN" or "stdin".
89
90							A boolean argument is optional. If set as "true", spaces including blank, tab,
91							"return" ("\r") and "new line" ("\n") symbols in sequence will not be trimed.
92
93							FastaReader speeds up by utilizing the special Perl variable $/ (set to "\n>"),
94							with kind help of Mario Roy, author of MCE
95							(https://code.google.com/p/many-core-engine-perl/). A lot of optimizations were
96							also done by him.
97
98							Example:
99
100							# do not trim the spaces and \n
101							# $not_trim = 1;
102							# my $next_seq = FastaReader("test.fa", $not_trim);
103
104							# read from STDIN
105							# my $next_seq = FastaReader('STDIN');
106
107							# read from file
108							my $next_seq = FastaReader("test.fa");
109
110							while ( my $fa = &$next_seq() ) {
111							my ( $header, $seq ) = @$fa;
112
113							print ">$header\n$seq\n";
114							}
115
116							=cut
117
118							sub FastaReader {
119	2			2	1	4	my ( $file, $not_trim ) = @_;
120
121	2					4	my ( $open_flg, $finished ) = ( 0, 0 );
122	2					5	my ( $fh, $pos, $head ) = (undef) x 3;
123
124	2	50	33			14	if ( $file =~ /^STDIN$/i ) { # from stdin
		50
125	0					0	$fh = *STDIN;
126							}
127							elsif ( ref $file eq '' or ref $file eq 'SCALAR' ) { # from file
128	2	50				56	open $fh, '<', $file or die "fail to open file: $file!\n";
129	2					5	$open_flg = 1;
130							}
131							else { # glob, i.e. given file handler
132	0					0	$fh = $file;
133							}
134
135	2					7	local $/ = \1; ## read one byte
136	2					18	while (<$fh>) { ## until reaching ">"
137	2	50				8	last if $_ eq '>';
138							}
139							return sub {
140	6	50		6		13	return if $finished;
141
142	6					16	local $/ = "\n>"; ## set input record separator
143	6					20	while (<$fh>) {
144							## trim trailing ">", part of $/. faster than s/\r?\n>$//
145	4	100				12	substr( $_, -1, 1, '' ) if substr( $_, -1, 1 ) eq '>';
146
147							## extract header and sequence
148							# faster than ( $head, $seq ) = split( /\n/, $_, 2 );
149	4					9	$pos = index( $_, "\n" ) + 1;
150	4					6	$head = substr( $_, 0, $pos - 1 );
151
152							# $_ becomes sequence, to save memory
153							# $seq = substr( $_, $pos );
154	4					8	substr( $_, 0, $pos, '' );
155
156							## trim trailing "\r" in header
157	4	50				12	chop $head if substr( $head, -1, 1 ) eq "\r";
158
159	4	50				9	if ( length $head > 0 ) {
160
161							# faster than $seq =~ s/\s//g unless $not_trim;
162							# $seq =~ tr/\t\r\n //d unless $not_trim;
163	4	50				11	$_ =~ tr/\t\r\n //d unless $not_trim;
164	4					21	return [ $head, $_ ];
165							}
166							}
167
168	2	50				15	close $fh if $open_flg;
169	2					2	$finished = 1;
170	2					9	return;
171	2					13	};
172							}
173
174							sub FastaReader_old {
175	0			0	0	0	my ( $file, $not_trim ) = @_;
176
177	0					0	my ( $last_header, $seq_buffer ) = ( '', '' ); # buffer for header and seq
178	0					0	my ( $header, $seq ) = ( '', '' ); # current header and seq
179	0					0	my $finished = 0;
180
181	0					0	my ( $fh, $is_stdin ) = ( undef, 0 );
182	0	0				0	if ( $file =~ /^STDIN$/i ) {
183	0					0	( $fh, $is_stdin ) = ( *STDIN, 1 );
184							}
185							else {
186	0	0				0	open $fh, "<", $file or die "fail to open file: $file!\n";
187							}
188
189							return sub {
190	0	0		0		0	return undef if $finished; # end of file
191
192	0					0	while (<$fh>) {
193	0					0	s/^\s+//; # remove the space at the front of line
194
195	0	0				0	if (/^>(.*)/) { # header line
196	0					0	( $header, $last_header ) = ( $last_header, $1 );
197	0					0	( $seq, $seq_buffer ) = ( $seq_buffer, '' );
198
199							# only output fasta records with non-blank header
200	0	0				0	if ( $header ne '' ) {
201	0	0				0	$seq =~ s/\s+//g unless $not_trim;
202	0					0	return [ $header, $seq ];
203							}
204							}
205							else {
206	0					0	$seq_buffer .= $_; # append seq
207							}
208							}
209	0	0				0	close $fh unless $is_stdin;
210	0					0	$finished = 1;
211
212							# last record. only output fasta records with non-blank header
213	0	0				0	if ( $last_header ne '' ) {
214	0	0				0	$seq_buffer =~ s/\s+//g unless $not_trim;
215	0					0	return [ $last_header, $seq_buffer ];
216							}
217	0					0	};
218							}
219
220							=head2 read_sequence_from_fasta_file
221
222							Read all sequences from fasta file.
223
224							Example:
225
226							my $seqs = read_sequence_from_fasta_file($file);
227							for my $header (keys %$seqs) {
228							my $seq = $$seqs{$header};
229							print ">$header\n$seq\n";
230							}
231
232							=cut
233
234							sub read_sequence_from_fasta_file {
235	2			2	1	17	my ( $file, $not_trim ) = @_;
236	2					3	my $seqs = {};
237
238	2					8	my $next_seq = FastaReader( $file, $not_trim );
239	2					5	while ( my $fa = &$next_seq() ) {
240
241							# my ( $header, $seq ) = @$fa;
242							# $$seqs{$header} = $seq;
243	4					15	$$seqs{ $fa->[0] } = $fa->[1];
244							}
245
246	2					17	return $seqs;
247							}
248
249							=head2 write_sequence_to_fasta_file
250
251							Example:
252
253							my $seq = {"seq1" => "acgagaggag"};
254							write_sequence_to_fasta_file($seq, "seq.fa");
255
256							=cut
257
258							sub write_sequence_to_fasta_file {
259	1			1	1	348	my ( $seqs, $file, $n ) = @_;
260	1	50				3	unless ( ref $seqs eq 'HASH' ) {
261	0					0	warn "seqs should be reference of hash\n";
262	0					0	return 0;
263							}
264	1	50				4	$n = 70 unless defined $n;
265
266	1	50				135	open my $fh2, ">$file" or die "failed to write to $file\n";
267	1					3	for ( keys %$seqs ) {
268	2					9	print $fh2 ">$_\n", format_seq( $$seqs{$_}, $n ), "\n";
269							}
270	1					64	close $fh2;
271							}
272
273							=head2 format_seq
274
275							Format sequence to readable text
276
277							Example:
278
279							printf ">%s\n%s", $head, format_seq($seq, 60);
280
281							=cut
282
283							sub format_seq {
284	2			2	1	5	my ( $s, $n ) = @_;
285	2	50				5	$n = 70 unless defined $n;
286	2	50	33			17	unless ( $n =~ /^\d+$/ and $n > 0 ) {
287	0					0	warn "n should be positive integer\n";
288	0					0	return $s;
289							}
290
291	2					3	my $s2 = '';
292	2					3	my ( $j, $int );
293	2					6	$int = int( ( length $s ) / $n );
294	2					6	for ( $j = 0; $j <= $int; $j++ ) {
295	2					7	$s2 .= substr( $s, $j * $n, $n ) . "\n";
296							}
297	2					8	return $s2;
298							}
299
300							=head2 validate_sequence
301
302							Validate a sequence.
303
304							Legale symbols:
305
306							DNA: ACGTRYSWKMBDHV
307							RNA: ACGURYSWKMBDHV
308							Protein: ACDEFGHIKLMNPQRSTVWY
309							gap and space: - *.
310
311							Example:
312
313							if (validate_sequence($seq)) {
314							# do some thing
315							}
316
317							=cut
318
319							sub validate_sequence {
320	2			2	1	232	my ($seq) = @_;
321	2	100				10	return 0 if $seq =~ /[^\.\-\s_*ABCDEFGHIKLMNPQRSTUVWY]/i;
322	1					2	return 1;
323							}
324
325							=head2 complement
326
327							Complement sequence
328
329							IUPAC nucleotide code: ACGTURYSWKMBDHVN
330
331							http://droog.gs.washington.edu/parc/images/iupac.html
332
333							code base Complement
334							A A T
335							C C G
336							G G C
337							T/U T A
338
339							R A/G Y
340							Y C/T R
341							S C/G S
342							W A/T W
343							K G/T M
344							M A/C K
345
346							B C/G/T V
347							D A/G/T H
348							H A/C/T D
349							V A/C/G B
350
351							X/N A/C/G/T X
352							. not A/C/G/T
353							or- gap
354
355							my $comp = complement($seq);
356
357							=cut
358
359							sub complement {
360	1			1	1	12	my ($s) = @_;
361	1					3	$s
362							=~ tr/ACGTURYMKSWBDHVNacgturymkswbdhvn/TGCAAYRKMSWVHDBNtgcaayrkmswvhdbn/;
363	1					4	return $s;
364							}
365
366							=head2 revcom
367
368							Reverse complement sequence
369
370							my $recom = revcom($seq);
371
372							=cut
373
374							sub revcom {
375	1			1	1	3	my $rc = reverse complement( $_[0] );
376	1					4	return $rc;
377							}
378
379							=head2 base_content
380
381							Example:
382
383							my $gc_cotent = base_content('gc', $seq);
384
385							=cut
386
387							sub base_content {
388	1			1	1	2	my ( $bases, $seq ) = @_;
389	1	50				4	if ( $seq eq '' ) {
390	0					0	return 0;
391							}
392
393	1					2	my $sum = 0;
394	1					24	$sum += $seq =~ s/$_/$_/ig for split "", $bases;
395	1					16	return sprintf "%.4f", $sum / length $seq;
396							}
397
398							=head2 degenerate_seq_to_regexp
399
400							Translate degenerate sequence to regular expression
401
402							=cut
403
404							sub degenerate_seq_to_regexp {
405	0			0	1		my ($seq) = @_;
406	0						my %bases = (
407							'A' => 'A',
408							'T' => 'T',
409							'U' => 'U',
410							'C' => 'C',
411							'G' => 'G',
412							'R' => '[AG]',
413							'Y' => '[CT]',
414							'M' => '[AC]',
415							'K' => '[GT]',
416							'S' => '[CG]',
417							'W' => '[AT]',
418							'H' => '[ACT]',
419							'B' => '[CGT]',
420							'V' => '[ACG]',
421							'D' => '[AGT]',
422							'N' => '[ACGT]',
423							);
424	0	0					return join '', map { exists $bases{$_} ? $bases{$_} : $_ }
	0
425							split //, uc $seq;
426							}
427
428							=head2 match_regexp
429
430							Find all sites matching the regular expression.
431
432							See https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_locate_motif.pl
433
434							=cut
435
436							sub match_regexp {
437	0			0	1		my ( $r, $s ) = @_;
438	0						my @matched = ();
439	0						my $pos = -1;
440	0						while ( $s =~ /($r)/ig ) {
441	0						$pos = pos $s;
442
443							# return start, end, matched string
444							# start and end are 0-based
445	0						push @matched, [ $pos - length($1), $pos - 1, $1 ];
446	0						pos $s = $pos - length($1) + 1;
447							}
448	0						return \@matched;
449							}
450
451							=head2 dna2peptide
452
453							Translate DNA sequence into a peptide
454
455							=cut
456
457							sub dna2peptide {
458	0			0	1		my ($dna) = @_;
459	0						my $protein = '';
460
461							# Translate each three-base codon to an amino acid, and append to a protein
462	0						for ( my $i = 0; $i < ( length($dna) - 2 ); $i += 3 ) {
463	0						$protein .= codon2aa( substr( $dna, $i, 3 ) );
464							}
465	0						return $protein;
466							}
467
468							=head2 codon2aa
469
470							Translate a DNA 3-character codon to an amino acid
471
472							=cut
473
474							sub codon2aa {
475	0			0	1		my ($codon) = @_;
476	0						$codon = uc $codon;
477	0						my %genetic_code = (
478							'TCA' => 'S', # Serine
479							'TCC' => 'S', # Serine
480							'TCG' => 'S', # Serine
481							'TCT' => 'S', # Serine
482							'TTC' => 'F', # Phenylalanine
483							'TTT' => 'F', # Phenylalanine
484							'TTA' => 'L', # Leucine
485							'TTG' => 'L', # Leucine
486							'TAC' => 'Y', # Tyrosine
487							'TAT' => 'Y', # Tyrosine
488							'TAA' => '_', # Stop
489							'TAG' => '_', # Stop
490							'TGC' => 'C', # Cysteine
491							'TGT' => 'C', # Cysteine
492							'TGA' => '_', # Stop
493							'TGG' => 'W', # Tryptophan
494							'CTA' => 'L', # Leucine
495							'CTC' => 'L', # Leucine
496							'CTG' => 'L', # Leucine
497							'CTT' => 'L', # Leucine
498							'CCA' => 'P', # Proline
499							'CCC' => 'P', # Proline
500							'CCG' => 'P', # Proline
501							'CCT' => 'P', # Proline
502							'CAC' => 'H', # Histidine
503							'CAT' => 'H', # Histidine
504							'CAA' => 'Q', # Glutamine
505							'CAG' => 'Q', # Glutamine
506							'CGA' => 'R', # Arginine
507							'CGC' => 'R', # Arginine
508							'CGG' => 'R', # Arginine
509							'CGT' => 'R', # Arginine
510							'ATA' => 'I', # Isoleucine
511							'ATC' => 'I', # Isoleucine
512							'ATT' => 'I', # Isoleucine
513							'ATG' => 'M', # Methionine
514							'ACA' => 'T', # Threonine
515							'ACC' => 'T', # Threonine
516							'ACG' => 'T', # Threonine
517							'ACT' => 'T', # Threonine
518							'AAC' => 'N', # Asparagine
519							'AAT' => 'N', # Asparagine
520							'AAA' => 'K', # Lysine
521							'AAG' => 'K', # Lysine
522							'AGC' => 'S', # Serine
523							'AGT' => 'S', # Serine
524							'AGA' => 'R', # Arginine
525							'AGG' => 'R', # Arginine
526							'GTA' => 'V', # Valine
527							'GTC' => 'V', # Valine
528							'GTG' => 'V', # Valine
529							'GTT' => 'V', # Valine
530							'GCA' => 'A', # Alanine
531							'GCC' => 'A', # Alanine
532							'GCG' => 'A', # Alanine
533							'GCT' => 'A', # Alanine
534							'GAC' => 'D', # Aspartic Acid
535							'GAT' => 'D', # Aspartic Acid
536							'GAA' => 'E', # Glutamic Acid
537							'GAG' => 'E', # Glutamic Acid
538							'GGA' => 'G', # Glycine
539							'GGC' => 'G', # Glycine
540							'GGG' => 'G', # Glycine
541							'GGT' => 'G', # Glycine
542							);
543
544	0	0					if ( exists $genetic_code{$codon} ) {
545	0						return $genetic_code{$codon};
546							}
547							else {
548	0						print STDERR "Bad codon \"$codon\"!!\n";
549	0						exit;
550							}
551							}
552
553							=head2 generate_random_seqence
554
555							Example:
556
557							my @alphabet = qw/a c g t/;
558							my $seq = generate_random_seqence( \@alphabet, 50 );
559
560							=cut
561
562							sub generate_random_seqence {
563	0			0	1		my ( $alphabet, $length ) = @_;
564	0	0					unless ( ref $alphabet eq 'ARRAY' ) {
565	0						warn "alphabet should be ref of array\n";
566	0						return 0;
567							}
568
569	0						my $n = @$alphabet;
570	0						my $seq;
571	0						$seq .= $$alphabet[ int rand($n) ] for ( 1 .. $length );
572	0						return $seq;
573							}
574
575							=head2 shuffle sequences
576
577							Example:
578
579							shuffle_sequences($file, "$file.shuf.fa");
580
581							=cut
582
583							sub shuffle_sequences {
584	0			0	0		my ( $file, $file_out, $not_trim ) = @_;
585	0						my $seqs = read_sequence_from_fasta_file( $file, $not_trim );
586	0						my @keys = shuffle( keys %$seqs );
587
588	0	0					$file_out = "$file.shuffled.fa" unless defined $file_out;
589	0	0					open my $fh2, ">$file_out" or die "fail to write file $file_out\n";
590	0						print $fh2 ">$_\n$$seqs{$_}\n" for @keys;
591	0						close $fh2;
592
593	0						return $file_out;
594							}
595
596							=head2 rename_fasta_header
597
598							Rename fasta header with regexp.
599
600							Example:
601
602							# delete some symbols
603							my $n = rename_fasta_header('[^a-z\d\s\-\_\[\]\\|]', '', $file, "$file.rename.fa");
604							print "$n records renamed\n";
605
606							=cut
607
608							sub rename_fasta_header {
609	0			0	1		my ( $regex, $repalcement, $file, $outfile ) = @_;
610
611	0	0					open my $fh, "<", $file or die "fail to open file: $file\n";
612	0	0					open my $fh2, ">", $outfile or die "fail to wirte file: $outfile\n";
613
614	0						my $head = '';
615	0						my $n = 0;
616	0						while (<$fh>) {
617	0	0					if (/^\s>(.)\r?\n/) {
618	0						$head = $1;
619	0	0					if ( $head =~ /$regex/ ) {
620	0						$head =~ s/$regex/$repalcement/g;
621	0						$n++;
622							}
623	0						print $fh2 ">$head\n";
624							}
625							else {
626	0						print $fh2 $_;
627							}
628							}
629	0						close $fh;
630	0						close $fh2;
631
632	0						return $n;
633							}
634
635							=head2 clean_fasta_header
636
637							Rename given symbols to repalcement string.
638							Because, some symbols in fasta header will cause unexpected result.
639
640							Example:
641
642							my $file = "test.fa";
643							my $n = clean_fasta_header($file, "$file.rename.fa");
644							# replace any symbol in (\/:*?"<>\|) with '', i.e. deleting.
645							# my $n = clean_fasta_header($file, "$file.rename.fa", '', '\/:*?"<>\|');
646							print "$n records renamed\n";
647
648							=cut
649
650							sub clean_fasta_header {
651	0			0	1		my ( $file, $outfile, $replacement, $symbols ) = @_;
652	0	0					$replacement = "_" unless defined $replacement;
653
654	0						my @default = split //, '\/:*?"<>\|';
655	0	0					$symbols = \@default unless defined $symbols;
656	0	0					unless ( ref $symbols eq 'ARRAY' ) {
657	0						warn "symbols should be ref of array\n";
658	0						return 0;
659							}
660	0						my $re = join '', map { quotemeta $_ } @$symbols;
	0
661	0	0					open my $fh, "<", $file or die "fail to open file: $file\n";
662	0	0					open my $fh2, ">", $outfile or die "fail to wirte file: $outfile\n";
663
664	0						my $head = '';
665	0						my $n = 0;
666	0						while (<$fh>) {
667	0	0					if (/^\s>(.)\r?\n/) {
668	0						$head = $1;
669	0	0					if ( $head =~ /[$re]/ ) {
670	0						$head =~ s/[$re]/$replacement/g;
671	0						$n++;
672							}
673	0						print $fh2 ">$head\n";
674							}
675							else {
676	0						print $fh2 $_;
677							}
678							}
679	0						close $fh;
680	0						close $fh2;
681
682	0						return $n;
683							}
684
685							1;