| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package BioUtil::Seq; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | require Exporter; | 
| 4 |  |  |  |  |  |  | @ISA    = (Exporter); | 
| 5 |  |  |  |  |  |  | @EXPORT = qw( | 
| 6 |  |  |  |  |  |  | FastaReader | 
| 7 |  |  |  |  |  |  | read_sequence_from_fasta_file | 
| 8 |  |  |  |  |  |  | write_sequence_to_fasta_file | 
| 9 |  |  |  |  |  |  | format_seq | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | validate_sequence | 
| 12 |  |  |  |  |  |  | complement | 
| 13 |  |  |  |  |  |  | revcom | 
| 14 |  |  |  |  |  |  | base_content | 
| 15 |  |  |  |  |  |  | degenerate_seq_to_regexp | 
| 16 |  |  |  |  |  |  | match_regexp | 
| 17 |  |  |  |  |  |  | dna2peptide | 
| 18 |  |  |  |  |  |  | codon2aa | 
| 19 |  |  |  |  |  |  | generate_random_seqence | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | shuffle_sequences | 
| 22 |  |  |  |  |  |  | rename_fasta_header | 
| 23 |  |  |  |  |  |  | clean_fasta_header | 
| 24 |  |  |  |  |  |  | ); | 
| 25 |  |  |  |  |  |  |  | 
| 26 | 1 |  |  | 1 |  | 15889 | use vars qw($VERSION); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 46 |  | 
| 27 |  |  |  |  |  |  |  | 
| 28 | 1 |  |  | 1 |  | 16 | use 5.010_000; | 
|  | 1 |  |  |  |  | 4 |  | 
| 29 | 1 |  |  | 1 |  | 4 | use strict; | 
|  | 1 |  |  |  |  | 10 |  | 
|  | 1 |  |  |  |  | 21 |  | 
| 30 | 1 |  |  | 1 |  | 4 | use warnings FATAL => 'all'; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 49 |  | 
| 31 |  |  |  |  |  |  |  | 
| 32 | 1 |  |  | 1 |  | 5 | use List::Util qw(shuffle); | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 2638 |  | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | =head1 NAME | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | BioUtil::Seq - Utilities for sequence | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | Some great modules like BioPerl provide many robust solutions. | 
| 39 |  |  |  |  |  |  | However, it is not easy to install for someone in some platforms. | 
| 40 |  |  |  |  |  |  | And for some simple task scripts, a lite module may be a good choice. | 
| 41 |  |  |  |  |  |  | So I reinvented some wheels and added some useful utilities into this module, | 
| 42 |  |  |  |  |  |  | hoping it would be helpful. | 
| 43 |  |  |  |  |  |  |  | 
| 44 |  |  |  |  |  |  | =head1 VERSION | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | Version 2015.0309 | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | =cut | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | our $VERSION = 2015.0728; | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | =head1 EXPORT | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | FastaReader | 
| 55 |  |  |  |  |  |  | read_sequence_from_fasta_file | 
| 56 |  |  |  |  |  |  | write_sequence_to_fasta_file | 
| 57 |  |  |  |  |  |  | format_seq | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | validate_sequence | 
| 60 |  |  |  |  |  |  | complement | 
| 61 |  |  |  |  |  |  | revcom | 
| 62 |  |  |  |  |  |  | base_content | 
| 63 |  |  |  |  |  |  | degenerate_seq_to_regexp | 
| 64 |  |  |  |  |  |  | match_regexp | 
| 65 |  |  |  |  |  |  | dna2peptide | 
| 66 |  |  |  |  |  |  | codon2aa | 
| 67 |  |  |  |  |  |  | generate_random_seqence | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | shuffle_sequences | 
| 70 |  |  |  |  |  |  | rename_fasta_header | 
| 71 |  |  |  |  |  |  | clean_fasta_header | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | use BioUtil::Seq; | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  | =head1 SUBROUTINES/METHODS | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | =head2 FastaReader | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | FastaReader is a fasta file parser using closure. | 
| 84 |  |  |  |  |  |  | FastaReader returns an anonymous subroutine, when called, it | 
| 85 |  |  |  |  |  |  | return a fasta record which is reference of an array | 
| 86 |  |  |  |  |  |  | containing fasta header and sequence. | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | FastaReader could also read from STDIN when the file name is "STDIN" or "stdin". | 
| 89 |  |  |  |  |  |  |  | 
| 90 |  |  |  |  |  |  | A boolean argument is optional. If set as "true", spaces including blank, tab, | 
| 91 |  |  |  |  |  |  | "return" ("\r") and "new line" ("\n") symbols in sequence will not be trimed. | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | FastaReader speeds up by utilizing the special Perl variable $/ (set to "\n>"), | 
| 94 |  |  |  |  |  |  | with kind help of Mario Roy, author of MCE | 
| 95 |  |  |  |  |  |  | (https://code.google.com/p/many-core-engine-perl/). A lot of optimizations were | 
| 96 |  |  |  |  |  |  | also done by him. | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | Example: | 
| 99 |  |  |  |  |  |  |  | 
| 100 |  |  |  |  |  |  | # do not trim the spaces and \n | 
| 101 |  |  |  |  |  |  | # $not_trim = 1; | 
| 102 |  |  |  |  |  |  | # my $next_seq = FastaReader("test.fa", $not_trim); | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | # read from STDIN | 
| 105 |  |  |  |  |  |  | # my $next_seq = FastaReader('STDIN'); | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | # read from file | 
| 108 |  |  |  |  |  |  | my $next_seq = FastaReader("test.fa"); | 
| 109 |  |  |  |  |  |  |  | 
| 110 |  |  |  |  |  |  | while ( my $fa = &$next_seq() ) { | 
| 111 |  |  |  |  |  |  | my ( $header, $seq ) = @$fa; | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | print ">$header\n$seq\n"; | 
| 114 |  |  |  |  |  |  | } | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | =cut | 
| 117 |  |  |  |  |  |  |  | 
| 118 |  |  |  |  |  |  | sub FastaReader { | 
| 119 | 2 |  |  | 2 | 1 | 4 | my ( $file, $not_trim ) = @_; | 
| 120 |  |  |  |  |  |  |  | 
| 121 | 2 |  |  |  |  | 4 | my ( $open_flg, $finished ) = ( 0, 0 ); | 
| 122 | 2 |  |  |  |  | 5 | my ( $fh, $pos, $head ) = (undef) x 3; | 
| 123 |  |  |  |  |  |  |  | 
| 124 | 2 | 50 | 33 |  |  | 14 | if ( $file =~ /^STDIN$/i ) {    # from stdin | 
|  |  | 50 |  |  |  |  |  | 
| 125 | 0 |  |  |  |  | 0 | $fh = *STDIN; | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  | elsif ( ref $file eq '' or ref $file eq 'SCALAR' ) {    # from file | 
| 128 | 2 | 50 |  |  |  | 56 | open $fh, '<', $file or die "fail to open file: $file!\n"; | 
| 129 | 2 |  |  |  |  | 5 | $open_flg = 1; | 
| 130 |  |  |  |  |  |  | } | 
| 131 |  |  |  |  |  |  | else {    # glob, i.e. given file handler | 
| 132 | 0 |  |  |  |  | 0 | $fh = $file; | 
| 133 |  |  |  |  |  |  | } | 
| 134 |  |  |  |  |  |  |  | 
| 135 | 2 |  |  |  |  | 7 | local $/ = \1;     ## read one byte | 
| 136 | 2 |  |  |  |  | 18 | while (<$fh>) {    ## until reaching ">" | 
| 137 | 2 | 50 |  |  |  | 8 | last if $_ eq '>'; | 
| 138 |  |  |  |  |  |  | } | 
| 139 |  |  |  |  |  |  | return sub { | 
| 140 | 6 | 50 |  | 6 |  | 13 | return if $finished; | 
| 141 |  |  |  |  |  |  |  | 
| 142 | 6 |  |  |  |  | 16 | local $/ = "\n>";    ## set input record separator | 
| 143 | 6 |  |  |  |  | 20 | while (<$fh>) { | 
| 144 |  |  |  |  |  |  | ## trim trailing ">", part of $/. faster than s/\r?\n>$// | 
| 145 | 4 | 100 |  |  |  | 12 | substr( $_, -1, 1, '' ) if substr( $_, -1, 1 ) eq '>'; | 
| 146 |  |  |  |  |  |  |  | 
| 147 |  |  |  |  |  |  | ## extract header and sequence | 
| 148 |  |  |  |  |  |  | # faster than  ( $head, $seq ) = split( /\n/, $_, 2 ); | 
| 149 | 4 |  |  |  |  | 9 | $pos = index( $_, "\n" ) + 1; | 
| 150 | 4 |  |  |  |  | 6 | $head = substr( $_, 0, $pos - 1 ); | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | # $_ becomes sequence, to save memory | 
| 153 |  |  |  |  |  |  | # $seq = substr( $_, $pos ); | 
| 154 | 4 |  |  |  |  | 8 | substr( $_, 0, $pos, '' ); | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | ## trim trailing "\r" in header | 
| 157 | 4 | 50 |  |  |  | 12 | chop $head if substr( $head, -1, 1 ) eq "\r"; | 
| 158 |  |  |  |  |  |  |  | 
| 159 | 4 | 50 |  |  |  | 9 | if ( length $head > 0 ) { | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | # faster than $seq =~ s/\s//g unless $not_trim; | 
| 162 |  |  |  |  |  |  | # $seq =~ tr/\t\r\n //d unless $not_trim; | 
| 163 | 4 | 50 |  |  |  | 11 | $_ =~ tr/\t\r\n //d unless $not_trim; | 
| 164 | 4 |  |  |  |  | 21 | return [ $head, $_ ]; | 
| 165 |  |  |  |  |  |  | } | 
| 166 |  |  |  |  |  |  | } | 
| 167 |  |  |  |  |  |  |  | 
| 168 | 2 | 50 |  |  |  | 15 | close $fh if $open_flg; | 
| 169 | 2 |  |  |  |  | 2 | $finished = 1; | 
| 170 | 2 |  |  |  |  | 9 | return; | 
| 171 | 2 |  |  |  |  | 13 | }; | 
| 172 |  |  |  |  |  |  | } | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | sub FastaReader_old { | 
| 175 | 0 |  |  | 0 | 0 | 0 | my ( $file, $not_trim ) = @_; | 
| 176 |  |  |  |  |  |  |  | 
| 177 | 0 |  |  |  |  | 0 | my ( $last_header, $seq_buffer ) = ( '', '' ); # buffer for header and seq | 
| 178 | 0 |  |  |  |  | 0 | my ( $header,      $seq )        = ( '', '' ); # current header and seq | 
| 179 | 0 |  |  |  |  | 0 | my $finished = 0; | 
| 180 |  |  |  |  |  |  |  | 
| 181 | 0 |  |  |  |  | 0 | my ( $fh, $is_stdin ) = ( undef, 0 ); | 
| 182 | 0 | 0 |  |  |  | 0 | if ( $file =~ /^STDIN$/i ) { | 
| 183 | 0 |  |  |  |  | 0 | ( $fh, $is_stdin ) = ( *STDIN, 1 ); | 
| 184 |  |  |  |  |  |  | } | 
| 185 |  |  |  |  |  |  | else { | 
| 186 | 0 | 0 |  |  |  | 0 | open $fh, "<", $file or die "fail to open file: $file!\n"; | 
| 187 |  |  |  |  |  |  | } | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | return sub { | 
| 190 | 0 | 0 |  | 0 |  | 0 | return undef if $finished;                 # end of file | 
| 191 |  |  |  |  |  |  |  | 
| 192 | 0 |  |  |  |  | 0 | while (<$fh>) { | 
| 193 | 0 |  |  |  |  | 0 | s/^\s+//;    # remove the space at the front of line | 
| 194 |  |  |  |  |  |  |  | 
| 195 | 0 | 0 |  |  |  | 0 | if (/^>(.*)/) {    # header line | 
| 196 | 0 |  |  |  |  | 0 | ( $header, $last_header ) = ( $last_header, $1 ); | 
| 197 | 0 |  |  |  |  | 0 | ( $seq,    $seq_buffer )  = ( $seq_buffer,  '' ); | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | # only output fasta records with non-blank header | 
| 200 | 0 | 0 |  |  |  | 0 | if ( $header ne '' ) { | 
| 201 | 0 | 0 |  |  |  | 0 | $seq =~ s/\s+//g unless $not_trim; | 
| 202 | 0 |  |  |  |  | 0 | return [ $header, $seq ]; | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  | } | 
| 205 |  |  |  |  |  |  | else { | 
| 206 | 0 |  |  |  |  | 0 | $seq_buffer .= $_;    # append seq | 
| 207 |  |  |  |  |  |  | } | 
| 208 |  |  |  |  |  |  | } | 
| 209 | 0 | 0 |  |  |  | 0 | close $fh unless $is_stdin; | 
| 210 | 0 |  |  |  |  | 0 | $finished = 1; | 
| 211 |  |  |  |  |  |  |  | 
| 212 |  |  |  |  |  |  | # last record. only output fasta records with non-blank header | 
| 213 | 0 | 0 |  |  |  | 0 | if ( $last_header ne '' ) { | 
| 214 | 0 | 0 |  |  |  | 0 | $seq_buffer =~ s/\s+//g unless $not_trim; | 
| 215 | 0 |  |  |  |  | 0 | return [ $last_header, $seq_buffer ]; | 
| 216 |  |  |  |  |  |  | } | 
| 217 | 0 |  |  |  |  | 0 | }; | 
| 218 |  |  |  |  |  |  | } | 
| 219 |  |  |  |  |  |  |  | 
| 220 |  |  |  |  |  |  | =head2 read_sequence_from_fasta_file | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | Read all sequences from fasta file. | 
| 223 |  |  |  |  |  |  |  | 
| 224 |  |  |  |  |  |  | Example: | 
| 225 |  |  |  |  |  |  |  | 
| 226 |  |  |  |  |  |  | my $seqs = read_sequence_from_fasta_file($file); | 
| 227 |  |  |  |  |  |  | for my $header (keys %$seqs) { | 
| 228 |  |  |  |  |  |  | my $seq = $$seqs{$header}; | 
| 229 |  |  |  |  |  |  | print ">$header\n$seq\n"; | 
| 230 |  |  |  |  |  |  | } | 
| 231 |  |  |  |  |  |  |  | 
| 232 |  |  |  |  |  |  | =cut | 
| 233 |  |  |  |  |  |  |  | 
| 234 |  |  |  |  |  |  | sub read_sequence_from_fasta_file { | 
| 235 | 2 |  |  | 2 | 1 | 17 | my ( $file, $not_trim ) = @_; | 
| 236 | 2 |  |  |  |  | 3 | my $seqs = {}; | 
| 237 |  |  |  |  |  |  |  | 
| 238 | 2 |  |  |  |  | 8 | my $next_seq = FastaReader( $file, $not_trim ); | 
| 239 | 2 |  |  |  |  | 5 | while ( my $fa = &$next_seq() ) { | 
| 240 |  |  |  |  |  |  |  | 
| 241 |  |  |  |  |  |  | # my ( $header, $seq ) = @$fa; | 
| 242 |  |  |  |  |  |  | # $$seqs{$header} = $seq; | 
| 243 | 4 |  |  |  |  | 15 | $$seqs{ $fa->[0] } = $fa->[1]; | 
| 244 |  |  |  |  |  |  | } | 
| 245 |  |  |  |  |  |  |  | 
| 246 | 2 |  |  |  |  | 17 | return $seqs; | 
| 247 |  |  |  |  |  |  | } | 
| 248 |  |  |  |  |  |  |  | 
| 249 |  |  |  |  |  |  | =head2 write_sequence_to_fasta_file | 
| 250 |  |  |  |  |  |  |  | 
| 251 |  |  |  |  |  |  | Example: | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | my $seq = {"seq1" => "acgagaggag"}; | 
| 254 |  |  |  |  |  |  | write_sequence_to_fasta_file($seq, "seq.fa"); | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | =cut | 
| 257 |  |  |  |  |  |  |  | 
| 258 |  |  |  |  |  |  | sub write_sequence_to_fasta_file { | 
| 259 | 1 |  |  | 1 | 1 | 348 | my ( $seqs, $file, $n ) = @_; | 
| 260 | 1 | 50 |  |  |  | 3 | unless ( ref $seqs eq 'HASH' ) { | 
| 261 | 0 |  |  |  |  | 0 | warn "seqs should be reference of hash\n"; | 
| 262 | 0 |  |  |  |  | 0 | return 0; | 
| 263 |  |  |  |  |  |  | } | 
| 264 | 1 | 50 |  |  |  | 4 | $n = 70 unless defined $n; | 
| 265 |  |  |  |  |  |  |  | 
| 266 | 1 | 50 |  |  |  | 135 | open my $fh2, ">$file" or die "failed to write to $file\n"; | 
| 267 | 1 |  |  |  |  | 3 | for ( keys %$seqs ) { | 
| 268 | 2 |  |  |  |  | 9 | print $fh2 ">$_\n", format_seq( $$seqs{$_}, $n ), "\n"; | 
| 269 |  |  |  |  |  |  | } | 
| 270 | 1 |  |  |  |  | 64 | close $fh2; | 
| 271 |  |  |  |  |  |  | } | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  | =head2 format_seq | 
| 274 |  |  |  |  |  |  |  | 
| 275 |  |  |  |  |  |  | Format sequence to readable text | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | Example: | 
| 278 |  |  |  |  |  |  |  | 
| 279 |  |  |  |  |  |  | printf ">%s\n%s", $head, format_seq($seq, 60); | 
| 280 |  |  |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | =cut | 
| 282 |  |  |  |  |  |  |  | 
| 283 |  |  |  |  |  |  | sub format_seq { | 
| 284 | 2 |  |  | 2 | 1 | 5 | my ( $s, $n ) = @_; | 
| 285 | 2 | 50 |  |  |  | 5 | $n = 70 unless defined $n; | 
| 286 | 2 | 50 | 33 |  |  | 17 | unless ( $n =~ /^\d+$/ and $n > 0 ) { | 
| 287 | 0 |  |  |  |  | 0 | warn "n should be positive integer\n"; | 
| 288 | 0 |  |  |  |  | 0 | return $s; | 
| 289 |  |  |  |  |  |  | } | 
| 290 |  |  |  |  |  |  |  | 
| 291 | 2 |  |  |  |  | 3 | my $s2 = ''; | 
| 292 | 2 |  |  |  |  | 3 | my ( $j, $int ); | 
| 293 | 2 |  |  |  |  | 6 | $int = int( ( length $s ) / $n ); | 
| 294 | 2 |  |  |  |  | 6 | for ( $j = 0; $j <= $int; $j++ ) { | 
| 295 | 2 |  |  |  |  | 7 | $s2 .= substr( $s, $j * $n, $n ) . "\n"; | 
| 296 |  |  |  |  |  |  | } | 
| 297 | 2 |  |  |  |  | 8 | return $s2; | 
| 298 |  |  |  |  |  |  | } | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | =head2 validate_sequence | 
| 301 |  |  |  |  |  |  |  | 
| 302 |  |  |  |  |  |  | Validate a sequence. | 
| 303 |  |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  | Legale symbols: | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | DNA: ACGTRYSWKMBDHV | 
| 307 |  |  |  |  |  |  | RNA: ACGURYSWKMBDHV | 
| 308 |  |  |  |  |  |  | Protein: ACDEFGHIKLMNPQRSTVWY | 
| 309 |  |  |  |  |  |  | gap and space: - *. | 
| 310 |  |  |  |  |  |  |  | 
| 311 |  |  |  |  |  |  | Example: | 
| 312 |  |  |  |  |  |  |  | 
| 313 |  |  |  |  |  |  | if (validate_sequence($seq)) { | 
| 314 |  |  |  |  |  |  | # do some thing | 
| 315 |  |  |  |  |  |  | } | 
| 316 |  |  |  |  |  |  |  | 
| 317 |  |  |  |  |  |  | =cut | 
| 318 |  |  |  |  |  |  |  | 
| 319 |  |  |  |  |  |  | sub validate_sequence { | 
| 320 | 2 |  |  | 2 | 1 | 232 | my ($seq) = @_; | 
| 321 | 2 | 100 |  |  |  | 10 | return 0 if $seq =~ /[^\.\-\s_*ABCDEFGHIKLMNPQRSTUVWY]/i; | 
| 322 | 1 |  |  |  |  | 2 | return 1; | 
| 323 |  |  |  |  |  |  | } | 
| 324 |  |  |  |  |  |  |  | 
| 325 |  |  |  |  |  |  | =head2 complement | 
| 326 |  |  |  |  |  |  |  | 
| 327 |  |  |  |  |  |  | Complement sequence | 
| 328 |  |  |  |  |  |  |  | 
| 329 |  |  |  |  |  |  | IUPAC nucleotide code: ACGTURYSWKMBDHVN | 
| 330 |  |  |  |  |  |  |  | 
| 331 |  |  |  |  |  |  | http://droog.gs.washington.edu/parc/images/iupac.html | 
| 332 |  |  |  |  |  |  |  | 
| 333 |  |  |  |  |  |  | code    base    Complement | 
| 334 |  |  |  |  |  |  | A   A   T | 
| 335 |  |  |  |  |  |  | C   C   G | 
| 336 |  |  |  |  |  |  | G   G   C | 
| 337 |  |  |  |  |  |  | T/U T   A | 
| 338 |  |  |  |  |  |  |  | 
| 339 |  |  |  |  |  |  | R   A/G Y | 
| 340 |  |  |  |  |  |  | Y   C/T R | 
| 341 |  |  |  |  |  |  | S   C/G S | 
| 342 |  |  |  |  |  |  | W   A/T W | 
| 343 |  |  |  |  |  |  | K   G/T M | 
| 344 |  |  |  |  |  |  | M   A/C K | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  | B   C/G/T   V | 
| 347 |  |  |  |  |  |  | D   A/G/T   H | 
| 348 |  |  |  |  |  |  | H   A/C/T   D | 
| 349 |  |  |  |  |  |  | V   A/C/G   B | 
| 350 |  |  |  |  |  |  |  | 
| 351 |  |  |  |  |  |  | X/N A/C/G/T X | 
| 352 |  |  |  |  |  |  | .   not A/C/G/T | 
| 353 |  |  |  |  |  |  | or-    gap | 
| 354 |  |  |  |  |  |  |  | 
| 355 |  |  |  |  |  |  | my $comp = complement($seq); | 
| 356 |  |  |  |  |  |  |  | 
| 357 |  |  |  |  |  |  | =cut | 
| 358 |  |  |  |  |  |  |  | 
| 359 |  |  |  |  |  |  | sub complement { | 
| 360 | 1 |  |  | 1 | 1 | 12 | my ($s) = @_; | 
| 361 | 1 |  |  |  |  | 3 | $s | 
| 362 |  |  |  |  |  |  | =~ tr/ACGTURYMKSWBDHVNacgturymkswbdhvn/TGCAAYRKMSWVHDBNtgcaayrkmswvhdbn/; | 
| 363 | 1 |  |  |  |  | 4 | return $s; | 
| 364 |  |  |  |  |  |  | } | 
| 365 |  |  |  |  |  |  |  | 
| 366 |  |  |  |  |  |  | =head2 revcom | 
| 367 |  |  |  |  |  |  |  | 
| 368 |  |  |  |  |  |  | Reverse complement sequence | 
| 369 |  |  |  |  |  |  |  | 
| 370 |  |  |  |  |  |  | my $recom = revcom($seq); | 
| 371 |  |  |  |  |  |  |  | 
| 372 |  |  |  |  |  |  | =cut | 
| 373 |  |  |  |  |  |  |  | 
| 374 |  |  |  |  |  |  | sub revcom { | 
| 375 | 1 |  |  | 1 | 1 | 3 | my $rc = reverse complement( $_[0] ); | 
| 376 | 1 |  |  |  |  | 4 | return $rc; | 
| 377 |  |  |  |  |  |  | } | 
| 378 |  |  |  |  |  |  |  | 
| 379 |  |  |  |  |  |  | =head2 base_content | 
| 380 |  |  |  |  |  |  |  | 
| 381 |  |  |  |  |  |  | Example: | 
| 382 |  |  |  |  |  |  |  | 
| 383 |  |  |  |  |  |  | my $gc_cotent = base_content('gc', $seq); | 
| 384 |  |  |  |  |  |  |  | 
| 385 |  |  |  |  |  |  | =cut | 
| 386 |  |  |  |  |  |  |  | 
| 387 |  |  |  |  |  |  | sub base_content { | 
| 388 | 1 |  |  | 1 | 1 | 2 | my ( $bases, $seq ) = @_; | 
| 389 | 1 | 50 |  |  |  | 4 | if ( $seq eq '' ) { | 
| 390 | 0 |  |  |  |  | 0 | return 0; | 
| 391 |  |  |  |  |  |  | } | 
| 392 |  |  |  |  |  |  |  | 
| 393 | 1 |  |  |  |  | 2 | my $sum = 0; | 
| 394 | 1 |  |  |  |  | 24 | $sum += $seq =~ s/$_/$_/ig for split "", $bases; | 
| 395 | 1 |  |  |  |  | 16 | return sprintf "%.4f", $sum / length $seq; | 
| 396 |  |  |  |  |  |  | } | 
| 397 |  |  |  |  |  |  |  | 
| 398 |  |  |  |  |  |  | =head2 degenerate_seq_to_regexp | 
| 399 |  |  |  |  |  |  |  | 
| 400 |  |  |  |  |  |  | Translate degenerate sequence to regular expression | 
| 401 |  |  |  |  |  |  |  | 
| 402 |  |  |  |  |  |  | =cut | 
| 403 |  |  |  |  |  |  |  | 
| 404 |  |  |  |  |  |  | sub degenerate_seq_to_regexp { | 
| 405 | 0 |  |  | 0 | 1 |  | my ($seq) = @_; | 
| 406 | 0 |  |  |  |  |  | my %bases = ( | 
| 407 |  |  |  |  |  |  | 'A' => 'A', | 
| 408 |  |  |  |  |  |  | 'T' => 'T', | 
| 409 |  |  |  |  |  |  | 'U' => 'U', | 
| 410 |  |  |  |  |  |  | 'C' => 'C', | 
| 411 |  |  |  |  |  |  | 'G' => 'G', | 
| 412 |  |  |  |  |  |  | 'R' => '[AG]', | 
| 413 |  |  |  |  |  |  | 'Y' => '[CT]', | 
| 414 |  |  |  |  |  |  | 'M' => '[AC]', | 
| 415 |  |  |  |  |  |  | 'K' => '[GT]', | 
| 416 |  |  |  |  |  |  | 'S' => '[CG]', | 
| 417 |  |  |  |  |  |  | 'W' => '[AT]', | 
| 418 |  |  |  |  |  |  | 'H' => '[ACT]', | 
| 419 |  |  |  |  |  |  | 'B' => '[CGT]', | 
| 420 |  |  |  |  |  |  | 'V' => '[ACG]', | 
| 421 |  |  |  |  |  |  | 'D' => '[AGT]', | 
| 422 |  |  |  |  |  |  | 'N' => '[ACGT]', | 
| 423 |  |  |  |  |  |  | ); | 
| 424 | 0 | 0 |  |  |  |  | return join '', map { exists $bases{$_} ? $bases{$_} : $_ } | 
|  | 0 |  |  |  |  |  |  | 
| 425 |  |  |  |  |  |  | split //, uc $seq; | 
| 426 |  |  |  |  |  |  | } | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | =head2 match_regexp | 
| 429 |  |  |  |  |  |  |  | 
| 430 |  |  |  |  |  |  | Find all sites matching the regular expression. | 
| 431 |  |  |  |  |  |  |  | 
| 432 |  |  |  |  |  |  | See https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_locate_motif.pl | 
| 433 |  |  |  |  |  |  |  | 
| 434 |  |  |  |  |  |  | =cut | 
| 435 |  |  |  |  |  |  |  | 
| 436 |  |  |  |  |  |  | sub match_regexp { | 
| 437 | 0 |  |  | 0 | 1 |  | my ( $r, $s ) = @_; | 
| 438 | 0 |  |  |  |  |  | my @matched = (); | 
| 439 | 0 |  |  |  |  |  | my $pos     = -1; | 
| 440 | 0 |  |  |  |  |  | while ( $s =~ /($r)/ig ) { | 
| 441 | 0 |  |  |  |  |  | $pos = pos $s; | 
| 442 |  |  |  |  |  |  |  | 
| 443 |  |  |  |  |  |  | # return start, end, matched string | 
| 444 |  |  |  |  |  |  | # start and end are 0-based | 
| 445 | 0 |  |  |  |  |  | push @matched, [ $pos - length($1), $pos - 1, $1 ]; | 
| 446 | 0 |  |  |  |  |  | pos $s = $pos - length($1) + 1; | 
| 447 |  |  |  |  |  |  | } | 
| 448 | 0 |  |  |  |  |  | return \@matched; | 
| 449 |  |  |  |  |  |  | } | 
| 450 |  |  |  |  |  |  |  | 
| 451 |  |  |  |  |  |  | =head2 dna2peptide | 
| 452 |  |  |  |  |  |  |  | 
| 453 |  |  |  |  |  |  | Translate DNA sequence into a peptide | 
| 454 |  |  |  |  |  |  |  | 
| 455 |  |  |  |  |  |  | =cut | 
| 456 |  |  |  |  |  |  |  | 
| 457 |  |  |  |  |  |  | sub dna2peptide { | 
| 458 | 0 |  |  | 0 | 1 |  | my ($dna) = @_; | 
| 459 | 0 |  |  |  |  |  | my $protein = ''; | 
| 460 |  |  |  |  |  |  |  | 
| 461 |  |  |  |  |  |  | # Translate each three-base codon to an amino acid, and append to a protein | 
| 462 | 0 |  |  |  |  |  | for ( my $i = 0; $i < ( length($dna) - 2 ); $i += 3 ) { | 
| 463 | 0 |  |  |  |  |  | $protein .= codon2aa( substr( $dna, $i, 3 ) ); | 
| 464 |  |  |  |  |  |  | } | 
| 465 | 0 |  |  |  |  |  | return $protein; | 
| 466 |  |  |  |  |  |  | } | 
| 467 |  |  |  |  |  |  |  | 
| 468 |  |  |  |  |  |  | =head2 codon2aa | 
| 469 |  |  |  |  |  |  |  | 
| 470 |  |  |  |  |  |  | Translate a DNA 3-character codon to an amino acid | 
| 471 |  |  |  |  |  |  |  | 
| 472 |  |  |  |  |  |  | =cut | 
| 473 |  |  |  |  |  |  |  | 
| 474 |  |  |  |  |  |  | sub codon2aa { | 
| 475 | 0 |  |  | 0 | 1 |  | my ($codon) = @_; | 
| 476 | 0 |  |  |  |  |  | $codon = uc $codon; | 
| 477 | 0 |  |  |  |  |  | my %genetic_code = ( | 
| 478 |  |  |  |  |  |  | 'TCA' => 'S',    # Serine | 
| 479 |  |  |  |  |  |  | 'TCC' => 'S',    # Serine | 
| 480 |  |  |  |  |  |  | 'TCG' => 'S',    # Serine | 
| 481 |  |  |  |  |  |  | 'TCT' => 'S',    # Serine | 
| 482 |  |  |  |  |  |  | 'TTC' => 'F',    # Phenylalanine | 
| 483 |  |  |  |  |  |  | 'TTT' => 'F',    # Phenylalanine | 
| 484 |  |  |  |  |  |  | 'TTA' => 'L',    # Leucine | 
| 485 |  |  |  |  |  |  | 'TTG' => 'L',    # Leucine | 
| 486 |  |  |  |  |  |  | 'TAC' => 'Y',    # Tyrosine | 
| 487 |  |  |  |  |  |  | 'TAT' => 'Y',    # Tyrosine | 
| 488 |  |  |  |  |  |  | 'TAA' => '_',    # Stop | 
| 489 |  |  |  |  |  |  | 'TAG' => '_',    # Stop | 
| 490 |  |  |  |  |  |  | 'TGC' => 'C',    # Cysteine | 
| 491 |  |  |  |  |  |  | 'TGT' => 'C',    # Cysteine | 
| 492 |  |  |  |  |  |  | 'TGA' => '_',    # Stop | 
| 493 |  |  |  |  |  |  | 'TGG' => 'W',    # Tryptophan | 
| 494 |  |  |  |  |  |  | 'CTA' => 'L',    # Leucine | 
| 495 |  |  |  |  |  |  | 'CTC' => 'L',    # Leucine | 
| 496 |  |  |  |  |  |  | 'CTG' => 'L',    # Leucine | 
| 497 |  |  |  |  |  |  | 'CTT' => 'L',    # Leucine | 
| 498 |  |  |  |  |  |  | 'CCA' => 'P',    # Proline | 
| 499 |  |  |  |  |  |  | 'CCC' => 'P',    # Proline | 
| 500 |  |  |  |  |  |  | 'CCG' => 'P',    # Proline | 
| 501 |  |  |  |  |  |  | 'CCT' => 'P',    # Proline | 
| 502 |  |  |  |  |  |  | 'CAC' => 'H',    # Histidine | 
| 503 |  |  |  |  |  |  | 'CAT' => 'H',    # Histidine | 
| 504 |  |  |  |  |  |  | 'CAA' => 'Q',    # Glutamine | 
| 505 |  |  |  |  |  |  | 'CAG' => 'Q',    # Glutamine | 
| 506 |  |  |  |  |  |  | 'CGA' => 'R',    # Arginine | 
| 507 |  |  |  |  |  |  | 'CGC' => 'R',    # Arginine | 
| 508 |  |  |  |  |  |  | 'CGG' => 'R',    # Arginine | 
| 509 |  |  |  |  |  |  | 'CGT' => 'R',    # Arginine | 
| 510 |  |  |  |  |  |  | 'ATA' => 'I',    # Isoleucine | 
| 511 |  |  |  |  |  |  | 'ATC' => 'I',    # Isoleucine | 
| 512 |  |  |  |  |  |  | 'ATT' => 'I',    # Isoleucine | 
| 513 |  |  |  |  |  |  | 'ATG' => 'M',    # Methionine | 
| 514 |  |  |  |  |  |  | 'ACA' => 'T',    # Threonine | 
| 515 |  |  |  |  |  |  | 'ACC' => 'T',    # Threonine | 
| 516 |  |  |  |  |  |  | 'ACG' => 'T',    # Threonine | 
| 517 |  |  |  |  |  |  | 'ACT' => 'T',    # Threonine | 
| 518 |  |  |  |  |  |  | 'AAC' => 'N',    # Asparagine | 
| 519 |  |  |  |  |  |  | 'AAT' => 'N',    # Asparagine | 
| 520 |  |  |  |  |  |  | 'AAA' => 'K',    # Lysine | 
| 521 |  |  |  |  |  |  | 'AAG' => 'K',    # Lysine | 
| 522 |  |  |  |  |  |  | 'AGC' => 'S',    # Serine | 
| 523 |  |  |  |  |  |  | 'AGT' => 'S',    # Serine | 
| 524 |  |  |  |  |  |  | 'AGA' => 'R',    # Arginine | 
| 525 |  |  |  |  |  |  | 'AGG' => 'R',    # Arginine | 
| 526 |  |  |  |  |  |  | 'GTA' => 'V',    # Valine | 
| 527 |  |  |  |  |  |  | 'GTC' => 'V',    # Valine | 
| 528 |  |  |  |  |  |  | 'GTG' => 'V',    # Valine | 
| 529 |  |  |  |  |  |  | 'GTT' => 'V',    # Valine | 
| 530 |  |  |  |  |  |  | 'GCA' => 'A',    # Alanine | 
| 531 |  |  |  |  |  |  | 'GCC' => 'A',    # Alanine | 
| 532 |  |  |  |  |  |  | 'GCG' => 'A',    # Alanine | 
| 533 |  |  |  |  |  |  | 'GCT' => 'A',    # Alanine | 
| 534 |  |  |  |  |  |  | 'GAC' => 'D',    # Aspartic Acid | 
| 535 |  |  |  |  |  |  | 'GAT' => 'D',    # Aspartic Acid | 
| 536 |  |  |  |  |  |  | 'GAA' => 'E',    # Glutamic Acid | 
| 537 |  |  |  |  |  |  | 'GAG' => 'E',    # Glutamic Acid | 
| 538 |  |  |  |  |  |  | 'GGA' => 'G',    # Glycine | 
| 539 |  |  |  |  |  |  | 'GGC' => 'G',    # Glycine | 
| 540 |  |  |  |  |  |  | 'GGG' => 'G',    # Glycine | 
| 541 |  |  |  |  |  |  | 'GGT' => 'G',    # Glycine | 
| 542 |  |  |  |  |  |  | ); | 
| 543 |  |  |  |  |  |  |  | 
| 544 | 0 | 0 |  |  |  |  | if ( exists $genetic_code{$codon} ) { | 
| 545 | 0 |  |  |  |  |  | return $genetic_code{$codon}; | 
| 546 |  |  |  |  |  |  | } | 
| 547 |  |  |  |  |  |  | else { | 
| 548 | 0 |  |  |  |  |  | print STDERR "Bad codon \"$codon\"!!\n"; | 
| 549 | 0 |  |  |  |  |  | exit; | 
| 550 |  |  |  |  |  |  | } | 
| 551 |  |  |  |  |  |  | } | 
| 552 |  |  |  |  |  |  |  | 
| 553 |  |  |  |  |  |  | =head2 generate_random_seqence | 
| 554 |  |  |  |  |  |  |  | 
| 555 |  |  |  |  |  |  | Example: | 
| 556 |  |  |  |  |  |  |  | 
| 557 |  |  |  |  |  |  | my @alphabet = qw/a c g t/; | 
| 558 |  |  |  |  |  |  | my $seq = generate_random_seqence( \@alphabet, 50 ); | 
| 559 |  |  |  |  |  |  |  | 
| 560 |  |  |  |  |  |  | =cut | 
| 561 |  |  |  |  |  |  |  | 
| 562 |  |  |  |  |  |  | sub generate_random_seqence { | 
| 563 | 0 |  |  | 0 | 1 |  | my ( $alphabet, $length ) = @_; | 
| 564 | 0 | 0 |  |  |  |  | unless ( ref $alphabet eq 'ARRAY' ) { | 
| 565 | 0 |  |  |  |  |  | warn "alphabet should be ref of array\n"; | 
| 566 | 0 |  |  |  |  |  | return 0; | 
| 567 |  |  |  |  |  |  | } | 
| 568 |  |  |  |  |  |  |  | 
| 569 | 0 |  |  |  |  |  | my $n = @$alphabet; | 
| 570 | 0 |  |  |  |  |  | my $seq; | 
| 571 | 0 |  |  |  |  |  | $seq .= $$alphabet[ int rand($n) ] for ( 1 .. $length ); | 
| 572 | 0 |  |  |  |  |  | return $seq; | 
| 573 |  |  |  |  |  |  | } | 
| 574 |  |  |  |  |  |  |  | 
| 575 |  |  |  |  |  |  | =head2 shuffle sequences | 
| 576 |  |  |  |  |  |  |  | 
| 577 |  |  |  |  |  |  | Example: | 
| 578 |  |  |  |  |  |  |  | 
| 579 |  |  |  |  |  |  | shuffle_sequences($file, "$file.shuf.fa"); | 
| 580 |  |  |  |  |  |  |  | 
| 581 |  |  |  |  |  |  | =cut | 
| 582 |  |  |  |  |  |  |  | 
| 583 |  |  |  |  |  |  | sub shuffle_sequences { | 
| 584 | 0 |  |  | 0 | 0 |  | my ( $file, $file_out, $not_trim ) = @_; | 
| 585 | 0 |  |  |  |  |  | my $seqs = read_sequence_from_fasta_file( $file, $not_trim ); | 
| 586 | 0 |  |  |  |  |  | my @keys = shuffle( keys %$seqs ); | 
| 587 |  |  |  |  |  |  |  | 
| 588 | 0 | 0 |  |  |  |  | $file_out = "$file.shuffled.fa" unless defined $file_out; | 
| 589 | 0 | 0 |  |  |  |  | open my $fh2, ">$file_out" or die "fail to write file $file_out\n"; | 
| 590 | 0 |  |  |  |  |  | print $fh2 ">$_\n$$seqs{$_}\n" for @keys; | 
| 591 | 0 |  |  |  |  |  | close $fh2; | 
| 592 |  |  |  |  |  |  |  | 
| 593 | 0 |  |  |  |  |  | return $file_out; | 
| 594 |  |  |  |  |  |  | } | 
| 595 |  |  |  |  |  |  |  | 
| 596 |  |  |  |  |  |  | =head2 rename_fasta_header | 
| 597 |  |  |  |  |  |  |  | 
| 598 |  |  |  |  |  |  | Rename fasta header with regexp. | 
| 599 |  |  |  |  |  |  |  | 
| 600 |  |  |  |  |  |  | Example: | 
| 601 |  |  |  |  |  |  |  | 
| 602 |  |  |  |  |  |  | # delete some symbols | 
| 603 |  |  |  |  |  |  | my $n = rename_fasta_header('[^a-z\d\s\-\_\(\)\[\]\|]', '', $file, "$file.rename.fa"); | 
| 604 |  |  |  |  |  |  | print "$n records renamed\n"; | 
| 605 |  |  |  |  |  |  |  | 
| 606 |  |  |  |  |  |  | =cut | 
| 607 |  |  |  |  |  |  |  | 
| 608 |  |  |  |  |  |  | sub rename_fasta_header { | 
| 609 | 0 |  |  | 0 | 1 |  | my ( $regex, $repalcement, $file, $outfile ) = @_; | 
| 610 |  |  |  |  |  |  |  | 
| 611 | 0 | 0 |  |  |  |  | open my $fh,  "<", $file    or die "fail to open file: $file\n"; | 
| 612 | 0 | 0 |  |  |  |  | open my $fh2, ">", $outfile or die "fail to wirte file: $outfile\n"; | 
| 613 |  |  |  |  |  |  |  | 
| 614 | 0 |  |  |  |  |  | my $head = ''; | 
| 615 | 0 |  |  |  |  |  | my $n    = 0; | 
| 616 | 0 |  |  |  |  |  | while (<$fh>) { | 
| 617 | 0 | 0 |  |  |  |  | if (/^\s*>(.*)\r?\n/) { | 
| 618 | 0 |  |  |  |  |  | $head = $1; | 
| 619 | 0 | 0 |  |  |  |  | if ( $head =~ /$regex/ ) { | 
| 620 | 0 |  |  |  |  |  | $head =~ s/$regex/$repalcement/g; | 
| 621 | 0 |  |  |  |  |  | $n++; | 
| 622 |  |  |  |  |  |  | } | 
| 623 | 0 |  |  |  |  |  | print $fh2 ">$head\n"; | 
| 624 |  |  |  |  |  |  | } | 
| 625 |  |  |  |  |  |  | else { | 
| 626 | 0 |  |  |  |  |  | print $fh2 $_; | 
| 627 |  |  |  |  |  |  | } | 
| 628 |  |  |  |  |  |  | } | 
| 629 | 0 |  |  |  |  |  | close $fh; | 
| 630 | 0 |  |  |  |  |  | close $fh2; | 
| 631 |  |  |  |  |  |  |  | 
| 632 | 0 |  |  |  |  |  | return $n; | 
| 633 |  |  |  |  |  |  | } | 
| 634 |  |  |  |  |  |  |  | 
| 635 |  |  |  |  |  |  | =head2 clean_fasta_header | 
| 636 |  |  |  |  |  |  |  | 
| 637 |  |  |  |  |  |  | Rename given symbols to repalcement string. | 
| 638 |  |  |  |  |  |  | Because, some symbols in fasta header will cause unexpected result. | 
| 639 |  |  |  |  |  |  |  | 
| 640 |  |  |  |  |  |  | Example: | 
| 641 |  |  |  |  |  |  |  | 
| 642 |  |  |  |  |  |  | my  $file = "test.fa"; | 
| 643 |  |  |  |  |  |  | my $n = clean_fasta_header($file, "$file.rename.fa"); | 
| 644 |  |  |  |  |  |  | # replace any symbol in (\/:*?"<>|) with '', i.e. deleting. | 
| 645 |  |  |  |  |  |  | # my $n = clean_fasta_header($file, "$file.rename.fa", '',  '\/:*?"<>|'); | 
| 646 |  |  |  |  |  |  | print "$n records renamed\n"; | 
| 647 |  |  |  |  |  |  |  | 
| 648 |  |  |  |  |  |  | =cut | 
| 649 |  |  |  |  |  |  |  | 
| 650 |  |  |  |  |  |  | sub clean_fasta_header { | 
| 651 | 0 |  |  | 0 | 1 |  | my ( $file, $outfile, $replacement, $symbols ) = @_; | 
| 652 | 0 | 0 |  |  |  |  | $replacement = "_" unless defined $replacement; | 
| 653 |  |  |  |  |  |  |  | 
| 654 | 0 |  |  |  |  |  | my @default = split //, '\/:*?"<>|'; | 
| 655 | 0 | 0 |  |  |  |  | $symbols = \@default unless defined $symbols; | 
| 656 | 0 | 0 |  |  |  |  | unless ( ref $symbols eq 'ARRAY' ) { | 
| 657 | 0 |  |  |  |  |  | warn "symbols should be ref of array\n"; | 
| 658 | 0 |  |  |  |  |  | return 0; | 
| 659 |  |  |  |  |  |  | } | 
| 660 | 0 |  |  |  |  |  | my $re = join '', map { quotemeta $_ } @$symbols; | 
|  | 0 |  |  |  |  |  |  | 
| 661 | 0 | 0 |  |  |  |  | open my $fh,  "<", $file    or die "fail to open file: $file\n"; | 
| 662 | 0 | 0 |  |  |  |  | open my $fh2, ">", $outfile or die "fail to wirte file: $outfile\n"; | 
| 663 |  |  |  |  |  |  |  | 
| 664 | 0 |  |  |  |  |  | my $head = ''; | 
| 665 | 0 |  |  |  |  |  | my $n    = 0; | 
| 666 | 0 |  |  |  |  |  | while (<$fh>) { | 
| 667 | 0 | 0 |  |  |  |  | if (/^\s*>(.*)\r?\n/) { | 
| 668 | 0 |  |  |  |  |  | $head = $1; | 
| 669 | 0 | 0 |  |  |  |  | if ( $head =~ /[$re]/ ) { | 
| 670 | 0 |  |  |  |  |  | $head =~ s/[$re]/$replacement/g; | 
| 671 | 0 |  |  |  |  |  | $n++; | 
| 672 |  |  |  |  |  |  | } | 
| 673 | 0 |  |  |  |  |  | print $fh2 ">$head\n"; | 
| 674 |  |  |  |  |  |  | } | 
| 675 |  |  |  |  |  |  | else { | 
| 676 | 0 |  |  |  |  |  | print $fh2 $_; | 
| 677 |  |  |  |  |  |  | } | 
| 678 |  |  |  |  |  |  | } | 
| 679 | 0 |  |  |  |  |  | close $fh; | 
| 680 | 0 |  |  |  |  |  | close $fh2; | 
| 681 |  |  |  |  |  |  |  | 
| 682 | 0 |  |  |  |  |  | return $n; | 
| 683 |  |  |  |  |  |  | } | 
| 684 |  |  |  |  |  |  |  | 
| 685 |  |  |  |  |  |  | 1; |