| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Bio::MUST::Core::Seq; | 
| 2 |  |  |  |  |  |  | # ABSTRACT: Nucleotide or protein sequence | 
| 3 |  |  |  |  |  |  | # CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be> | 
| 4 |  |  |  |  |  |  | # CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com> | 
| 5 |  |  |  |  |  |  | # CONTRIBUTOR: Valerian LUPO <valerian.lupo@doct.uliege.be> | 
| 6 |  |  |  |  |  |  | $Bio::MUST::Core::Seq::VERSION = '0.212650'; | 
| 7 | 17 |  |  | 17 |  | 139 | use Moose; | 
|  | 17 |  |  |  |  | 41 |  | 
|  | 17 |  |  |  |  | 167 |  | 
| 8 | 17 |  |  | 17 |  | 97138 | use MooseX::SemiAffordanceAccessor; | 
|  | 17 |  |  |  |  | 137079 |  | 
|  | 17 |  |  |  |  | 76 |  | 
| 9 | 17 |  |  | 17 |  | 137026 | use namespace::autoclean; | 
|  | 17 |  |  |  |  | 49 |  | 
|  | 17 |  |  |  |  | 191 |  | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | # use Smart::Comments '###'; | 
| 12 |  |  |  |  |  |  |  | 
| 13 | 17 |  |  | 17 |  | 1748 | use autodie; | 
|  | 17 |  |  |  |  | 45 |  | 
|  | 17 |  |  |  |  | 501 |  | 
| 14 | 17 |  |  | 17 |  | 95028 | use feature qw(say); | 
|  | 17 |  |  |  |  | 67 |  | 
|  | 17 |  |  |  |  | 1889 |  | 
| 15 |  |  |  |  |  |  |  | 
| 16 | 17 |  |  | 17 |  | 331 | use Carp; | 
|  | 17 |  |  |  |  | 47 |  | 
|  | 17 |  |  |  |  | 1475 |  | 
| 17 |  |  |  |  |  |  |  | 
| 18 | 17 |  |  | 17 |  | 125 | use Bio::MUST::Core::Types; | 
|  | 17 |  |  |  |  | 40 |  | 
|  | 17 |  |  |  |  | 622 |  | 
| 19 | 17 |  |  | 17 |  | 127 | use Bio::MUST::Core::Constants qw(:seqtypes :seqids :gaps); | 
|  | 17 |  |  |  |  | 42 |  | 
|  | 17 |  |  |  |  | 4661 |  | 
| 20 | 17 |  |  | 17 |  | 10027 | use aliased 'Bio::MUST::Core::SeqId'; | 
|  | 17 |  |  |  |  | 13593 |  | 
|  | 17 |  |  |  |  | 152 |  | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | has 'seq_id' => ( | 
| 23 |  |  |  |  |  |  | is       => 'rw', | 
| 24 |  |  |  |  |  |  | isa      => 'Bio::MUST::Core::SeqId', | 
| 25 |  |  |  |  |  |  | required => 1, | 
| 26 |  |  |  |  |  |  | coerce   => 1, | 
| 27 |  |  |  |  |  |  | handles  => qr{.*}xms,      # expose all SeqId methods (and attributes) | 
| 28 |  |  |  |  |  |  | ); | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | has 'seq' => ( | 
| 32 |  |  |  |  |  |  | traits   => ['String'], | 
| 33 |  |  |  |  |  |  | is       => 'ro', | 
| 34 |  |  |  |  |  |  | isa      => 'Bio::MUST::Core::Types::Seq', | 
| 35 |  |  |  |  |  |  | default  => q{},            # can be empty | 
| 36 |  |  |  |  |  |  | coerce   => 1, | 
| 37 |  |  |  |  |  |  | writer   => '_set_seq', | 
| 38 |  |  |  |  |  |  | handles  => { | 
| 39 |  |  |  |  |  |  | seq_len => 'length', | 
| 40 |  |  |  |  |  |  | append_seq => 'append', | 
| 41 |  |  |  |  |  |  | replace_seq => 'replace', | 
| 42 |  |  |  |  |  |  | edit_seq => 'substr', | 
| 43 |  |  |  |  |  |  | }, | 
| 44 |  |  |  |  |  |  | ); | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | # TODO: check whether this could be done by some Moose extension | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | sub clone { | 
| 51 | 512 |  |  | 512 | 1 | 847 | my $self = shift; | 
| 52 |  |  |  |  |  |  |  | 
| 53 | 512 |  |  |  |  | 1448 | return $self->new( | 
| 54 |  |  |  |  |  |  | seq_id => $self->full_id, seq => $self->seq | 
| 55 |  |  |  |  |  |  | ); | 
| 56 |  |  |  |  |  |  | } | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | # boolean assertions | 
| 60 |  |  |  |  |  |  | # TODO: optimize these assertions via caching | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | sub is_protein { | 
| 64 | 753 |  |  | 753 | 1 | 1223 | my $self = shift; | 
| 65 | 753 | 100 |  |  |  | 19070 | return 1 if $self->seq =~ $PROTLIKE; | 
| 66 | 153 |  |  |  |  | 399 | return 0;                               # at least 1 non-nt char | 
| 67 |  |  |  |  |  |  | } | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | sub is_rna { | 
| 71 | 11 |  |  | 11 | 1 | 18 | my $self = shift; | 
| 72 | 11 | 100 | 100 |  |  | 267 | return 1 if $self->seq =~ $RNALIKE && (not $self->is_protein); | 
| 73 | 9 |  |  |  |  | 23 | return 0;                               # at least 1 'U' | 
| 74 |  |  |  |  |  |  | } | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | sub is_aligned { | 
| 78 | 8599 |  |  | 8599 | 1 | 11777 | my $self = shift; | 
| 79 | 8599 | 100 |  |  |  | 212220 | return 1 if $self->seq =~ $GAP;         # at least 1 gap-like char | 
| 80 | 8403 |  |  |  |  | 17450 | return 0; | 
| 81 |  |  |  |  |  |  | } | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  |  | 
| 84 |  |  |  |  |  |  | sub is_subseq_of { | 
| 85 | 36 |  |  | 36 | 1 | 60 | my $self = shift; | 
| 86 | 36 |  |  |  |  | 51 | my $seq2 = shift;                       # can be a mere string | 
| 87 |  |  |  |  |  |  |  | 
| 88 | 36 |  |  |  |  | 67 | $self = $self->raw_str; | 
| 89 | 36 | 100 |  |  |  | 184 | $seq2 = $seq2->isa('Bio::MUST::Core::Seq') | 
| 90 |  |  |  |  |  |  | ? $seq2->raw_str : _strip_gaps($seq2); | 
| 91 | 36 | 100 |  |  |  | 260 | return 1 if $seq2 =~ m/$self/xmsi;      # case-insensitive comparison | 
| 92 | 16 |  |  |  |  | 43 | return 0;                               # only here because expensive! | 
| 93 |  |  |  |  |  |  | } | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  | sub is_superseq_of { | 
| 97 | 36 |  |  | 36 | 1 | 62 | my $self = shift; | 
| 98 | 36 |  |  |  |  | 49 | my $seq2 = shift;                       # can be a mere string | 
| 99 |  |  |  |  |  |  |  | 
| 100 | 36 |  |  |  |  | 58 | $self = $self->raw_str; | 
| 101 | 36 | 100 |  |  |  | 158 | $seq2 = $seq2->isa('Bio::MUST::Core::Seq') | 
| 102 |  |  |  |  |  |  | ? $seq2->raw_str : _strip_gaps($seq2); | 
| 103 | 36 | 100 |  |  |  | 250 | return 1 if $self =~ m/$seq2/xmsi;      # case-insensitive comparison | 
| 104 | 16 |  |  |  |  | 44 | return 0;                               # only here because expensive! | 
| 105 |  |  |  |  |  |  | } | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | sub first_site { | 
| 109 | 11 |  |  | 11 | 1 | 17 | my $self = shift; | 
| 110 |  |  |  |  |  |  |  | 
| 111 | 11 |  |  |  |  | 266 | my ($leading_gaps) = $self->seq =~ m{ \A ($GAP+) }xms; | 
| 112 | 11 |  | 100 |  |  | 53 | return length $leading_gaps // 0; | 
| 113 |  |  |  |  |  |  | } | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | sub uc {                            ## no critic (ProhibitBuiltinHomonyms) | 
| 117 | 5 |  |  | 5 | 1 | 8 | my $self = shift; | 
| 118 |  |  |  |  |  |  |  | 
| 119 | 5 |  |  |  |  | 123 | $self->_set_seq( uc $self->seq ); | 
| 120 | 5 |  |  |  |  | 16 | return $self; | 
| 121 |  |  |  |  |  |  | } | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | sub uc_seq {                                ## no critic (RequireArgUnpacking) | 
| 124 | 0 |  |  | 0 | 0 | 0 | carp '[BMC] Warning: Method uc_seq is deprecated; use uc instead!'; | 
| 125 | 0 |  |  |  |  | 0 | return shift->uc(@_); | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  |  | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | sub recode { | 
| 130 | 5 |  |  | 5 | 1 | 9 | my $self     = shift; | 
| 131 | 5 |  |  |  |  | 7 | my $base_for = shift; | 
| 132 |  |  |  |  |  |  |  | 
| 133 | 5 |  |  |  |  | 14 | my @states = $self->all_states; | 
| 134 | 5 |  |  |  |  | 14 | my @rec_states; | 
| 135 |  |  |  |  |  |  |  | 
| 136 | 5 |  |  |  |  | 7 | for my $state (@states) { | 
| 137 | 700 |  | 33 |  |  | 1152 | my $rec_state = $base_for->{$state} // $FRAMESHIFT; | 
| 138 | 700 |  |  |  |  | 1008 | push @rec_states, $rec_state; | 
| 139 |  |  |  |  |  |  | } | 
| 140 |  |  |  |  |  |  |  | 
| 141 | 5 |  |  |  |  | 42 | my $new_seq = join q{}, @rec_states; | 
| 142 | 5 |  |  |  |  | 158 | $self->_set_seq($new_seq); | 
| 143 |  |  |  |  |  |  |  | 
| 144 | 5 |  |  |  |  | 74 | return $self; | 
| 145 |  |  |  |  |  |  | } | 
| 146 |  |  |  |  |  |  |  | 
| 147 |  |  |  |  |  |  | sub recode_seq {                            ## no critic (RequireArgUnpacking) | 
| 148 | 0 |  |  | 0 | 0 | 0 | carp '[BMC] Warning: Method recode_seq is deprecated; use recode instead!'; | 
| 149 | 0 |  |  |  |  | 0 | return shift->recode(@_); | 
| 150 |  |  |  |  |  |  | } | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | # gap cleaning methods | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  | sub degap { | 
| 156 | 38 |  |  | 38 | 1 | 94 | my $self = shift; | 
| 157 |  |  |  |  |  |  |  | 
| 158 | 38 |  |  |  |  | 101 | $self->_set_seq($self->raw_str); | 
| 159 | 38 |  |  |  |  | 287 | return $self; | 
| 160 |  |  |  |  |  |  | } | 
| 161 |  |  |  |  |  |  |  | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | sub gapify { | 
| 164 | 181 |  |  | 181 | 1 | 316 | my $self = shift; | 
| 165 | 181 |  | 100 |  |  | 455 | my $char = shift // '*';                # defaults to gap | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | # original version: very general but much too slow on large matrices | 
| 168 |  |  |  |  |  |  | # my $regex = $PROTMISS;                  # defaults to protein seq | 
| 169 |  |  |  |  |  |  | # | 
| 170 |  |  |  |  |  |  | # # in case of DNA ensure correct 'missification' (if applicable) | 
| 171 |  |  |  |  |  |  | # unless ($self->is_protein) { | 
| 172 |  |  |  |  |  |  | #     $regex = $DNAMISS; | 
| 173 |  |  |  |  |  |  | #     $char = 'N' if $char =~ $DNAMISS; | 
| 174 |  |  |  |  |  |  | # } | 
| 175 |  |  |  |  |  |  | # | 
| 176 |  |  |  |  |  |  | # ( my $seq = $self->seq ) =~ s{$regex}{$char}xmsg; | 
| 177 |  |  |  |  |  |  |  | 
| 178 |  |  |  |  |  |  | # alternative version: hard-coding $char (if possible) gives a 250% boost | 
| 179 |  |  |  |  |  |  | # my $seq = $self->seq; | 
| 180 |  |  |  |  |  |  | # if ($self->is_protein) { | 
| 181 |  |  |  |  |  |  | #     if    ($char eq '*') { | 
| 182 |  |  |  |  |  |  | #         $seq =~ s{$PROTMISS}{*}xmsg; | 
| 183 |  |  |  |  |  |  | #     } | 
| 184 |  |  |  |  |  |  | #     elsif ($char eq 'X') { | 
| 185 |  |  |  |  |  |  | #         $seq =~ s{$PROTMISS}{X}xmsg; | 
| 186 |  |  |  |  |  |  | #     } | 
| 187 |  |  |  |  |  |  | #     else { | 
| 188 |  |  |  |  |  |  | #         $seq =~ s{$PROTMISS}{$char}xmsg; | 
| 189 |  |  |  |  |  |  | #     } | 
| 190 |  |  |  |  |  |  | # } | 
| 191 |  |  |  |  |  |  | # else { | 
| 192 |  |  |  |  |  |  | #     if    ($char eq '*') { | 
| 193 |  |  |  |  |  |  | #         $seq =~ s{$DNAMISS}{*}xmsg; | 
| 194 |  |  |  |  |  |  | #     } | 
| 195 |  |  |  |  |  |  | #     elsif ($char eq 'X') { | 
| 196 |  |  |  |  |  |  | #         $seq =~ s{$DNAMISS}{N}xmsg; | 
| 197 |  |  |  |  |  |  | #     } | 
| 198 |  |  |  |  |  |  | #     else { | 
| 199 |  |  |  |  |  |  | #         $seq =~ s{$DNAMISS}{$char}xmsg; | 
| 200 |  |  |  |  |  |  | #     } | 
| 201 |  |  |  |  |  |  | # } | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | # current version: hard-coded tr/// for 900% boost with s/// fall-back | 
| 204 |  |  |  |  |  |  | # Note: $PROTMISS and $DNAMISS regexes in Constants.pm are ignored! | 
| 205 | 181 |  |  |  |  | 4674 | my $seq = $self->seq; | 
| 206 | 181 | 100 |  |  |  | 458 | if ($self->is_protein) { | 
| 207 | 173 | 100 |  |  |  | 524 | if    ($char eq '*') { | 
|  |  | 50 |  |  |  |  |  | 
| 208 | 10 |  |  |  |  | 28 | $seq =~ tr{?XxBJOUZbjouz}{*}; | 
| 209 |  |  |  |  |  |  | } | 
| 210 |  |  |  |  |  |  | elsif ($char eq 'X') { | 
| 211 | 163 |  |  |  |  | 841 | $seq =~ tr{?XxBJOUZbjouz}{X}; | 
| 212 |  |  |  |  |  |  | } | 
| 213 |  |  |  |  |  |  | else { | 
| 214 | 0 |  |  |  |  | 0 | $seq =~ s{[?XxBJOUZbjouz]}{$char}xmsg; | 
| 215 |  |  |  |  |  |  | } | 
| 216 |  |  |  |  |  |  | } | 
| 217 |  |  |  |  |  |  | else { | 
| 218 | 8 | 100 |  |  |  | 29 | if    ($char eq '*') { | 
|  |  | 50 |  |  |  |  |  | 
| 219 | 4 |  |  |  |  | 11 | $seq =~ tr{?XxNnBDHKMRSVWYbdhkmrsvwy}{*}; | 
| 220 |  |  |  |  |  |  | } | 
| 221 |  |  |  |  |  |  | elsif ($char eq 'X') { | 
| 222 | 4 |  |  |  |  | 10 | $seq =~ tr{?XxNnBDHKMRSVWYbdhkmrsvwy}{N}; | 
| 223 |  |  |  |  |  |  | } | 
| 224 |  |  |  |  |  |  | else { | 
| 225 | 0 |  |  |  |  | 0 | $seq =~ s{[?XxNnBDHKMRSVWYbdhkmrsvwy]}{$char}xmsg; | 
| 226 |  |  |  |  |  |  | } | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  |  | 
| 229 | 181 |  |  |  |  | 5400 | $self->_set_seq($seq); | 
| 230 | 181 |  |  |  |  | 642 | return $self; | 
| 231 |  |  |  |  |  |  | } | 
| 232 |  |  |  |  |  |  |  | 
| 233 |  |  |  |  |  |  |  | 
| 234 |  |  |  |  |  |  | sub spacify { | 
| 235 | 3069 |  |  | 3069 | 1 | 4957 | my $self = shift; | 
| 236 |  |  |  |  |  |  |  | 
| 237 | 3069 |  |  |  |  | 74635 | my $seq = $self->seq; | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | # uniformize runs of [*-space] having at least one 'true' space | 
| 240 |  |  |  |  |  |  | # Note: we cannot use replace_seq because of the g flag | 
| 241 | 3069 |  |  |  |  | 105886 | $seq =~ s{ ( $GAP+ \ + ) }{ ' ' x length($1) }xmseg; | 
|  | 550 |  |  |  |  | 5652 |  | 
| 242 | 3069 |  |  |  |  | 80009 | $seq =~ s{ ( \ + $GAP+ ) }{ ' ' x length($1) }xmseg; | 
|  | 561 |  |  |  |  | 5444 |  | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | # Note: two simpler regexes are muuuuuch faster than one complicated regex! | 
| 245 |  |  |  |  |  |  | # $seq =~ s{ ( $GAP* \ + $GAP* ) }{ ' ' x length($1) }xmseg; | 
| 246 |  |  |  |  |  |  |  | 
| 247 | 3069 |  |  |  |  | 89360 | $self->_set_seq($seq); | 
| 248 | 3069 |  |  |  |  | 10453 | return $self; | 
| 249 |  |  |  |  |  |  | } | 
| 250 |  |  |  |  |  |  |  | 
| 251 |  |  |  |  |  |  |  | 
| 252 |  |  |  |  |  |  | sub trim { | 
| 253 | 3068 |  |  | 3068 | 1 | 5248 | my $self = shift; | 
| 254 |  |  |  |  |  |  |  | 
| 255 | 3068 |  |  |  |  | 107964 | $self->replace_seq( qr{ $GAP+\z }xms, q{} ); | 
| 256 | 3068 |  |  |  |  | 12810 | return $self; | 
| 257 |  |  |  |  |  |  | } | 
| 258 |  |  |  |  |  |  |  | 
| 259 |  |  |  |  |  |  |  | 
| 260 |  |  |  |  |  |  | sub pad_to { | 
| 261 | 3068 |  |  | 3068 | 1 | 4844 | my $self  = shift; | 
| 262 | 3068 |  |  |  |  | 4093 | my $bound = shift; | 
| 263 |  |  |  |  |  |  |  | 
| 264 | 3068 |  |  |  |  | 95084 | $self->append_seq( q{ } x ($bound - $self->seq_len) ); | 
| 265 | 3068 |  |  |  |  | 9411 | return $self; | 
| 266 |  |  |  |  |  |  | } | 
| 267 |  |  |  |  |  |  |  | 
| 268 |  |  |  |  |  |  |  | 
| 269 |  |  |  |  |  |  | sub clear_new_tag { | 
| 270 | 3 |  |  | 3 | 1 | 5 | my $self = shift; | 
| 271 |  |  |  |  |  |  |  | 
| 272 | 3 |  |  |  |  | 12 | (my $full_id = $self->full_id) =~ s{$NEW_TAG\z}{}xms; | 
| 273 | 3 |  |  |  |  | 102 | $self->set_seq_id( SeqId->new( full_id => $full_id ) ); | 
| 274 |  |  |  |  |  |  |  | 
| 275 | 3 |  |  |  |  | 86 | return $self; | 
| 276 |  |  |  |  |  |  | } | 
| 277 |  |  |  |  |  |  |  | 
| 278 |  |  |  |  |  |  | # site-wise methods (0-numbered) | 
| 279 |  |  |  |  |  |  |  | 
| 280 |  |  |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | sub all_states { | 
| 282 | 4203 |  |  | 4203 | 1 | 6455 | my $self = shift; | 
| 283 | 4203 |  |  |  |  | 111372 | return split //, $self->seq; | 
| 284 |  |  |  |  |  |  | } | 
| 285 |  |  |  |  |  |  |  | 
| 286 |  |  |  |  |  |  |  | 
| 287 |  |  |  |  |  |  | sub state_at {                              ## no critic (RequireArgUnpacking) | 
| 288 | 2002847 |  |  | 2002847 | 1 | 62953193 | return shift->edit_seq(@_, 1); | 
| 289 |  |  |  |  |  |  | } | 
| 290 |  |  |  |  |  |  |  | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | sub delete_site {                           ## no critic (RequireArgUnpacking) | 
| 293 | 0 |  |  | 0 | 1 | 0 | my $self = shift; | 
| 294 |  |  |  |  |  |  |  | 
| 295 | 0 |  |  |  |  | 0 | $self->edit_seq(@_, 1, q{}); | 
| 296 | 0 |  |  |  |  | 0 | return $self; | 
| 297 |  |  |  |  |  |  | } | 
| 298 |  |  |  |  |  |  |  | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | sub is_missing { | 
| 301 | 40 |  |  | 40 | 1 | 144 | my $self = shift; | 
| 302 | 40 |  |  |  |  | 53 | my $site = shift; | 
| 303 |  |  |  |  |  |  |  | 
| 304 | 40 |  |  |  |  | 75 | my $state = $self->state_at($site); | 
| 305 | 40 | 100 |  |  |  | 190 | return 1 if $state =~ $PROTMISS; | 
| 306 | 38 | 100 | 100 |  |  | 113 | return 1 if $state =~  $DNAMISS && (not $self->is_protein); | 
| 307 | 37 |  |  |  |  | 228 | return 0;                               # X (or N depending on seq type) | 
| 308 |  |  |  |  |  |  | } | 
| 309 |  |  |  |  |  |  |  | 
| 310 |  |  |  |  |  |  |  | 
| 311 |  |  |  |  |  |  | sub is_gap { | 
| 312 | 139 |  |  | 139 | 1 | 256 | my $self = shift; | 
| 313 | 139 |  |  |  |  | 180 | my $site = shift; | 
| 314 |  |  |  |  |  |  |  | 
| 315 | 139 | 100 |  |  |  | 259 | return 1 if $self->state_at($site) =~ $GAP; | 
| 316 | 109 |  |  |  |  | 322 | return 0; | 
| 317 |  |  |  |  |  |  | } | 
| 318 |  |  |  |  |  |  |  | 
| 319 |  |  |  |  |  |  |  | 
| 320 |  |  |  |  |  |  | # global methods | 
| 321 |  |  |  |  |  |  |  | 
| 322 |  |  |  |  |  |  | around qw(purity reverse_complemented_seq codons) => sub { | 
| 323 |  |  |  |  |  |  | my $method = shift; | 
| 324 |  |  |  |  |  |  | my $self   = shift; | 
| 325 |  |  |  |  |  |  |  | 
| 326 |  |  |  |  |  |  | # Note: we return an explicit undef to emulate other accessor behavior | 
| 327 |  |  |  |  |  |  | if ($self->is_protein) { | 
| 328 |  |  |  |  |  |  | carp '[BMC] Warning: sequence looks like a protein; returning undef!'; | 
| 329 |  |  |  |  |  |  | return undef;           ## no critic (ProhibitExplicitReturnUndef) | 
| 330 |  |  |  |  |  |  | } | 
| 331 |  |  |  |  |  |  |  | 
| 332 |  |  |  |  |  |  | return $self->$method(@_); | 
| 333 |  |  |  |  |  |  | }; | 
| 334 |  |  |  |  |  |  |  | 
| 335 |  |  |  |  |  |  |  | 
| 336 |  |  |  |  |  |  | sub nomiss_seq_len { | 
| 337 | 361 |  |  | 361 | 1 | 613 | my $self = shift; | 
| 338 |  |  |  |  |  |  |  | 
| 339 | 361 | 100 |  |  |  | 612 | my $regex = $self->is_protein ? $PROTMISS : $DNAMISS; | 
| 340 | 361 |  |  |  |  | 793 | (my $raw_str = $self->raw_str) =~ s/$regex//xmsg; | 
| 341 |  |  |  |  |  |  | # TODO: decide how to handle ambiguous nucleotides | 
| 342 |  |  |  |  |  |  |  | 
| 343 | 361 |  |  |  |  | 1133 | return length $raw_str; | 
| 344 |  |  |  |  |  |  | } | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | sub purity { | 
| 348 |  |  |  |  |  |  | my $self = shift; | 
| 349 |  |  |  |  |  |  |  | 
| 350 |  |  |  |  |  |  | (my $pure_seq = $self->seq) =~ s/$NONPUREDNA//xmsg; | 
| 351 |  |  |  |  |  |  | my $purity = 1.0 * length($pure_seq) / $self->seq_len; | 
| 352 |  |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  | return $purity; | 
| 354 |  |  |  |  |  |  | } | 
| 355 |  |  |  |  |  |  |  | 
| 356 |  |  |  |  |  |  |  | 
| 357 |  |  |  |  |  |  | sub reverse_complemented_seq { | 
| 358 |  |  |  |  |  |  | my $self = shift; | 
| 359 |  |  |  |  |  |  |  | 
| 360 |  |  |  |  |  |  | # reverse complement and preserve case | 
| 361 |  |  |  |  |  |  | # Note: RNA always becomes DNA | 
| 362 |  |  |  |  |  |  | my $new_seq = scalar reverse $self->seq; | 
| 363 |  |  |  |  |  |  | $new_seq =~ tr/ATUGCYRSWKMBDHVN/TAACGRYSWMKVHDBN/; | 
| 364 |  |  |  |  |  |  | $new_seq =~ tr/atugcyrswkmbdhvn/taacgryswmkvhdbn/; | 
| 365 |  |  |  |  |  |  |  | 
| 366 |  |  |  |  |  |  | return $self->new( seq_id => $self->full_id, seq => $new_seq ); | 
| 367 |  |  |  |  |  |  | } | 
| 368 |  |  |  |  |  |  |  | 
| 369 |  |  |  |  |  |  |  | 
| 370 |  |  |  |  |  |  | sub spliced_seq { | 
| 371 | 1 |  |  | 1 | 1 | 9 | my $self   = shift; | 
| 372 | 1 |  |  |  |  | 3 | my $blocks = shift; | 
| 373 |  |  |  |  |  |  |  | 
| 374 | 1 |  |  |  |  | 2 | my $new_seq; | 
| 375 | 1 |  |  |  |  | 28 | my $seq = $self->seq; | 
| 376 | 1 |  |  |  |  | 2 | for my $block ( @{$blocks} ) { | 
|  | 1 |  |  |  |  | 4 |  | 
| 377 | 8 |  |  |  |  | 11 | my ($start, $end) = @{$block}; | 
|  | 8 |  |  |  |  | 17 |  | 
| 378 | 8 |  |  |  |  | 32 | $new_seq .= substr $seq, $start - 1, $end - $start + 1; | 
| 379 |  |  |  |  |  |  | } | 
| 380 |  |  |  |  |  |  |  | 
| 381 | 1 |  |  |  |  | 10 | return $self->new( seq_id => $self->full_id, seq => $new_seq ); | 
| 382 |  |  |  |  |  |  | } | 
| 383 |  |  |  |  |  |  |  | 
| 384 |  |  |  |  |  |  |  | 
| 385 |  |  |  |  |  |  | sub raw_str { | 
| 386 | 518 |  |  | 518 | 1 | 1098 | my $self = shift; | 
| 387 | 518 |  |  |  |  | 14253 | return _strip_gaps($self->seq); | 
| 388 |  |  |  |  |  |  | } | 
| 389 |  |  |  |  |  |  |  | 
| 390 |  |  |  |  |  |  | sub raw_seq {                               ## no critic (RequireArgUnpacking) | 
| 391 | 0 |  |  | 0 | 0 | 0 | carp '[BMC] Warning: Method raw_seq is deprecated; use raw_str instead!'; | 
| 392 | 0 |  |  |  |  | 0 | return shift->raw_str(@_); | 
| 393 |  |  |  |  |  |  | } | 
| 394 |  |  |  |  |  |  |  | 
| 395 |  |  |  |  |  |  |  | 
| 396 |  |  |  |  |  |  | sub wrapped_str { | 
| 397 | 113 |  |  | 113 | 1 | 157 | my $self  = shift; | 
| 398 | 113 |  | 50 |  |  | 277 | my $chunk = shift // 60; | 
| 399 |  |  |  |  |  |  |  | 
| 400 | 113 | 50 |  |  |  | 253 | my $nowrap = $chunk < 0 ? 1 : 0; | 
| 401 | 113 |  |  |  |  | 3628 | my $width = $self->seq_len; | 
| 402 | 113 | 50 |  |  |  | 262 | $chunk = $width      if $nowrap; | 
| 403 |  |  |  |  |  |  |  | 
| 404 | 113 |  |  |  |  | 150 | my $str; | 
| 405 | 113 |  |  |  |  | 257 | for (my $site = 0; $site < $width; $site += $chunk) { | 
| 406 | 7533 |  |  |  |  | 245838 | $str .= $self->edit_seq($site, $chunk) . "\n"; | 
| 407 |  |  |  |  |  |  | } | 
| 408 |  |  |  |  |  |  |  | 
| 409 | 113 |  |  |  |  | 531 | return $str; | 
| 410 |  |  |  |  |  |  | } | 
| 411 |  |  |  |  |  |  |  | 
| 412 |  |  |  |  |  |  |  | 
| 413 |  |  |  |  |  |  | sub codons { | 
| 414 |  |  |  |  |  |  | my $self  = shift; | 
| 415 |  |  |  |  |  |  | my $frame = shift // 1;             # defaults to frame +1 | 
| 416 |  |  |  |  |  |  |  | 
| 417 |  |  |  |  |  |  | # get specified DNA strand | 
| 418 |  |  |  |  |  |  | my $dna = $frame < 0 ? $self->reverse_complemented_seq->seq : $self->seq; | 
| 419 |  |  |  |  |  |  | $dna =~ tr/Uu/Tt/;                  # ensure DNA | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | # split strand into codons beginning at specified frame | 
| 422 |  |  |  |  |  |  | # ... and discard incomplete codons | 
| 423 |  |  |  |  |  |  | my @codons; | 
| 424 |  |  |  |  |  |  | for (my $i = (abs $frame) - 1; $i < length $dna; $i += 3) { | 
| 425 |  |  |  |  |  |  | my $codon = substr $dna, $i, 3; | 
| 426 |  |  |  |  |  |  | push @codons, $codon if length $codon == 3; | 
| 427 |  |  |  |  |  |  | } | 
| 428 |  |  |  |  |  |  |  | 
| 429 |  |  |  |  |  |  | return \@codons; | 
| 430 |  |  |  |  |  |  | } | 
| 431 |  |  |  |  |  |  |  | 
| 432 |  |  |  |  |  |  |  | 
| 433 |  |  |  |  |  |  | # private subs | 
| 434 |  |  |  |  |  |  |  | 
| 435 |  |  |  |  |  |  | sub _strip_gaps { | 
| 436 | 554 |  |  | 554 |  | 836 | my $seq = shift; | 
| 437 |  |  |  |  |  |  |  | 
| 438 | 554 |  |  |  |  | 3287 | $seq =~ s/$GAP+//xmsg; | 
| 439 | 554 |  |  |  |  | 3849 | return $seq;                                    # strip all gaps | 
| 440 |  |  |  |  |  |  | } | 
| 441 |  |  |  |  |  |  |  | 
| 442 |  |  |  |  |  |  | __PACKAGE__->meta->make_immutable; | 
| 443 |  |  |  |  |  |  | 1; | 
| 444 |  |  |  |  |  |  |  | 
| 445 |  |  |  |  |  |  | __END__ | 
| 446 |  |  |  |  |  |  |  | 
| 447 |  |  |  |  |  |  | =pod | 
| 448 |  |  |  |  |  |  |  | 
| 449 |  |  |  |  |  |  | =head1 NAME | 
| 450 |  |  |  |  |  |  |  | 
| 451 |  |  |  |  |  |  | Bio::MUST::Core::Seq - Nucleotide or protein sequence | 
| 452 |  |  |  |  |  |  |  | 
| 453 |  |  |  |  |  |  | =head1 VERSION | 
| 454 |  |  |  |  |  |  |  | 
| 455 |  |  |  |  |  |  | version 0.212650 | 
| 456 |  |  |  |  |  |  |  | 
| 457 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 458 |  |  |  |  |  |  |  | 
| 459 |  |  |  |  |  |  | # TODO | 
| 460 |  |  |  |  |  |  |  | 
| 461 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 462 |  |  |  |  |  |  |  | 
| 463 |  |  |  |  |  |  | # TODO | 
| 464 |  |  |  |  |  |  |  | 
| 465 |  |  |  |  |  |  | =head1 CONSTRUCTORS | 
| 466 |  |  |  |  |  |  |  | 
| 467 |  |  |  |  |  |  | =head2 clone | 
| 468 |  |  |  |  |  |  |  | 
| 469 |  |  |  |  |  |  | =head2 reverse_complemented_seq | 
| 470 |  |  |  |  |  |  |  | 
| 471 |  |  |  |  |  |  | =head2 spliced_seq | 
| 472 |  |  |  |  |  |  |  | 
| 473 |  |  |  |  |  |  | =head1 ACCESSORS | 
| 474 |  |  |  |  |  |  |  | 
| 475 |  |  |  |  |  |  | =head2 all_states | 
| 476 |  |  |  |  |  |  |  | 
| 477 |  |  |  |  |  |  | =head2 state_at | 
| 478 |  |  |  |  |  |  |  | 
| 479 |  |  |  |  |  |  | =head2 delete_site | 
| 480 |  |  |  |  |  |  |  | 
| 481 |  |  |  |  |  |  | =head1 PROPERTIES | 
| 482 |  |  |  |  |  |  |  | 
| 483 |  |  |  |  |  |  | =head2 is_protein | 
| 484 |  |  |  |  |  |  |  | 
| 485 |  |  |  |  |  |  | =head2 is_rna | 
| 486 |  |  |  |  |  |  |  | 
| 487 |  |  |  |  |  |  | =head2 is_aligned | 
| 488 |  |  |  |  |  |  |  | 
| 489 |  |  |  |  |  |  | =head2 is_subseq_of | 
| 490 |  |  |  |  |  |  |  | 
| 491 |  |  |  |  |  |  | =head2 is_superseq_of | 
| 492 |  |  |  |  |  |  |  | 
| 493 |  |  |  |  |  |  | =head2 first_site | 
| 494 |  |  |  |  |  |  |  | 
| 495 |  |  |  |  |  |  | =head2 is_missing | 
| 496 |  |  |  |  |  |  |  | 
| 497 |  |  |  |  |  |  | =head2 is_gap | 
| 498 |  |  |  |  |  |  |  | 
| 499 |  |  |  |  |  |  | =head2 nomiss_seq_len | 
| 500 |  |  |  |  |  |  |  | 
| 501 |  |  |  |  |  |  | =head2 purity | 
| 502 |  |  |  |  |  |  |  | 
| 503 |  |  |  |  |  |  | =head1 MUTATORS | 
| 504 |  |  |  |  |  |  |  | 
| 505 |  |  |  |  |  |  | =head2 uc | 
| 506 |  |  |  |  |  |  |  | 
| 507 |  |  |  |  |  |  | =head2 recode | 
| 508 |  |  |  |  |  |  |  | 
| 509 |  |  |  |  |  |  | =head2 degap | 
| 510 |  |  |  |  |  |  |  | 
| 511 |  |  |  |  |  |  | =head2 gapify | 
| 512 |  |  |  |  |  |  |  | 
| 513 |  |  |  |  |  |  | =head2 spacify | 
| 514 |  |  |  |  |  |  |  | 
| 515 |  |  |  |  |  |  | =head2 trim | 
| 516 |  |  |  |  |  |  |  | 
| 517 |  |  |  |  |  |  | =head2 pad_to | 
| 518 |  |  |  |  |  |  |  | 
| 519 |  |  |  |  |  |  | =head2 clear_new_tag | 
| 520 |  |  |  |  |  |  |  | 
| 521 |  |  |  |  |  |  | =head1 MISC METHODS | 
| 522 |  |  |  |  |  |  |  | 
| 523 |  |  |  |  |  |  | =head2 raw_str | 
| 524 |  |  |  |  |  |  |  | 
| 525 |  |  |  |  |  |  | =head2 wrapped_str | 
| 526 |  |  |  |  |  |  |  | 
| 527 |  |  |  |  |  |  | =head2 codons | 
| 528 |  |  |  |  |  |  |  | 
| 529 |  |  |  |  |  |  | =head1 AUTHOR | 
| 530 |  |  |  |  |  |  |  | 
| 531 |  |  |  |  |  |  | Denis BAURAIN <denis.baurain@uliege.be> | 
| 532 |  |  |  |  |  |  |  | 
| 533 |  |  |  |  |  |  | =head1 CONTRIBUTORS | 
| 534 |  |  |  |  |  |  |  | 
| 535 |  |  |  |  |  |  | =for stopwords Catherine COLSON Arnaud DI FRANCO Valerian LUPO | 
| 536 |  |  |  |  |  |  |  | 
| 537 |  |  |  |  |  |  | =over 4 | 
| 538 |  |  |  |  |  |  |  | 
| 539 |  |  |  |  |  |  | =item * | 
| 540 |  |  |  |  |  |  |  | 
| 541 |  |  |  |  |  |  | Catherine COLSON <ccolson@doct.uliege.be> | 
| 542 |  |  |  |  |  |  |  | 
| 543 |  |  |  |  |  |  | =item * | 
| 544 |  |  |  |  |  |  |  | 
| 545 |  |  |  |  |  |  | Arnaud DI FRANCO <arnaud.difranco@gmail.com> | 
| 546 |  |  |  |  |  |  |  | 
| 547 |  |  |  |  |  |  | =item * | 
| 548 |  |  |  |  |  |  |  | 
| 549 |  |  |  |  |  |  | Valerian LUPO <valerian.lupo@doct.uliege.be> | 
| 550 |  |  |  |  |  |  |  | 
| 551 |  |  |  |  |  |  | =back | 
| 552 |  |  |  |  |  |  |  | 
| 553 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 554 |  |  |  |  |  |  |  | 
| 555 |  |  |  |  |  |  | This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. | 
| 556 |  |  |  |  |  |  |  | 
| 557 |  |  |  |  |  |  | This is free software; you can redistribute it and/or modify it under | 
| 558 |  |  |  |  |  |  | the same terms as the Perl 5 programming language system itself. | 
| 559 |  |  |  |  |  |  |  | 
| 560 |  |  |  |  |  |  | =cut |