File Coverage

blib/lib/Bio/MUST/Core/Seq.pm

Criterion	Covered	Total	%
statement	141	152	92.7
branch	32	36	88.8
condition	12	15	80.0
subroutine	33	37	89.1
pod	24	27	88.8
total	242	267	90.6

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Core::Seq;
2							# ABSTRACT: Nucleotide or protein sequence
3							# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
4							# CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com>
5							# CONTRIBUTOR: Valerian LUPO <valerian.lupo@doct.uliege.be>
6							$Bio::MUST::Core::Seq::VERSION = '0.212650';
7	17			17		139	use Moose;
	17					41
	17					167
8	17			17		97138	use MooseX::SemiAffordanceAccessor;
	17					137079
	17					76
9	17			17		137026	use namespace::autoclean;
	17					49
	17					191
10
11							# use Smart::Comments '###';
12
13	17			17		1748	use autodie;
	17					45
	17					501
14	17			17		95028	use feature qw(say);
	17					67
	17					1889
15
16	17			17		331	use Carp;
	17					47
	17					1475
17
18	17			17		125	use Bio::MUST::Core::Types;
	17					40
	17					622
19	17			17		127	use Bio::MUST::Core::Constants qw(:seqtypes :seqids :gaps);
	17					42
	17					4661
20	17			17		10027	use aliased 'Bio::MUST::Core::SeqId';
	17					13593
	17					152
21
22							has 'seq_id' => (
23							is => 'rw',
24							isa => 'Bio::MUST::Core::SeqId',
25							required => 1,
26							coerce => 1,
27							handles => qr{.*}xms, # expose all SeqId methods (and attributes)
28							);
29
30
31							has 'seq' => (
32							traits => ['String'],
33							is => 'ro',
34							isa => 'Bio::MUST::Core::Types::Seq',
35							default => q{}, # can be empty
36							coerce => 1,
37							writer => '_set_seq',
38							handles => {
39							seq_len => 'length',
40							append_seq => 'append',
41							replace_seq => 'replace',
42							edit_seq => 'substr',
43							},
44							);
45
46
47
48							# TODO: check whether this could be done by some Moose extension
49
50							sub clone {
51	512			512	1	847	my $self = shift;
52
53	512					1448	return $self->new(
54							seq_id => $self->full_id, seq => $self->seq
55							);
56							}
57
58
59							# boolean assertions
60							# TODO: optimize these assertions via caching
61
62
63							sub is_protein {
64	753			753	1	1223	my $self = shift;
65	753	100				19070	return 1 if $self->seq =~ $PROTLIKE;
66	153					399	return 0; # at least 1 non-nt char
67							}
68
69
70							sub is_rna {
71	11			11	1	18	my $self = shift;
72	11	100	100			267	return 1 if $self->seq =~ $RNALIKE && (not $self->is_protein);
73	9					23	return 0; # at least 1 'U'
74							}
75
76
77							sub is_aligned {
78	8599			8599	1	11777	my $self = shift;
79	8599	100				212220	return 1 if $self->seq =~ $GAP; # at least 1 gap-like char
80	8403					17450	return 0;
81							}
82
83
84							sub is_subseq_of {
85	36			36	1	60	my $self = shift;
86	36					51	my $seq2 = shift; # can be a mere string
87
88	36					67	$self = $self->raw_str;
89	36	100				184	$seq2 = $seq2->isa('Bio::MUST::Core::Seq')
90							? $seq2->raw_str : _strip_gaps($seq2);
91	36	100				260	return 1 if $seq2 =~ m/$self/xmsi; # case-insensitive comparison
92	16					43	return 0; # only here because expensive!
93							}
94
95
96							sub is_superseq_of {
97	36			36	1	62	my $self = shift;
98	36					49	my $seq2 = shift; # can be a mere string
99
100	36					58	$self = $self->raw_str;
101	36	100				158	$seq2 = $seq2->isa('Bio::MUST::Core::Seq')
102							? $seq2->raw_str : _strip_gaps($seq2);
103	36	100				250	return 1 if $self =~ m/$seq2/xmsi; # case-insensitive comparison
104	16					44	return 0; # only here because expensive!
105							}
106
107
108							sub first_site {
109	11			11	1	17	my $self = shift;
110
111	11					266	my ($leading_gaps) = $self->seq =~ m{ \A ($GAP+) }xms;
112	11		100			53	return length $leading_gaps // 0;
113							}
114
115
116							sub uc { ## no critic (ProhibitBuiltinHomonyms)
117	5			5	1	8	my $self = shift;
118
119	5					123	$self->_set_seq( uc $self->seq );
120	5					16	return $self;
121							}
122
123							sub uc_seq { ## no critic (RequireArgUnpacking)
124	0			0	0	0	carp '[BMC] Warning: Method uc_seq is deprecated; use uc instead!';
125	0					0	return shift->uc(@_);
126							}
127
128
129							sub recode {
130	5			5	1	9	my $self = shift;
131	5					7	my $base_for = shift;
132
133	5					14	my @states = $self->all_states;
134	5					14	my @rec_states;
135
136	5					7	for my $state (@states) {
137	700		33			1152	my $rec_state = $base_for->{$state} // $FRAMESHIFT;
138	700					1008	push @rec_states, $rec_state;
139							}
140
141	5					42	my $new_seq = join q{}, @rec_states;
142	5					158	$self->_set_seq($new_seq);
143
144	5					74	return $self;
145							}
146
147							sub recode_seq { ## no critic (RequireArgUnpacking)
148	0			0	0	0	carp '[BMC] Warning: Method recode_seq is deprecated; use recode instead!';
149	0					0	return shift->recode(@_);
150							}
151
152							# gap cleaning methods
153
154
155							sub degap {
156	38			38	1	94	my $self = shift;
157
158	38					101	$self->_set_seq($self->raw_str);
159	38					287	return $self;
160							}
161
162
163							sub gapify {
164	181			181	1	316	my $self = shift;
165	181		100			455	my $char = shift // '*'; # defaults to gap
166
167							# original version: very general but much too slow on large matrices
168							# my $regex = $PROTMISS; # defaults to protein seq
169							#
170							# # in case of DNA ensure correct 'missification' (if applicable)
171							# unless ($self->is_protein) {
172							# $regex = $DNAMISS;
173							# $char = 'N' if $char =~ $DNAMISS;
174							# }
175							#
176							# ( my $seq = $self->seq ) =~ s{$regex}{$char}xmsg;
177
178							# alternative version: hard-coding $char (if possible) gives a 250% boost
179							# my $seq = $self->seq;
180							# if ($self->is_protein) {
181							# if ($char eq '*') {
182							# $seq =~ s{$PROTMISS}{*}xmsg;
183							# }
184							# elsif ($char eq 'X') {
185							# $seq =~ s{$PROTMISS}{X}xmsg;
186							# }
187							# else {
188							# $seq =~ s{$PROTMISS}{$char}xmsg;
189							# }
190							# }
191							# else {
192							# if ($char eq '*') {
193							# $seq =~ s{$DNAMISS}{*}xmsg;
194							# }
195							# elsif ($char eq 'X') {
196							# $seq =~ s{$DNAMISS}{N}xmsg;
197							# }
198							# else {
199							# $seq =~ s{$DNAMISS}{$char}xmsg;
200							# }
201							# }
202
203							# current version: hard-coded tr/// for 900% boost with s/// fall-back
204							# Note: $PROTMISS and $DNAMISS regexes in Constants.pm are ignored!
205	181					4674	my $seq = $self->seq;
206	181	100				458	if ($self->is_protein) {
207	173	100				524	if ($char eq '*') {
		50
208	10					28	$seq =~ tr{?XxBJOUZbjouz}{*};
209							}
210							elsif ($char eq 'X') {
211	163					841	$seq =~ tr{?XxBJOUZbjouz}{X};
212							}
213							else {
214	0					0	$seq =~ s{[?XxBJOUZbjouz]}{$char}xmsg;
215							}
216							}
217							else {
218	8	100				29	if ($char eq '*') {
		50
219	4					11	$seq =~ tr{?XxNnBDHKMRSVWYbdhkmrsvwy}{*};
220							}
221							elsif ($char eq 'X') {
222	4					10	$seq =~ tr{?XxNnBDHKMRSVWYbdhkmrsvwy}{N};
223							}
224							else {
225	0					0	$seq =~ s{[?XxNnBDHKMRSVWYbdhkmrsvwy]}{$char}xmsg;
226							}
227							}
228
229	181					5400	$self->_set_seq($seq);
230	181					642	return $self;
231							}
232
233
234							sub spacify {
235	3069			3069	1	4957	my $self = shift;
236
237	3069					74635	my $seq = $self->seq;
238
239							# uniformize runs of [*-space] having at least one 'true' space
240							# Note: we cannot use replace_seq because of the g flag
241	3069					105886	$seq =~ s{ ( $GAP+ \ + ) }{ ' ' x length($1) }xmseg;
	550					5652
242	3069					80009	$seq =~ s{ ( \ + $GAP+ ) }{ ' ' x length($1) }xmseg;
	561					5444
243
244							# Note: two simpler regexes are muuuuuch faster than one complicated regex!
245							# $seq =~ s{ ( $GAP* \ + $GAP* ) }{ ' ' x length($1) }xmseg;
246
247	3069					89360	$self->_set_seq($seq);
248	3069					10453	return $self;
249							}
250
251
252							sub trim {
253	3068			3068	1	5248	my $self = shift;
254
255	3068					107964	$self->replace_seq( qr{ $GAP+\z }xms, q{} );
256	3068					12810	return $self;
257							}
258
259
260							sub pad_to {
261	3068			3068	1	4844	my $self = shift;
262	3068					4093	my $bound = shift;
263
264	3068					95084	$self->append_seq( q{ } x ($bound - $self->seq_len) );
265	3068					9411	return $self;
266							}
267
268
269							sub clear_new_tag {
270	3			3	1	5	my $self = shift;
271
272	3					12	(my $full_id = $self->full_id) =~ s{$NEW_TAG\z}{}xms;
273	3					102	$self->set_seq_id( SeqId->new( full_id => $full_id ) );
274
275	3					86	return $self;
276							}
277
278							# site-wise methods (0-numbered)
279
280
281							sub all_states {
282	4203			4203	1	6455	my $self = shift;
283	4203					111372	return split //, $self->seq;
284							}
285
286
287							sub state_at { ## no critic (RequireArgUnpacking)
288	2002847			2002847	1	62953193	return shift->edit_seq(@_, 1);
289							}
290
291
292							sub delete_site { ## no critic (RequireArgUnpacking)
293	0			0	1	0	my $self = shift;
294
295	0					0	$self->edit_seq(@_, 1, q{});
296	0					0	return $self;
297							}
298
299
300							sub is_missing {
301	40			40	1	144	my $self = shift;
302	40					53	my $site = shift;
303
304	40					75	my $state = $self->state_at($site);
305	40	100				190	return 1 if $state =~ $PROTMISS;
306	38	100	100			113	return 1 if $state =~ $DNAMISS && (not $self->is_protein);
307	37					228	return 0; # X (or N depending on seq type)
308							}
309
310
311							sub is_gap {
312	139			139	1	256	my $self = shift;
313	139					180	my $site = shift;
314
315	139	100				259	return 1 if $self->state_at($site) =~ $GAP;
316	109					322	return 0;
317							}
318
319
320							# global methods
321
322							around qw(purity reverse_complemented_seq codons) => sub {
323							my $method = shift;
324							my $self = shift;
325
326							# Note: we return an explicit undef to emulate other accessor behavior
327							if ($self->is_protein) {
328							carp '[BMC] Warning: sequence looks like a protein; returning undef!';
329							return undef; ## no critic (ProhibitExplicitReturnUndef)
330							}
331
332							return $self->$method(@_);
333							};
334
335
336							sub nomiss_seq_len {
337	361			361	1	613	my $self = shift;
338
339	361	100				612	my $regex = $self->is_protein ? $PROTMISS : $DNAMISS;
340	361					793	(my $raw_str = $self->raw_str) =~ s/$regex//xmsg;
341							# TODO: decide how to handle ambiguous nucleotides
342
343	361					1133	return length $raw_str;
344							}
345
346
347							sub purity {
348							my $self = shift;
349
350							(my $pure_seq = $self->seq) =~ s/$NONPUREDNA//xmsg;
351							my $purity = 1.0 * length($pure_seq) / $self->seq_len;
352
353							return $purity;
354							}
355
356
357							sub reverse_complemented_seq {
358							my $self = shift;
359
360							# reverse complement and preserve case
361							# Note: RNA always becomes DNA
362							my $new_seq = scalar reverse $self->seq;
363							$new_seq =~ tr/ATUGCYRSWKMBDHVN/TAACGRYSWMKVHDBN/;
364							$new_seq =~ tr/atugcyrswkmbdhvn/taacgryswmkvhdbn/;
365
366							return $self->new( seq_id => $self->full_id, seq => $new_seq );
367							}
368
369
370							sub spliced_seq {
371	1			1	1	9	my $self = shift;
372	1					3	my $blocks = shift;
373
374	1					2	my $new_seq;
375	1					28	my $seq = $self->seq;
376	1					2	for my $block ( @{$blocks} ) {
	1					4
377	8					11	my ($start, $end) = @{$block};
	8					17
378	8					32	$new_seq .= substr $seq, $start - 1, $end - $start + 1;
379							}
380
381	1					10	return $self->new( seq_id => $self->full_id, seq => $new_seq );
382							}
383
384
385							sub raw_str {
386	518			518	1	1098	my $self = shift;
387	518					14253	return _strip_gaps($self->seq);
388							}
389
390							sub raw_seq { ## no critic (RequireArgUnpacking)
391	0			0	0	0	carp '[BMC] Warning: Method raw_seq is deprecated; use raw_str instead!';
392	0					0	return shift->raw_str(@_);
393							}
394
395
396							sub wrapped_str {
397	113			113	1	157	my $self = shift;
398	113		50			277	my $chunk = shift // 60;
399
400	113	50				253	my $nowrap = $chunk < 0 ? 1 : 0;
401	113					3628	my $width = $self->seq_len;
402	113	50				262	$chunk = $width if $nowrap;
403
404	113					150	my $str;
405	113					257	for (my $site = 0; $site < $width; $site += $chunk) {
406	7533					245838	$str .= $self->edit_seq($site, $chunk) . "\n";
407							}
408
409	113					531	return $str;
410							}
411
412
413							sub codons {
414							my $self = shift;
415							my $frame = shift // 1; # defaults to frame +1
416
417							# get specified DNA strand
418							my $dna = $frame < 0 ? $self->reverse_complemented_seq->seq : $self->seq;
419							$dna =~ tr/Uu/Tt/; # ensure DNA
420
421							# split strand into codons beginning at specified frame
422							# ... and discard incomplete codons
423							my @codons;
424							for (my $i = (abs $frame) - 1; $i < length $dna; $i += 3) {
425							my $codon = substr $dna, $i, 3;
426							push @codons, $codon if length $codon == 3;
427							}
428
429							return \@codons;
430							}
431
432
433							# private subs
434
435							sub _strip_gaps {
436	554			554		836	my $seq = shift;
437
438	554					3287	$seq =~ s/$GAP+//xmsg;
439	554					3849	return $seq; # strip all gaps
440							}
441
442							__PACKAGE__->meta->make_immutable;
443							1;
444
445							__END__
446
447							=pod
448
449							=head1 NAME
450
451							Bio::MUST::Core::Seq - Nucleotide or protein sequence
452
453							=head1 VERSION
454
455							version 0.212650
456
457							=head1 SYNOPSIS
458
459							# TODO
460
461							=head1 DESCRIPTION
462
463							# TODO
464
465							=head1 CONSTRUCTORS
466
467							=head2 clone
468
469							=head2 reverse_complemented_seq
470
471							=head2 spliced_seq
472
473							=head1 ACCESSORS
474
475							=head2 all_states
476
477							=head2 state_at
478
479							=head2 delete_site
480
481							=head1 PROPERTIES
482
483							=head2 is_protein
484
485							=head2 is_rna
486
487							=head2 is_aligned
488
489							=head2 is_subseq_of
490
491							=head2 is_superseq_of
492
493							=head2 first_site
494
495							=head2 is_missing
496
497							=head2 is_gap
498
499							=head2 nomiss_seq_len
500
501							=head2 purity
502
503							=head1 MUTATORS
504
505							=head2 uc
506
507							=head2 recode
508
509							=head2 degap
510
511							=head2 gapify
512
513							=head2 spacify
514
515							=head2 trim
516
517							=head2 pad_to
518
519							=head2 clear_new_tag
520
521							=head1 MISC METHODS
522
523							=head2 raw_str
524
525							=head2 wrapped_str
526
527							=head2 codons
528
529							=head1 AUTHOR
530
531							Denis BAURAIN <denis.baurain@uliege.be>
532
533							=head1 CONTRIBUTORS
534
535							=for stopwords Catherine COLSON Arnaud DI FRANCO Valerian LUPO
536
537							=over 4
538
539							=item *
540
541							Catherine COLSON <ccolson@doct.uliege.be>
542
543							=item *
544
545							Arnaud DI FRANCO <arnaud.difranco@gmail.com>
546
547							=item *
548
549							Valerian LUPO <valerian.lupo@doct.uliege.be>
550
551							=back
552
553							=head1 COPYRIGHT AND LICENSE
554
555							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
556
557							This is free software; you can redistribute it and/or modify it under
558							the same terms as the Perl 5 programming language system itself.
559
560							=cut