File Coverage

blib/lib/Bio/MUST/Core/Seq.pm

Criterion	Covered	Total	%
statement	138	147	93.8
branch	28	30	93.3
condition	12	15	80.0
subroutine	33	37	89.1
pod	24	27	88.8
total	235	256	91.8

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Core::Seq;
2							# ABSTRACT: Nucleotide or protein sequence
3							# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
4							# CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com>
5							# CONTRIBUTOR: Valerian LUPO <valerian.lupo@doct.uliege.be>
6							$Bio::MUST::Core::Seq::VERSION = '0.212670';
7	17			17		132	use Moose;
	17					47
	17					138
8	17			17		97172	use MooseX::SemiAffordanceAccessor;
	17					140197
	17					75
9	17			17		137647	use namespace::autoclean;
	17					43
	17					186
10
11							# use Smart::Comments '###';
12
13	17			17		1899	use autodie;
	17					41
	17					390
14	17			17		93832	use feature qw(say);
	17					45
	17					1842
15
16	17			17		139	use Carp;
	17					52
	17					1522
17
18	17			17		122	use Bio::MUST::Core::Types;
	17					50
	17					564
19	17			17		103	use Bio::MUST::Core::Constants qw(:seqtypes :seqids :gaps);
	17					34
	17					4602
20	17			17		10074	use aliased 'Bio::MUST::Core::SeqId';
	17					13836
	17					112
21
22							has 'seq_id' => (
23							is => 'rw',
24							isa => 'Bio::MUST::Core::SeqId',
25							required => 1,
26							coerce => 1,
27							handles => qr{.*}xms, # expose all SeqId methods (and attributes)
28							);
29
30
31							has 'seq' => (
32							traits => ['String'],
33							is => 'ro',
34							isa => 'Bio::MUST::Core::Types::Seq',
35							default => q{}, # can be empty
36							coerce => 1,
37							writer => '_set_seq',
38							handles => {
39							seq_len => 'length',
40							append_seq => 'append',
41							replace_seq => 'replace',
42							edit_seq => 'substr',
43							},
44							);
45
46
47
48							# TODO: check whether this could be done by some Moose extension
49
50							sub clone {
51	402			402	1	694	my $self = shift;
52
53	402					1172	return $self->new(
54							seq_id => $self->full_id, seq => $self->seq
55							);
56							}
57
58
59							# boolean assertions
60							# TODO: optimize these assertions via caching
61
62
63							sub is_protein {
64	643			643	1	962	my $self = shift;
65	643	100				16912	return 1 if $self->seq =~ $PROTLIKE;
66	153					461	return 0; # at least 1 non-nt char
67							}
68
69
70							sub is_rna {
71	11			11	1	26	my $self = shift;
72	11	100	100			276	return 1 if $self->seq =~ $RNALIKE && (not $self->is_protein);
73	9					43	return 0; # at least 1 'U'
74							}
75
76
77							sub is_aligned {
78	8599			8599	1	10823	my $self = shift;
79	8599	100				191079	return 1 if $self->seq =~ $GAP; # at least 1 gap-like char
80	8403					15808	return 0;
81							}
82
83
84							sub is_subseq_of {
85	36			36	1	81	my $self = shift;
86	36					58	my $seq2 = shift; # can be a mere string
87
88	36					103	$self = $self->raw_str;
89	36	100				239	$seq2 = $seq2->isa('Bio::MUST::Core::Seq')
90							? $seq2->raw_str : _strip_gaps($seq2);
91	36	100				411	return 1 if $seq2 =~ m/$self/xmsi; # case-insensitive comparison
92	16					80	return 0; # only here because expensive!
93							}
94
95
96							sub is_superseq_of {
97	36			36	1	155	my $self = shift;
98	36					62	my $seq2 = shift; # can be a mere string
99
100	36					77	$self = $self->raw_str;
101	36	100				167	$seq2 = $seq2->isa('Bio::MUST::Core::Seq')
102							? $seq2->raw_str : _strip_gaps($seq2);
103	36	100				303	return 1 if $self =~ m/$seq2/xmsi; # case-insensitive comparison
104	16					56	return 0; # only here because expensive!
105							}
106
107
108							sub first_site {
109	11			11	1	26	my $self = shift;
110
111	11					276	my ($leading_gaps) = $self->seq =~ m{ \A ($GAP+) }xms;
112	11		100			80	return length $leading_gaps // 0;
113							}
114
115
116							sub uc { ## no critic (ProhibitBuiltinHomonyms)
117	5			5	1	9	my $self = shift;
118
119	5					129	$self->_set_seq( uc $self->seq );
120	5					18	return $self;
121							}
122
123							sub uc_seq { ## no critic (RequireArgUnpacking)
124	0			0	0	0	carp '[BMC] Warning: Method uc_seq is deprecated; use uc instead!';
125	0					0	return shift->uc(@_);
126							}
127
128
129							sub recode {
130	5			5	1	10	my $self = shift;
131	5					10	my $base_for = shift;
132
133	5					13	my @states = $self->all_states;
134	5					11	my @rec_states;
135
136	5					11	for my $state (@states) {
137	700		33			1202	my $rec_state = $base_for->{$state} // $FRAMESHIFT;
138	700					934	push @rec_states, $rec_state;
139							}
140
141	5					33	my $new_seq = join q{}, @rec_states;
142	5					157	$self->_set_seq($new_seq);
143
144	5					79	return $self;
145							}
146
147							sub recode_seq { ## no critic (RequireArgUnpacking)
148	0			0	0	0	carp '[BMC] Warning: Method recode_seq is deprecated; use recode instead!';
149	0					0	return shift->recode(@_);
150							}
151
152							# gap cleaning methods
153
154
155							sub degap {
156	38			38	1	92	my $self = shift;
157
158	38					166	$self->_set_seq($self->raw_str);
159	38					279	return $self;
160							}
161
162
163							sub gapify {
164	71			71	1	137	my $self = shift;
165	71		100			202	my $char = shift // '*'; # defaults to gap
166
167	71					128	my $regex = $PROTMISS; # defaults to protein seq
168
169							# in case of DNA ensure correct 'missification' (if applicable)
170	71	100				157	unless ($self->is_protein) {
171	8					14	$regex = $DNAMISS;
172	8	100				33	$char = 'N' if $char =~ $DNAMISS;
173							}
174
175	71					1801	( my $seq = $self->seq ) =~ s{$regex}{$char}xmsg;
176
177							# alternative versions developed due to a design error in Ali::store_phylip
178							# They are now commented because less general and not critical anymore.
179
180							# alt vers 1: hard-coding $char (if possible) gives a 250% boost
181							# my $seq = $self->seq;
182							# if ($self->is_protein) {
183							# if ($char eq '*') {
184							# $seq =~ s{$PROTMISS}{*}xmsg;
185							# }
186							# elsif ($char eq 'X') {
187							# $seq =~ s{$PROTMISS}{X}xmsg;
188							# }
189							# else {
190							# $seq =~ s{$PROTMISS}{$char}xmsg;
191							# }
192							# }
193							# else {
194							# if ($char eq '*') {
195							# $seq =~ s{$DNAMISS}{*}xmsg;
196							# }
197							# elsif ($char eq 'X') {
198							# $seq =~ s{$DNAMISS}{N}xmsg;
199							# }
200							# else {
201							# $seq =~ s{$DNAMISS}{$char}xmsg;
202							# }
203							# }
204
205							# alt vers 2: hard-coded tr/// for 900% boost with s/// fall-back
206							# Note: $PROTMISS and $DNAMISS regexes in Constants.pm are ignored!
207							# my $seq = $self->seq;
208							# if ($self->is_protein) {
209							# if ($char eq '*') {
210							# $seq =~ tr{?XxBJOUZbjouz}{*};
211							# }
212							# elsif ($char eq 'X') {
213							# $seq =~ tr{?XxBJOUZbjouz}{X};
214							# }
215							# else {
216							# $seq =~ s{[?XxBJOUZbjouz]}{$char}xmsg;
217							# }
218							# }
219							# else {
220							# if ($char eq '*') {
221							# $seq =~ tr{?XxNnBDHKMRSVWYbdhkmrsvwy}{*};
222							# }
223							# elsif ($char eq 'X') {
224							# $seq =~ tr{?XxNnBDHKMRSVWYbdhkmrsvwy}{N};
225							# }
226							# else {
227							# $seq =~ s{[?XxNnBDHKMRSVWYbdhkmrsvwy]}{$char}xmsg;
228							# }
229							# }
230
231	71					2475	$self->_set_seq($seq);
232	71					430	return $self;
233							}
234
235
236							sub spacify {
237	3069			3069	1	4808	my $self = shift;
238
239	3069					75319	my $seq = $self->seq;
240
241							# uniformize runs of [*-space] having at least one 'true' space
242							# Note: we cannot use replace_seq because of the g flag
243	3069					103260	$seq =~ s{ ( $GAP+ \ + ) }{ ' ' x length($1) }xmseg;
	550					5655
244	3069					79700	$seq =~ s{ ( \ + $GAP+ ) }{ ' ' x length($1) }xmseg;
	561					5426
245
246							# Note: two simpler regexes are muuuuuch faster than one complicated regex!
247							# $seq =~ s{ ( $GAP* \ + $GAP* ) }{ ' ' x length($1) }xmseg;
248
249	3069					89705	$self->_set_seq($seq);
250	3069					10557	return $self;
251							}
252
253
254							sub trim {
255	3068			3068	1	5429	my $self = shift;
256
257	3068					110315	$self->replace_seq( qr{ $GAP+\z }xms, q{} );
258	3068					12903	return $self;
259							}
260
261
262							sub pad_to {
263	3068			3068	1	4785	my $self = shift;
264	3068					4082	my $bound = shift;
265
266	3068					97954	$self->append_seq( q{ } x ($bound - $self->seq_len) );
267	3068					9775	return $self;
268							}
269
270
271							sub clear_new_tag {
272	3			3	1	6	my $self = shift;
273
274	3					11	(my $full_id = $self->full_id) =~ s{$NEW_TAG\z}{}xms;
275	3					88	$self->set_seq_id( SeqId->new( full_id => $full_id ) );
276
277	3					88	return $self;
278							}
279
280							# site-wise methods (0-numbered)
281
282
283							sub all_states {
284	4203			4203	1	6205	my $self = shift;
285	4203					108452	return split //, $self->seq;
286							}
287
288
289							sub state_at { ## no critic (RequireArgUnpacking)
290	2002847			2002847	1	65348612	return shift->edit_seq(@_, 1);
291							}
292
293
294							sub delete_site { ## no critic (RequireArgUnpacking)
295	0			0	1	0	my $self = shift;
296
297	0					0	$self->edit_seq(@_, 1, q{});
298	0					0	return $self;
299							}
300
301
302							sub is_missing {
303	40			40	1	156	my $self = shift;
304	40					58	my $site = shift;
305
306	40					98	my $state = $self->state_at($site);
307	40	100				209	return 1 if $state =~ $PROTMISS;
308	38	100	100			140	return 1 if $state =~ $DNAMISS && (not $self->is_protein);
309	37					235	return 0; # X (or N depending on seq type)
310							}
311
312
313							sub is_gap {
314	139			139	1	273	my $self = shift;
315	139					176	my $site = shift;
316
317	139	100				245	return 1 if $self->state_at($site) =~ $GAP;
318	109					344	return 0;
319							}
320
321
322							# global methods
323
324							around qw(purity reverse_complemented_seq codons) => sub {
325							my $method = shift;
326							my $self = shift;
327
328							# Note: we return an explicit undef to emulate other accessor behavior
329							if ($self->is_protein) {
330							carp '[BMC] Warning: sequence looks like a protein; returning undef!';
331							return undef; ## no critic (ProhibitExplicitReturnUndef)
332							}
333
334							return $self->$method(@_);
335							};
336
337
338							sub nomiss_seq_len {
339	361			361	1	572	my $self = shift;
340
341	361	100				655	my $regex = $self->is_protein ? $PROTMISS : $DNAMISS;
342	361					789	(my $raw_str = $self->raw_str) =~ s/$regex//xmsg;
343							# TODO: decide how to handle ambiguous nucleotides
344
345	361					1134	return length $raw_str;
346							}
347
348
349							sub purity {
350							my $self = shift;
351
352							(my $pure_seq = $self->seq) =~ s/$NONPUREDNA//xmsg;
353							my $purity = 1.0 * length($pure_seq) / $self->seq_len;
354
355							return $purity;
356							}
357
358
359							sub reverse_complemented_seq {
360							my $self = shift;
361
362							# reverse complement and preserve case
363							# Note: RNA always becomes DNA
364							my $new_seq = scalar reverse $self->seq;
365							$new_seq =~ tr/ATUGCYRSWKMBDHVN/TAACGRYSWMKVHDBN/;
366							$new_seq =~ tr/atugcyrswkmbdhvn/taacgryswmkvhdbn/;
367
368							return $self->new( seq_id => $self->full_id, seq => $new_seq );
369							}
370
371
372							sub spliced_seq {
373	1			1	1	8	my $self = shift;
374	1					3	my $blocks = shift;
375
376	1					2	my $new_seq;
377	1					30	my $seq = $self->seq;
378	1					4	for my $block ( @{$blocks} ) {
	1					3
379	8					12	my ($start, $end) = @{$block};
	8					14
380	8					20	$new_seq .= substr $seq, $start - 1, $end - $start + 1;
381							}
382
383	1					9	return $self->new( seq_id => $self->full_id, seq => $new_seq );
384							}
385
386
387							sub raw_str {
388	518			518	1	860	my $self = shift;
389	518					12901	return _strip_gaps($self->seq);
390							}
391
392							sub raw_seq { ## no critic (RequireArgUnpacking)
393	0			0	0	0	carp '[BMC] Warning: Method raw_seq is deprecated; use raw_str instead!';
394	0					0	return shift->raw_str(@_);
395							}
396
397
398							sub wrapped_str {
399	113			113	1	185	my $self = shift;
400	113		50			597	my $chunk = shift // 60;
401
402	113	50				259	my $nowrap = $chunk < 0 ? 1 : 0;
403	113					3767	my $width = $self->seq_len;
404	113	50				266	$chunk = $width if $nowrap;
405
406	113					154	my $str;
407	113					276	for (my $site = 0; $site < $width; $site += $chunk) {
408	7533					245226	$str .= $self->edit_seq($site, $chunk) . "\n";
409							}
410
411	113					474	return $str;
412							}
413
414
415							sub codons {
416							my $self = shift;
417							my $frame = shift // 1; # defaults to frame +1
418
419							# get specified DNA strand
420							my $dna = $frame < 0 ? $self->reverse_complemented_seq->seq : $self->seq;
421							$dna =~ tr/Uu/Tt/; # ensure DNA
422
423							# split strand into codons beginning at specified frame
424							# ... and discard incomplete codons
425							my @codons;
426							for (my $i = (abs $frame) - 1; $i < length $dna; $i += 3) {
427							my $codon = substr $dna, $i, 3;
428							push @codons, $codon if length $codon == 3;
429							}
430
431							return \@codons;
432							}
433
434
435							# private subs
436
437							sub _strip_gaps {
438	554			554		1011	my $seq = shift;
439
440	554					3285	$seq =~ s/$GAP+//xmsg;
441	554					3886	return $seq; # strip all gaps
442							}
443
444							__PACKAGE__->meta->make_immutable;
445							1;
446
447							__END__
448
449							=pod
450
451							=head1 NAME
452
453							Bio::MUST::Core::Seq - Nucleotide or protein sequence
454
455							=head1 VERSION
456
457							version 0.212670
458
459							=head1 SYNOPSIS
460
461							# TODO
462
463							=head1 DESCRIPTION
464
465							# TODO
466
467							=head1 CONSTRUCTORS
468
469							=head2 clone
470
471							=head2 reverse_complemented_seq
472
473							=head2 spliced_seq
474
475							=head1 ACCESSORS
476
477							=head2 all_states
478
479							=head2 state_at
480
481							=head2 delete_site
482
483							=head1 PROPERTIES
484
485							=head2 is_protein
486
487							=head2 is_rna
488
489							=head2 is_aligned
490
491							=head2 is_subseq_of
492
493							=head2 is_superseq_of
494
495							=head2 first_site
496
497							=head2 is_missing
498
499							=head2 is_gap
500
501							=head2 nomiss_seq_len
502
503							=head2 purity
504
505							=head1 MUTATORS
506
507							=head2 uc
508
509							=head2 recode
510
511							=head2 degap
512
513							=head2 gapify
514
515							=head2 spacify
516
517							=head2 trim
518
519							=head2 pad_to
520
521							=head2 clear_new_tag
522
523							=head1 MISC METHODS
524
525							=head2 raw_str
526
527							=head2 wrapped_str
528
529							=head2 codons
530
531							=head1 AUTHOR
532
533							Denis BAURAIN <denis.baurain@uliege.be>
534
535							=head1 CONTRIBUTORS
536
537							=for stopwords Catherine COLSON Arnaud DI FRANCO Valerian LUPO
538
539							=over 4
540
541							=item *
542
543							Catherine COLSON <ccolson@doct.uliege.be>
544
545							=item *
546
547							Arnaud DI FRANCO <arnaud.difranco@gmail.com>
548
549							=item *
550
551							Valerian LUPO <valerian.lupo@doct.uliege.be>
552
553							=back
554
555							=head1 COPYRIGHT AND LICENSE
556
557							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
558
559							This is free software; you can redistribute it and/or modify it under
560							the same terms as the Perl 5 programming language system itself.
561
562							=cut