File Coverage

blib/lib/Bio/MUST/Core/Seq.pm

Criterion	Covered	Total	%
statement	138	147	93.8
branch	28	30	93.3
condition	12	15	80.0
subroutine	33	37	89.1
pod	24	27	88.8
total	235	256	91.8

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Core::Seq;
2							# ABSTRACT: Nucleotide or protein sequence
3							# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
4							# CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com>
5							# CONTRIBUTOR: Valerian LUPO <valerian.lupo@doct.uliege.be>
6							$Bio::MUST::Core::Seq::VERSION = '0.212530';
7	17			17		138	use Moose;
	17					42
	17					160
8	17			17		96484	use MooseX::SemiAffordanceAccessor;
	17					135715
	17					79
9	17			17		134067	use namespace::autoclean;
	17					43
	17					187
10
11							# use Smart::Comments '###';
12
13	17			17		1814	use autodie;
	17					56
	17					373
14	17			17		92073	use feature qw(say);
	17					48
	17					1776
15
16	17			17		151	use Carp;
	17					36
	17					1599
17
18	17			17		129	use Bio::MUST::Core::Types;
	17					38
	17					654
19	17			17		105	use Bio::MUST::Core::Constants qw(:seqtypes :seqids :gaps);
	17					56
	17					4988
20	17			17		10770	use aliased 'Bio::MUST::Core::SeqId';
	17					14109
	17					111
21
22							has 'seq_id' => (
23							is => 'rw',
24							isa => 'Bio::MUST::Core::SeqId',
25							required => 1,
26							coerce => 1,
27							handles => qr{.*}xms, # expose all SeqId methods (and attributes)
28							);
29
30
31							has 'seq' => (
32							traits => ['String'],
33							is => 'ro',
34							isa => 'Bio::MUST::Core::Types::Seq',
35							default => q{}, # can be empty
36							coerce => 1,
37							writer => '_set_seq',
38							handles => {
39							seq_len => 'length',
40							append_seq => 'append',
41							replace_seq => 'replace',
42							edit_seq => 'substr',
43							},
44							);
45
46
47
48							# TODO: check whether this could be done by some Moose extension
49
50							sub clone {
51	512			512	1	922	my $self = shift;
52
53	512					1641	return $self->new(
54							seq_id => $self->full_id, seq => $self->seq
55							);
56							}
57
58
59							# boolean assertions
60							# TODO: optimize these assertions via caching
61
62
63							sub is_protein {
64	753			753	1	1171	my $self = shift;
65	753	100				19642	return 1 if $self->seq =~ $PROTLIKE;
66	153					412	return 0; # at least 1 non-nt char
67							}
68
69
70							sub is_rna {
71	11			11	1	21	my $self = shift;
72	11	100	100			258	return 1 if $self->seq =~ $RNALIKE && (not $self->is_protein);
73	9					26	return 0; # at least 1 'U'
74							}
75
76
77							sub is_aligned {
78	8599			8599	1	12552	my $self = shift;
79	8599	100				210563	return 1 if $self->seq =~ $GAP; # at least 1 gap-like char
80	8403					17892	return 0;
81							}
82
83
84							sub is_subseq_of {
85	36			36	1	70	my $self = shift;
86	36					52	my $seq2 = shift; # can be a mere string
87
88	36					70	$self = $self->raw_str;
89	36	100				186	$seq2 = $seq2->isa('Bio::MUST::Core::Seq')
90							? $seq2->raw_str : _strip_gaps($seq2);
91	36	100				276	return 1 if $seq2 =~ m/$self/xmsi; # case-insensitive comparison
92	16					46	return 0; # only here because expensive!
93							}
94
95
96							sub is_superseq_of {
97	36			36	1	63	my $self = shift;
98	36					47	my $seq2 = shift; # can be a mere string
99
100	36					67	$self = $self->raw_str;
101	36	100				145	$seq2 = $seq2->isa('Bio::MUST::Core::Seq')
102							? $seq2->raw_str : _strip_gaps($seq2);
103	36	100				252	return 1 if $self =~ m/$seq2/xmsi; # case-insensitive comparison
104	16					44	return 0; # only here because expensive!
105							}
106
107
108							sub first_site {
109	11			11	1	20	my $self = shift;
110
111	11					266	my ($leading_gaps) = $self->seq =~ m{ \A ($GAP+) }xms;
112	11		100			55	return length $leading_gaps // 0;
113							}
114
115
116							sub uc { ## no critic (ProhibitBuiltinHomonyms)
117	5			5	1	9	my $self = shift;
118
119	5					128	$self->_set_seq( uc $self->seq );
120	5					16	return $self;
121							}
122
123							sub uc_seq { ## no critic (RequireArgUnpacking)
124	0			0	0	0	carp '[BMC] Warning: Method uc_seq is deprecated; use uc instead!';
125	0					0	return shift->uc(@_);
126							}
127
128
129							sub recode {
130	5			5	1	12	my $self = shift;
131	5					9	my $base_for = shift;
132
133	5					14	my @states = $self->all_states;
134	5					14	my @rec_states;
135
136	5					9	for my $state (@states) {
137	700		33			1210	my $rec_state = $base_for->{$state} // $FRAMESHIFT;
138	700					999	push @rec_states, $rec_state;
139							}
140
141	5					33	my $new_seq = join q{}, @rec_states;
142	5					169	$self->_set_seq($new_seq);
143
144	5					79	return $self;
145							}
146
147							sub recode_seq { ## no critic (RequireArgUnpacking)
148	0			0	0	0	carp '[BMC] Warning: Method recode_seq is deprecated; use recode instead!';
149	0					0	return shift->recode(@_);
150							}
151
152							# gap cleaning methods
153
154
155							sub degap {
156	38			38	1	73	my $self = shift;
157
158	38					97	$self->_set_seq($self->raw_str);
159	38					274	return $self;
160							}
161
162
163							sub gapify {
164	181			181	1	314	my $self = shift;
165	181		100			463	my $char = shift // '*'; # defaults to gap
166
167	181					315	my $regex = $PROTMISS; # defaults to protein seq
168
169							# in case of DNA ensure correct 'missification' (if applicable)
170	181	100				451	unless ($self->is_protein) {
171	8					13	$regex = $DNAMISS;
172	8	100				29	$char = 'N' if $char =~ $DNAMISS;
173							}
174
175	181					4531	( my $seq = $self->seq ) =~ s{$regex}{$char}xmsg;
176
177	181					5421	$self->_set_seq($seq);
178	181					721	return $self;
179							}
180
181
182							sub spacify {
183	3069			3069	1	4994	my $self = shift;
184
185	3069					72833	my $seq = $self->seq;
186
187							# uniformize runs of [*-space] having at least one 'true' space
188							# Note: we cannot use replace_seq because of the g flag
189	3069					103513	$seq =~ s{ ( $GAP+ \ + ) }{ ' ' x length($1) }xmseg;
	550					5604
190	3069					78978	$seq =~ s{ ( \ + $GAP+ ) }{ ' ' x length($1) }xmseg;
	561					5176
191
192							# Note: two simpler regexes are muuuuuch faster than one complicated regex!
193							# $seq =~ s{ ( $GAP* \ + $GAP* ) }{ ' ' x length($1) }xmseg;
194
195	3069					87471	$self->_set_seq($seq);
196	3069					10472	return $self;
197							}
198
199
200							sub trim {
201	3068			3068	1	5371	my $self = shift;
202
203	3068					108948	$self->replace_seq( qr{ $GAP+\z }xms, q{} );
204	3068					12910	return $self;
205							}
206
207
208							sub pad_to {
209	3068			3068	1	4767	my $self = shift;
210	3068					3948	my $bound = shift;
211
212	3068					95275	$self->append_seq( q{ } x ($bound - $self->seq_len) );
213	3068					9675	return $self;
214							}
215
216
217							sub clear_new_tag {
218	3			3	1	7	my $self = shift;
219
220	3					13	(my $full_id = $self->full_id) =~ s{$NEW_TAG\z}{}xms;
221	3					92	$self->set_seq_id( SeqId->new( full_id => $full_id ) );
222
223	3					97	return $self;
224							}
225
226							# site-wise methods (0-numbered)
227
228
229							sub all_states {
230	4203			4203	1	6718	my $self = shift;
231	4203					110771	return split //, $self->seq;
232							}
233
234
235							sub state_at { ## no critic (RequireArgUnpacking)
236	2002847			2002847	1	59025776	return shift->edit_seq(@_, 1);
237							}
238
239
240							sub delete_site { ## no critic (RequireArgUnpacking)
241	0			0	1	0	my $self = shift;
242
243	0					0	$self->edit_seq(@_, 1, q{});
244	0					0	return $self;
245							}
246
247
248							sub is_missing {
249	40			40	1	121	my $self = shift;
250	40					50	my $site = shift;
251
252	40					72	my $state = $self->state_at($site);
253	40	100				166	return 1 if $state =~ $PROTMISS;
254	38	100	100			155	return 1 if $state =~ $DNAMISS && (not $self->is_protein);
255	37					220	return 0; # X (or N depending on seq type)
256							}
257
258
259							sub is_gap {
260	139			139	1	264	my $self = shift;
261	139					171	my $site = shift;
262
263	139	100				248	return 1 if $self->state_at($site) =~ $GAP;
264	109					405	return 0;
265							}
266
267
268							# global methods
269
270							around qw(purity reverse_complemented_seq codons) => sub {
271							my $method = shift;
272							my $self = shift;
273
274							# Note: we return an explicit undef to emulate other accessor behavior
275							if ($self->is_protein) {
276							carp '[BMC] Warning: sequence looks like a protein; returning undef!';
277							return undef; ## no critic (ProhibitExplicitReturnUndef)
278							}
279
280							return $self->$method(@_);
281							};
282
283
284							sub nomiss_seq_len {
285	361			361	1	554	my $self = shift;
286
287	361	100				611	my $regex = $self->is_protein ? $PROTMISS : $DNAMISS;
288	361					763	(my $raw_str = $self->raw_str) =~ s/$regex//xmsg;
289							# TODO: decide how to handle ambiguous nucleotides
290
291	361					1090	return length $raw_str;
292							}
293
294
295							sub purity {
296							my $self = shift;
297
298							(my $pure_seq = $self->seq) =~ s/$NONPUREDNA//xmsg;
299							my $purity = 1.0 * length($pure_seq) / $self->seq_len;
300
301							return $purity;
302							}
303
304
305							sub reverse_complemented_seq {
306							my $self = shift;
307
308							# reverse complement and preserve case
309							# Note: RNA always becomes DNA
310							my $new_seq = scalar reverse $self->seq;
311							$new_seq =~ tr/ATUGCYRSWKMBDHVN/TAACGRYSWMKVHDBN/;
312							$new_seq =~ tr/atugcyrswkmbdhvn/taacgryswmkvhdbn/;
313
314							return $self->new( seq_id => $self->full_id, seq => $new_seq );
315							}
316
317
318							sub spliced_seq {
319	1			1	1	6	my $self = shift;
320	1					3	my $blocks = shift;
321
322	1					2	my $new_seq;
323	1					27	my $seq = $self->seq;
324	1					3	for my $block ( @{$blocks} ) {
	1					4
325	8					12	my ($start, $end) = @{$block};
	8					13
326	8					20	$new_seq .= substr $seq, $start - 1, $end - $start + 1;
327							}
328
329	1					6	return $self->new( seq_id => $self->full_id, seq => $new_seq );
330							}
331
332
333							sub raw_str {
334	518			518	1	716	my $self = shift;
335	518					12520	return _strip_gaps($self->seq);
336							}
337
338							sub raw_seq { ## no critic (RequireArgUnpacking)
339	0			0	0	0	carp '[BMC] Warning: Method raw_seq is deprecated; use raw_str instead!';
340	0					0	return shift->raw_str(@_);
341							}
342
343
344							sub wrapped_str {
345	113			113	1	193	my $self = shift;
346	113		50			260	my $chunk = shift // 60;
347
348	113	50				297	my $nowrap = $chunk < 0 ? 1 : 0;
349	113					3810	my $width = $self->seq_len;
350	113	50				285	$chunk = $width if $nowrap;
351
352	113					174	my $str;
353	113					335	for (my $site = 0; $site < $width; $site += $chunk) {
354	7533					244971	$str .= $self->edit_seq($site, $chunk) . "\n";
355							}
356
357	113					690	return $str;
358							}
359
360
361							sub codons {
362							my $self = shift;
363							my $frame = shift // 1; # defaults to frame +1
364
365							# get specified DNA strand
366							my $dna = $frame < 0 ? $self->reverse_complemented_seq->seq : $self->seq;
367							$dna =~ tr/Uu/Tt/; # ensure DNA
368
369							# split strand into codons beginning at specified frame
370							# ... and discard incomplete codons
371							my @codons;
372							for (my $i = (abs $frame) - 1; $i < length $dna; $i += 3) {
373							my $codon = substr $dna, $i, 3;
374							push @codons, $codon if length $codon == 3;
375							}
376
377							return \@codons;
378							}
379
380
381							# private subs
382
383							sub _strip_gaps {
384	554			554		859	my $seq = shift;
385
386	554					3872	$seq =~ s/$GAP+//xmsg;
387	554					3815	return $seq; # strip all gaps
388							}
389
390							__PACKAGE__->meta->make_immutable;
391							1;
392
393							__END__
394
395							=pod
396
397							=head1 NAME
398
399							Bio::MUST::Core::Seq - Nucleotide or protein sequence
400
401							=head1 VERSION
402
403							version 0.212530
404
405							=head1 SYNOPSIS
406
407							# TODO
408
409							=head1 DESCRIPTION
410
411							# TODO
412
413							=head1 CONSTRUCTORS
414
415							=head2 clone
416
417							=head2 reverse_complemented_seq
418
419							=head2 spliced_seq
420
421							=head1 ACCESSORS
422
423							=head2 all_states
424
425							=head2 state_at
426
427							=head2 delete_site
428
429							=head1 PROPERTIES
430
431							=head2 is_protein
432
433							=head2 is_rna
434
435							=head2 is_aligned
436
437							=head2 is_subseq_of
438
439							=head2 is_superseq_of
440
441							=head2 first_site
442
443							=head2 is_missing
444
445							=head2 is_gap
446
447							=head2 nomiss_seq_len
448
449							=head2 purity
450
451							=head1 MUTATORS
452
453							=head2 uc
454
455							=head2 recode
456
457							=head2 degap
458
459							=head2 gapify
460
461							=head2 spacify
462
463							=head2 trim
464
465							=head2 pad_to
466
467							=head2 clear_new_tag
468
469							=head1 MISC METHODS
470
471							=head2 raw_str
472
473							=head2 wrapped_str
474
475							=head2 codons
476
477							=head1 AUTHOR
478
479							Denis BAURAIN <denis.baurain@uliege.be>
480
481							=head1 CONTRIBUTORS
482
483							=for stopwords Catherine COLSON Arnaud DI FRANCO Valerian LUPO
484
485							=over 4
486
487							=item *
488
489							Catherine COLSON <ccolson@doct.uliege.be>
490
491							=item *
492
493							Arnaud DI FRANCO <arnaud.difranco@gmail.com>
494
495							=item *
496
497							Valerian LUPO <valerian.lupo@doct.uliege.be>
498
499							=back
500
501							=head1 COPYRIGHT AND LICENSE
502
503							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
504
505							This is free software; you can redistribute it and/or modify it under
506							the same terms as the Perl 5 programming language system itself.
507
508							=cut