File Coverage

blib/lib/Bio/Gonzales/Seq.pm

Criterion	Covered	Total	%
statement	75	155	48.3
branch	18	82	21.9
condition	10	40	25.0
subroutine	22	39	56.4
pod	13	27	48.1
total	138	343	40.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Gonzales::Seq;
2
3	9			9		118266	use v5.10;
	9					58
4
5	9			9		615	use Mouse;
	9					31017
	9					52
6
7	9			9		11528	use overload '""' => 'all';
	9					6267
	9					64
8	9			9		704	use Carp;
	9					21
	9					587
9	9			9		716	use Data::Dumper;
	9					7239
	9					551
10	9			9		4727	use Digest::SHA1 qw/sha1_hex/;
	9					7147
	9					560
11	9			9		81	use Digest::MD5 qw/md5_hex/;
	9					33
	9					460
12	9			9		4732	use Digest::SHA2;
	9					7121
	9					23544
13
14							our $WIDTH = 80;
15							our $VERSION = '0.083'; # VERSION
16
17							has id => ( is => 'rw', required => 1 );
18							has desc => ( is => 'rw', default => '' );
19							has seq => ( is => 'rw', required => 1 );
20							has delim => ( is => 'rw', default => " " );
21							has info => ( is => 'rw', default => sub { {} } );
22							has gaps => ( is => 'rw', lazy_build => 1 );
23							has length => ( is => 'rw', lazy_build => 1 );
24
25							sub _build_gaps {
26	0			0		0	my $gaps = ( shift->seq() =~ tr/-.// );
27	0					0	return $gaps;
28							}
29
30	1			1	0	9	sub ungapped_seq { return shift->gapless_seq(@_); }
31
32							sub _build_length {
33	5			5		20	return CORE::length( shift->seq() );
34							}
35
36							sub BUILDARGS {
37	61			61	1	735	my $class = shift;
38
39	61					94	my %a;
40	61	50				129	if ( scalar @_ == 1 ) {
41	0	0				0	( ref( $_[0] ) eq 'HASH' )
42							\|\| $class->meta->throw_error("Single parameters to new() must be a HASH ref");
43	0					0	%a = %{ $_[0] };
	0					0
44							} else {
45	61					219	%a = @_;
46							}
47
48							delete $a{delim}
49	61	100	100			248	if ( exists( $a{delim} ) && !defined( $a{delim} ) );
50							delete $a{desc}
51	61	100	100			190	if ( exists( $a{desc} ) && !defined( $a{desc} ) );
52
53	61					122	$a{seq} = _filter_seq( $a{seq} );
54
55	61					577	return \%a;
56							}
57
58							sub as_hash {
59	0			0	0	0	my $self = shift;
60	0					0	return { id => $self->id, desc => $self->desc, seq => $self->seq, info => $self->info };
61							}
62
63							sub Format_seq_string {
64	1			1	0	3	my ( $str, $width ) = @_;
65	1		33			5	$width //= $WIDTH;
66
67	1	50	33			8	if ( defined $str && length($str) > 0 ) {
68	1					15	$str =~ tr/ \t\n\r//d; # Remove whitespace and numbers
69	1					6	$str =~ s/\d+//g;
70	1					35	$str =~ s/(.{1,$width})/$1\n/g;
71	1					29	return $str;
72							}
73							}
74
75							sub def {
76	0			0	1	0	my ($self) = @_;
77
78	0					0	return join( $self->delim, $self->id, $self->desc );
79							}
80
81							before 'desc' => sub {
82							my $self = shift;
83
84							if ( @_ == 1 && $_[0] eq '' ) {
85							$self->delim('');
86							}
87							};
88
89							sub _filter_seq {
90	63			63		112	my ($seq) = @_;
91
92	63	100				235	$seq = join( "", @$seq )
93							if ( ref($seq) eq 'ARRAY' );
94	63					305	$seq =~ y/ \t\n\r\f//d;
95
96	63					146	return $seq;
97							}
98
99							around 'seq' => sub {
100							my $orig = shift;
101							my $self = shift;
102
103							return $self->$orig()
104							unless @_;
105
106							return $self->$orig( _filter_seq(@_) );
107							};
108
109							sub gapless_seq {
110	1			1	1	3	( my $seq = shift->seq ) =~ tr/-.//d;
111	1					5	return $seq;
112							}
113
114							sub rm_gaps {
115	0			0	0	0	my ($self) = @_;
116
117	0					0	$self->seq( $self->gapless_seq );
118	0					0	return $self;
119							}
120
121							sub clone {
122	0			0	1	0	my ($self) = @_;
123
124	0					0	return __PACKAGE__->new(
125							id => $self->id,
126							desc => $self->desc,
127							seq => $self->seq,
128							delim => $self->delim,
129							info => $self->info
130							);
131							#shift->clone_object(@_)
132							}
133
134							sub clone_empty {
135	0			0	1	0	my ($self) = @_;
136
137	0					0	return __PACKAGE__->new(
138							id => $self->id,
139							desc => $self->desc,
140							seq => '',
141							delim => $self->delim,
142							info => $self->info
143							);
144							}
145
146	1			1	1	7	sub display_id { shift->id(@_) }
147
148							sub ungapped_length {
149	0			0	1	0	my ($self) = @_;
150
151	0					0	return $self->length - $self->gaps;
152							}
153
154	0			0	0	0	sub sequence { shift->seq(@_) }
155
156							sub stringify {
157	220			220	0	327	my ($self) = @_;
158
159	220	100				676	return ">" . $self->id . ( $self->desc ? $self->delim . $self->desc : "" ) . "\n" . $self->seq . "\n";
160							}
161
162	220			220	1	459	sub all { shift->stringify(@_) }
163
164	0			0	1	0	sub all_formatted { shift->stringify_pretty(@_) }
165
166							sub stringify_pretty {
167	1			1	0	4	my ( $self, $width ) = @_;
168	1		33			6	$width //= $WIDTH;
169
170							return
171	1	50				11	">"
172							. $self->id
173							. ( $self->desc ? $self->delim . $self->desc : "" ) . "\n"
174							. Format_seq_string( $self->seq );
175							}
176
177	1			1	1	4	sub all_pretty { shift->stringify_pretty(@_) }
178
179	0			0	0	0	sub pretty { shift->stringify_pretty(@_) }
180
181							sub as_primaryseq {
182	0			0	1	0	my ($self) = @_;
183
184	0					0	return Bio::PrimarySeq->new(
185							-seq => $self->seq,
186							-id => $self->id,
187							-desc => $self->desc,
188							-alphabet => $self->guess_alphabet,
189							-direct => 1,
190							);
191							}
192
193							sub guess_alphabet {
194	1			1	0	3	my ($self) = @_;
195
196	1					3	my $str = $self->seq();
197	1					6	$str =~ s/[-.?*]//gi;
198
199	1					2	my $alphabet;
200
201							# Check for sequences without valid letters
202	1					2	my $total = CORE::length($str);
203
204	1	50				5	if ( $str =~ m/[EFIJLOPQXZ]/i ) {
205							# Start with a safe method to find proteins.
206							# Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
207	0					0	$alphabet = 'protein';
208							} else {
209							# Alphabet is unsure, could still be DNA, RNA or protein.
210							# DNA and RNA contain mostly A, T, U, G, C and N, but the other letters
211							# they use are also among the 15 valid letters that a protein sequence
212							# can contain at this stage. Make our best guess based on sequence
213							# composition. If it contains over 70% of ACGTUN, it is likely nucleic.
214	1	50				8	if ( ( $str =~ tr/ATUGCNatugcn// ) / $total > 0.7 ) {
215	1	50				4	if ( $str =~ m/U/i ) {
216	0					0	$alphabet = 'rna';
217							} else {
218	1					3	$alphabet = 'dna';
219							}
220							} else {
221	0					0	$alphabet = 'protein';
222							}
223							}
224	1					4	return $alphabet;
225							}
226
227							sub revcom {
228	1			1	1	8	my ($self) = @_;
229
230	1					4	$self->seq( _revcom_from_string( $self->seq, $self->guess_alphabet ) );
231
232	1					3	return $self;
233							}
234
235							sub revcom_seq {
236	0			0	0	0	my $self = shift;
237	0					0	return _revcom_from_string( $self->seq, $self->guess_alphabet );
238							}
239
240							sub subseq {
241	0			0	1	0	my ( $self, $range, $c ) = @_;
242
243	0	0	0			0	$range = [ $range->start, $range->end, $range->strand ]
244							if ( blessed($range) && $range->isa('Bio::Gonzales::Feat') );
245	0					0	my ( $seq, $corrected_range ) = $self->subseq_as_string( $range, $c );
246	0					0	my ( $b, $e, $strand, @rest ) = @$corrected_range;
247
248	0					0	my $keep_original_id = $c->{keep_id};
249
250	0					0	my $new_seq = $self->clone_empty;
251	0					0	$new_seq->seq($seq);
252
253	0	0				0	if ( $c->{attach_details} ) {
254	0					0	my $info = $new_seq->info;
255	0					0	$info->{subseq} = { from => $b + 1, to => $e };
256	0	0				0	$info->{subseq}{strand} = ( $strand < 0 ? '-' : ( $strand > 0 ? '+' : '.' ) );
		0
257	0	0				0	$info->{subseq}{rest} = \@rest if ( @rest > 0 );
258							}
259
260	0	0				0	unless ($keep_original_id) {
261	0					0	$new_seq->id( $new_seq->id . "\|" . ( $b + 1 ) . "..$e" );
262	0	0				0	$new_seq->id( $new_seq->id . "\|" . ( $strand < 0 ? '-' : ( $strand > 0 ? '+' : '.' ) ) )
		0
		0
263							if ( defined($strand) ); #print also nothing if strand == 0
264	0	0				0	$new_seq->id( $new_seq->id . "\|" . join( "\|", @rest ) ) if ( @rest > 0 );
265							}
266	0					0	return $new_seq;
267							}
268
269							sub sha1_checksum {
270	0			0	0	0	return sha1_hex( shift->seq );
271							}
272
273							sub md5_checksum {
274	0			0	0	0	return md5_hex( shift->seq );
275							}
276
277							sub sha512_checksum {
278	0			0	0	0	my $self = shift;
279	0					0	my $sha2 = Digest::SHA2->new(512);
280	0					0	$sha2->add( $self->seq );
281	0					0	return $sha2->hexdigest;
282							}
283
284							sub subseq_as_string {
285	0			0	0	0	my ( $self, $range, $c ) = @_;
286
287	0	0	0			0	confess "you use the deprecated version of subseq" if ( defined($c) && ref $c ne 'HASH' );
288
289	0					0	my ( $b, $e, $strand, @rest ) = @$range;
290	0	0				0	if ( $c->{relaxed_range} ) {
291							#if b or e are not defined, just take the beginning and end as given
292							#warn "requested invalid subseq range ($b,$e;$strand) from " . $self->id . ", using relaxed boundaries."
293							#unless ( $b && $e );
294	0		0			0	$b \|\|= '^';
295	0		0			0	$e \|\|= '$';
296							}
297
298	0	0	0			0	confess "requested invalied subseq range ($b,$e;$strand) from "
299							. $self->id . "\n"
300							. Dumper($range)
301							. Dumper( $self->clone_empty )
302							unless ( $b && $e );
303
304	0					0	my $seq_len = $self->length;
305
306	0	0				0	$b = 1 if ( $b eq '^' );
307	0	0				0	$b = $seq_len if ( $b eq '$' );
308
309	0	0				0	$e = 1 if ( $e eq '^' );
310	0	0				0	$e = $seq_len if ( $e eq '$' );
311
312	0	0	0			0	croak "subseq range error: $b > $e" if ( $b > $e && $b > 0 && $e > 0 );
			0
313
314							#count from the end,
315	0	0				0	if ( $b < 0 ) {
316	0	0				0	$b = $c->{wrap} ? $seq_len + $b + 1 : 1;
317							}
318	0	0				0	if ( $e < 0 ) {
319	0	0				0	$e = $c->{wrap} ? $seq_len + $e + 1 : $seq_len;
320							}
321
322							#get the index right for substr.
323	0					0	$b--;
324
325	0					0	my $seq = substr( $self->{seq}, $b, $e - $b );
326
327	0	0	0			0	if ( $strand && $strand < 0 ) {
328	0	0				0	if ( $c->{relaxed_revcom} ) {
329	0					0	$seq =~ y/AGCTNagctn/N/c;
330							} else {
331	0	0				0	confess "cannot create reverse complement, sequence contains non-AGCTN characters"
332							if ( $seq =~ /[^AGCTN]/i );
333							}
334
335	0					0	$seq = _revcom_from_string( $seq, $self->guess_alphabet );
336							}
337
338	0	0				0	return wantarray ? ( $seq, [ $b, $e, $strand, @rest ] ) : $seq;
339							}
340
341							sub _revcom_from_string {
342	1			1		3	my ( $string, $alphabet ) = @_;
343
344							# Check that reverse-complementing makes sense
345	1	50				6	if ( $alphabet eq 'protein' ) {
346	0					0	confess("Sequence is a protein. Cannot revcom.");
347							}
348	1	50	33			6	if ( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
349	0					0	carp "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, "
350							. "but unsure if this is right.";
351							}
352
353							# If sequence is RNA, map to DNA (then map back later)
354	1	50				4	if ( $alphabet eq 'rna' ) {
355	0					0	$string =~ tr/uU/tT/;
356							}
357
358							# Reverse-complement now
359	1					2	$string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
360	1					2	$string = CORE::reverse $string;
361
362							# Map back RNA to DNA
363	1	50				3	if ( $alphabet eq 'rna' ) {
364	0					0	$string =~ tr/tT/uU/;
365							}
366
367	1					4	return $string;
368							}
369
370							1;
371
372							__END__
373
374							=head1 NAME
375
376							Bio::Gonzales::Seq - Gonzales Sequence Object
377
378							=head1 SYNOPSIS
379
380							my $seq = Bio::Gonzales::Seq->new(id => $id, seq => $seq, desc? => '', delim? => ' ');
381
382							print $seq->def;
383							print $seq->desc;
384
385							=head1 DESCRIPTION
386
387							=head1 METHODS
388
389							=over 4
390
391							=item B<< $seq->id >>
392
393							=item B<< $seq->desc >>
394
395							The description of a sequence object. In case of FASTA-files, this corresponds
396							to the text after the first space.
397
398							=item B<< $seq->seq >>
399
400							=item B<< $seq->delim >>
401
402							=item B<< $seq->info >>
403
404							An hash of additional stuff you can store about the sequence
405
406							=item B<< $seq->gaps >>
407
408							=item B<< $seq->length >>
409
410							=item B<< $seq->def >>
411
412							The definition also known as the FASTA header line w/o ">"
413
414							=item B<< $seq->clone >>
415
416							Clone the sequence
417
418							=item B<< $seq->clone_empty >>
419
420							Clone the sequence properties, do not clone the sequence string.
421
422							=item B<< $seq->display_id >>
423
424							Same as C<$seq->id>
425
426							=item B<< $seq->ungapped_length >>
427
428							=item B<< $seq->all >>
429
430							=item B<< "$seq" >>
431
432							The complete sequence in fasta format, ready to be written.
433
434							=item B<< $seq->all_formatted >>
435
436							=item B<< $seq->all_pretty >>
437
438							The complete sequence in I<pretty> fasta format, ready to be written.
439
440							=item B<< $seq->as_primaryseq >>
441
442							Return a Bio::PrimarySeqI compatible object, so you can use it in BioPerl.
443
444							=item B<< $seq_string = $seq->gapless_seq >>
445
446							=item B<< $seq->rm_gaps! >>
447
448							=item B<< $seq->revcom >>
449
450							Create the reverse complement of the sequence. B<THIS FUNCTION ALTERS THE SEQUENCE OBJECT>.
451
452							=item B<< $seq->subseq( [ $begin, $end, $strand , @rest ], \%c ) >>
453
454							Gets a subseq from C<$seq>. Config options can be:
455
456							%c = (
457							keep_id => 1, # keeps the original id of the sequence
458							attach_details => 1, # keeps the original range and strand in $seq->info->{subseq}
459							wrap => 1, # see further down
460							relaxed_range => 1, # substitute 0 or undef for $begin with '^' and for $end with '$'
461							relaxed_revcom => 1, # substitute N for all characters that are non-AGCTN before doing a reverse complement
462							);
463
464							There are several possibilities for C<$begin> and C<$end>:
465
466							GGCAAAGGA ATGATGGTGT GCAGGCTTGG CATGGGAGAC
467							^..........^ (1,11) OR ('^', 11)
468							^.....................................^ (4,'$')
469							^..............^ (21,35) { with wrap on: OR (-19,35) OR (-19, -5) }
470							^..................^ (21,35) { with wrap on: OR (-19,'$') }
471
472							=over 4
473
474							=item C<wrap>
475
476							The default is to limit all negative
477							values to the sequence boundaries, so a negative begin would be equal to 1 or
478							'^' and a negative end would be equal to '$'.
479
480							=back
481
482							See also L<Bio::Gonzales::Seq::IO/fasubseq>.
483
484							=back
485
486							=over 4
487
488							=item B<< my $reverse_complement_string = Bio::Gonzales::Seq::_revcom_from_string($seq_string, $alphabet) >>
489
490							Stolen from L<Bio::Perl>. Alphabet can be 'rna' or 'dna';
491
492							=back
493
494							=head1 AUTHOR
495
496							jw bargsten, C<< <joachim.bargsten at wur.nl> >>
497
498							=cut