File Coverage

blib/lib/Bio/Gonzales/Seq/Util.pm

Criterion	Covered	Total	%
statement	57	161	35.4
branch	8	54	14.8
condition	5	12	41.6
subroutine	12	26	46.1
pod	2	12	16.6
total	84	265	31.7

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Gonzales::Seq::Util;
2
3	5			5		122340	use warnings;
	5					20
	5					196
4	5			5		28	use strict;
	5					12
	5					95
5	5			5		27	use Carp;
	5					7
	5					309
6
7	5			5		31	use Scalar::Util qw/blessed/;
	5					12
	5					249
8	5			5		50	use Data::Dumper;
	5					11
	5					244
9	5			5		4098	use Bio::Gonzales::Matrix::IO;
	5					40
	5					496
10	5			5		54	use File::Which qw/which/;
	5					12
	5					263
11	5			5		2495	use Bio::Gonzales::Seq::IO;
	5					17
	5					429
12	5			5		37	use Bio::Gonzales::Seq;
	5					73
	5					110
13
14	5			5		72	use base 'Exporter';
	5					12
	5					10336
15							our ( @EXPORT, @EXPORT_OK, %EXPORT_TAGS );
16							our $VERSION = '0.083'; # VERSION
17
18							@EXPORT = qw();
19							%EXPORT_TAGS = ();
20							@EXPORT_OK = qw(
21							pairwise_identity_l
22							pairwise_identity_s
23							pairwise_identity_gaps_l
24							pairwise_identity_gaps_s
25							pairwise_identities
26							map_seqids
27							seqid_mapper
28							crc64
29							strand_convert
30							seq_lengths
31							seq_apply
32							revcom_seq_string
33							);
34
35							our %STRAND_CHAR_TABLE = (
36							'+' => 1,
37							'-' => -1,
38							'.' => 0,
39							-1 => '-',
40							1 => '+',
41							0 => '.',
42							);
43
44							our $BLASTDB_CMD = which('blastdbcmd');
45							our $SAMTOOLS_CMD = which('samtools');
46
47							sub strand_convert {
48	142	100	33	142	0	780	if ( @_ && @_ > 0 && $_[-1] && exists( $STRAND_CHAR_TABLE{ $_[-1] } ) ) {
			66
			66
49	1					6	return $STRAND_CHAR_TABLE{ $_[-1] };
50							} else {
51	141					385	return '.';
52							}
53							}
54
55							sub seq_lengths {
56	0			0	0	0	my $f = shift;
57
58	0					0	my $d;
59	0	0				0	if ( -f $f . ".fai" ) {
		0
		0
60	0					0	$d = mslurp( $f . ".fai", { header => undef } );
61							} elsif ( -f $f . ".nhr" ) {
62	0	0				0	open my $fh, '-\|', $BLASTDB_CMD, '-db', $f, '-entry', 'all', '-outfmt', "%a\t%l"
63							or die "Can't open filehandle: $!";
64	0					0	$d = mslurp( $fh, { header => undef } );
65	0					0	close $fh;
66							} elsif ($SAMTOOLS_CMD) {
67	0	0				0	system( 'samtools', 'faidx', $f ) == 0 or die "system failed: $?";
68	0	0				0	$d = mslurp( $f . ".fai", { header => undef } ) if ( -f $f . ".fai" );
69							}
70
71							# nothing worked so far
72	0	0				0	unless ($d) {
73	0					0	say STDERR "could not use samtools or blast indices, using std method";
74	0					0	my @lengths;
75	0					0	my $fit = faiterate($f);
76	0					0	while ( my $seq = $fit->() ) {
77	0					0	push @lengths, [ $seq->id, $seq->length ];
78							}
79	0					0	$d = \@lengths;
80							}
81
82	0					0	my %sl;
83	0					0	for my $r (@$d) {
84	0	0				0	die "double ID" if ( $sl{ $r->[0] } );
85	0					0	$sl{ $r->[0] } = $r->[1];
86							}
87
88	0					0	return \%sl;
89							}
90
91							sub revcom_seq_string {
92	0			0	0	0	return Bio::Gonzales::Seq::_revcom_from_string(@_);
93							}
94
95							sub seq_apply {
96	0			0	0	0	my ( $f, $sub, $pattern ) = @_;
97
98	0	0	0			0	die "file or sub ref not correct" unless ( ref $sub eq 'CODE' && -f $f );
99	0					0	my $seq_lengths = seq_lengths($f);
100	0					0	my @res;
101	0					0	my $num = keys %$seq_lengths;
102	0					0	for my $sid ( keys %$seq_lengths ) {
103	0					0	my $spat;
104	0	0				0	if ($pattern) {
105	0					0	$spat = $pattern;
106	0					0	$spat =~ s/\{id\}/$sid/g;
107	0					0	$spat =~ s/\{begin\}/1/g;
108	0					0	$spat =~ s/\{end\}/$seq_lengths->{$sid}/g;
109							}
110	0					0	push @res, $sub->( { id => $sid, begin => 1, end => $seq_lengths->{$sid}, num => $num }, $spat );
111							}
112	0					0	return \@res;
113							}
114
115							sub pairwise_identity_l {
116	0			0	0	0	my ( $seq1, $seq2 ) = @_;
117	0					0	return _pairwise_identity_generic( $seq1, $seq2, 1, 0 );
118							}
119
120							sub pairwise_identity_s {
121	0			0	0	0	my ( $seq1, $seq2 ) = @_;
122	0					0	return _pairwise_identity_generic( $seq1, $seq2, 0, 0 );
123							}
124
125							sub pairwise_identity_gaps_l {
126	0			0	0	0	my ( $seq1, $seq2 ) = @_;
127	0					0	return _pairwise_identity_generic( $seq1, $seq2, 1, 1 );
128							}
129
130							sub _pairwise_identity_generic {
131	0			0		0	my ( $seq1, $seq2, $use_longest, $include_gaps ) = @_;
132
133	0	0				0	$seq1 = $seq1->seq if ( blessed $seq1);
134	0	0				0	$seq2 = $seq2->seq if ( blessed $seq2);
135
136	0					0	my $seq1_gaps = 0;
137	0					0	my $seq2_gaps = 0;
138	0	0				0	if ( !$include_gaps ) {
139	0					0	$seq1_gaps = $seq1 =~ y/-/./;
140	0					0	$seq2_gaps = () = $seq2 =~ /-/g;
141							}
142
143	0					0	my $mask = $seq1 ^ $seq2;
144	0					0	my $matches = $mask =~ tr/\x0/\x0/;
145
146	0					0	my $longest;
147							my $shortest;
148	0	0				0	if ( length($seq2) - $seq2_gaps < length($seq1) - $seq1_gaps ) {
149	0					0	$longest = length($seq1) - $seq1_gaps;
150	0					0	$shortest = length($seq2) - $seq2_gaps;
151							} else {
152	0					0	$shortest = length($seq1) - $seq1_gaps;
153	0					0	$longest = length($seq2) - $seq2_gaps;
154							}
155
156	0	0				0	if ($use_longest) {
157	0					0	return ( $matches / $longest );
158							} else {
159	0					0	return ( $matches / $shortest );
160							}
161							}
162
163							sub pairwise_identity_gaps_s {
164	0			0	0	0	my ( $seq1, $seq2 ) = @_;
165
166	0					0	return _pairwise_identity_generic( $seq1, $seq2, 0, 1 );
167							}
168
169							sub pairwise_identities {
170	0			0	0	0	my ( $sub, @seqs ) = @_;
171
172							#creating an upper triangular matrix
173
174	0					0	my @dist;
175	0					0	for ( my $i = 0; $i < @seqs; $i++ ) {
176	0					0	push @dist, [];
177							}
178
179	0					0	my $i;
180	0					0	for ( $i = 0; $i < @seqs - 1; $i++ ) {
181	0					0	$dist[$i][$i] = 1;
182	0					0	for ( my $j = $i + 1; $j < @seqs; $j++ ) {
183	0					0	$dist[$j][$i] = $sub->( $seqs[$j], $seqs[$i] );
184							}
185							}
186	0					0	$dist[$i][$i] = 1;
187
188	0					0	return \@dist;
189							}
190
191							sub map_seqids {
192	0			0	1	0	my ( $seqs, $pattern ) = @_;
193
194	0					0	my %map;
195
196	0					0	my $i = seqid_mapper($pattern);
197
198	0					0	for my $s (@$seqs) {
199	0					0	my ( $new, $old ) = $i->($s);
200	0					0	$map{$new} = $old;
201							}
202
203	0					0	return \%map;
204							}
205
206							sub seqid_mapper {
207	0			0	1	0	my ( $pattern, @extra_args ) = @_;
208	0	0				0	$pattern = 's%08d' unless defined $pattern;
209
210	0					0	my $handler;
211	0	0				0	if ( ref $pattern eq 'CODE' ) {
		0
212	0					0	$handler = $pattern;
213							} elsif ( ref $pattern eq 'HASH' ) {
214	0					0	my $i = 1;
215							$handler = sub {
216	0			0		0	my $id = shift;
217	0	0				0	return defined $pattern->{$id} ? $pattern->{$id} : 'unknown_' . $i++;
218	0					0	};
219							} else {
220	0					0	my $i = 1;
221							$handler = sub {
222	0			0		0	return sprintf $pattern, $i++;
223
224	0					0	};
225							}
226
227							return sub {
228	0			0		0	my ($seq) = @_;
229	0	0				0	return unless ($seq);
230	0	0				0	unless ( blessed($seq) ) {
231	0	0				0	if ( ref $seq eq 'ARRAY' ) {
232	0					0	croak "you supplied an array to the mapper function, use single seq Bio::Gonzales::Seq objects";
233							} else {
234	0					0	confess Dumper $seq ;
235							}
236							}
237
238	0					0	my $orig_id = $seq->id;
239	0					0	my $id = $handler->( $orig_id, @extra_args );
240	0					0	$seq->id($id);
241
242	0					0	return ( $id, $orig_id );
243	0					0	};
244							}
245
246							{
247							my $POLY64REVh = 0xd8000000;
248							my @CRCTableh = 256;
249							my @CRCTablel = 256;
250							my $initialized;
251
252							sub crc64 {
253	25			25	0	35	my $sequence = shift;
254	25					43	my $crcl = 0;
255	25					32	my $crch = 0;
256	25	100				53	if ( !$initialized ) {
257	1					2	$initialized = 1;
258	1					4	for ( my $i = 0; $i < 256; $i++ ) {
259	256					305	my $partl = $i;
260	256					301	my $parth = 0;
261	256					432	for ( my $j = 0; $j < 8; $j++ ) {
262	2048					2510	my $rflag = $partl & 1;
263	2048					2428	$partl >>= 1;
264	2048	50				3211	$partl \|= ( 1 << 31 ) if $parth & 1;
265	2048					2432	$parth >>= 1;
266	2048	100				4098	$parth ^= $POLY64REVh if $rflag;
267							}
268	256					346	$CRCTableh[$i] = $parth;
269	256					446	$CRCTablel[$i] = $partl;
270							}
271							}
272
273	25					1021	foreach ( split '', $sequence ) {
274	4581					6364	my $shr = ( $crch & 0xFF ) << 24;
275	4581					5502	my $temp1h = $crch >> 8;
276	4581					5583	my $temp1l = ( $crcl >> 8 ) \| $shr;
277	4581					6582	my $tableindex = ( $crcl ^ ( unpack "C", $_ ) ) & 0xFF;
278	4581					6057	$crch = $temp1h ^ $CRCTableh[$tableindex];
279	4581					6550	$crcl = $temp1l ^ $CRCTablel[$tableindex];
280							}
281	25	50				400	return wantarray ? ( $crch, $crcl ) : sprintf( "%08X%08X", $crch, $crcl );
282							}
283							}
284
285							1;
286							__END__
287
288							=head1 NAME
289
290
291
292							=head1 SYNOPSIS
293
294							use Bio::Gonzales::Seq::Util qw(
295							pairwise_identity_l
296							pairwise_identity_s
297							pairwise_identity_gaps_l
298							pairwise_identity_gaps_s
299							pairwise_identities
300							map_seqids
301							seqid_mapper
302							overlaps
303							cluster_overlapping_ranges
304							);
305
306							=head1 DESCRIPTION
307
308							=head1 SUBROUTINES
309
310							=over 4
311
312							=item B<< $map = map_seqids($seqs, I<$pattern>) >>
313
314							Maps all sequence ids of C<$seqs> in situ. If C<$pattern> is not given, C<s%9d> is taken as default.
315
316
317							=item B<< $clustered_ranges = cluster_overlapping_ranges(\@ranges) >>
318
319							This function takes some ranges and clusters them by overlap.
320
321							@ranges = (
322							[ $start, $stop, $some, $custom, $elements ],
323							[ ... ],
324							...
325							);
326
327							$clustered_ranges = [
328							# first cluster
329							[
330							[ $start, $stop, $some, $custom, $elements ],
331							...
332							],
333							# next cluster
334							[
335							[ $start, $stop, $some, $custom, $elements ],
336							...
337							],
338							...
339							];
340
341
342							=item B<< $map = map_seqids(\@seqs!, $pattern) >>
343
344							Wrapper around C<seqid_mapper()>, maps the ids of @seqs and returns the map.
345							This function works directly on the sequences.
346
347							=item B<< $mapper = seqid_mapper(\%idmap) >>
348
349							Create a mapper that maps given sequences to new ids generated by the argument
350							given. A hash as argument will be used as mapping base, taking the key as old and the
351							value as new id. In case the sequence id is non-existent in the hash, an artificial id
352							following the pattern C<unknown_$i> with C<$i> running from 0
353							onwards will be generated..
354
355							=item B<< $mapper = seqid_mapper(\&handler) >>
356
357							Create a mapper that maps given sequences to new ids generated by successive
358							calls to the handler. The handler will get the existing/original id as
359							argument and shall return a new id. A simple mapper would be:
360
361
362							my $i = 1;
363							my $mapper = seqid_mapper(sub { sprintf "id%d", $i++ });
364							or
365							my $mapper = seqid_mapper(sub { my $id = shift; $id =~ s/pep/cds/; return $id});
366
367							my ($old_id, $new_id) = $mapper->($sequence_object);
368
369							The sequence object WILL BE ALTERED IN SITU.
370
371							=item B<< $mapper = seqid_mapper($pattern) >>
372
373							Use pattern as basis for sequence id mapping, "%s" or "%d" must be included
374							ONLY ONCE and will be substituted by a couter, running from 0 to INF.
375
376							=item B<< $mapper = seqid_mapper() >>
377
378							Same as C<$mapper = seqid_mapper("s%9d")>
379
380							=item B<< overlaps([$a_begin, $a_end], [$b_begin, $b_end]) >>
381
382							Returns true if a overlaps with b, false otherwise.
383
384							=back
385
386							=head1 SEE ALSO
387
388							=head1 AUTHOR
389
390							jw bargsten, C<< <joachim.bargsten at wur.nl> >>
391
392							=cut