File Coverage

blib/lib/Bio/DOOP/Util/Run/Fuzznuc.pm

Criterion	Covered	Total	%
statement	9	172	5.2
branch	0	32	0.0
condition			n/a
subroutine	3	13	23.0
pod	10	10	100.0
total	22	227	9.6

line	stmt	bran	sub	pod	time	code
1						package Bio::DOOP::Util::Run::Fuzznuc;
2
3	1		1		6	use strict;
	1				1
	1				33
4	1		1		6	use warnings;
	1				2
	1				30
5	1		1		5	use Carp qw(cluck carp verbose);
	1				2
	1				2004
6
7						=head1 NAME
8
9						Bio::DOOP::Util::Run::Fuzznuc - Fuzznuc module
10
11						=head1 VERSION
12
13						Version 0.7
14
15						=cut
16
17						our $VERSION = '0.7';
18
19						=head1 SYNOPSIS
20
21						#!/usr/bin/perl -w
22
23						use Bio::DOOP::DOOP;
24
25						$db = Bio::DOOP::DBSQL->connect("user","pass","doop-plant-1_5","localhost");
26
27						@list = ("81001020","81001110","81001200","81001225","81001230","81001290","81001470","81001580","81001610","81001620","81001680");
28
29						$fuzznuc = Bio::DOOP::Util::Run::Fuzznuc->new($db,'500','M',\@list,"/data/DOOP/dummy.txt");
30
31						print $fuzznuc->get_tmp_file_name,"\n";
32
33						$error = $fuzznuc->run('TTGGGC' , 1 , 0);
34
35						if ($error == -1){
36						die "No results or error!\n";
37						}
38
39						@res = @{$fuzznuc->get_results};
40
41						for $result (@res){
42						print $$result[0]->get_id,"\| ",$$result[1]," ",$$result[2]," ",$$result[3]," ",$$result[4],"\n";
43						}
44
45						=head1 DESCRIPTION
46
47						This module is a wrapper for the EMBOSS (http://emboss.sourceforge.net) program fuzznuc. You can search
48						for patterns in the promoter sequences.
49
50						=head1 AUTHORS
51
52						Tibor Nagy, Godollo, Hungary and Endre Sebestyen, Martonvasar, Hungary
53
54						=head1 METHODS
55
56						=head2 new
57
58						Create new Fuzznuc object.
59
60						Arguments:
61
62						1. Bio::DOOP::DBSQL object
63						2. promoter type (500, 1000, 3000)
64						3. subset type (depends on reference species)
65						4. arrayref of clusters
66						5. temporary file name (default: /tmp/fuzznuc_run.txt, will contain fasta sequences)
67
68						$fuzznuc = Bio::DOOP::Util::Run::Fuzznuc->new($db,500,'M',\@list,'/tmp/tmpfile');
69
70						=cut
71
72						sub new {
73	0		0	1		my $self = {};
74	0					my $dummy = shift;
75	0					my $db = shift;
76	0					my $promo_type = shift;
77	0					my $subset_type = shift;
78	0					my $cluster_id_list = shift;
79	0					my $tmp_filename = shift;
80
81						# TODO use File::Temp module
82	0	0				if (!$tmp_filename) { $tmp_filename = "/tmp/fuzznuc_run.txt" }
	0
83	0					open TMP,">$tmp_filename";
84	0					for my $cl_id (@{$cluster_id_list}){
	0
85	0					my $cl = Bio::DOOP::Cluster->new($db,,$cl_id,$promo_type);
86	0	0				if ($cl == -1){ next }
	0
87	0					my $subset = $cl->get_subset_by_type($subset_type);
88	0	0				if ($subset == -1){ next }
	0
89	0					my @seqs = @{$subset->get_all_seqs};
	0
90	0					for my $seq (@seqs){
91	0					print TMP ">",$seq->get_id,"\n";
92	0					print TMP $seq->get_raw_seq,"\n\n";
93						}
94						}
95	0					close TMP;
96	0					$self->{DB} = $db;
97	0					$self->{CLLIST} = $cluster_id_list;
98	0					$self->{TMP_FILE} = $tmp_filename;
99
100	0					bless $self;
101	0					return($self);
102						}
103
104						=head2 new_by_file
105
106						Create new fuzznuc object from query file, containing cluster ids.
107
108						Arguments:
109
110						1. Bio::DOOP::DBSQL object
111						2. promoter type (500, 1000, 3000)
112						3. subset type (depends on reference species)
113						4. file containing cluster ids
114						5. temporary file name (default: /tmp/fuzznuc_run.txt, will contain fasta sequences)
115
116						$fuzznuc = Bio::DOOP::Util::Run::Fuzznuc->new_by_file($db,500,'M','/tmp/clusters.txt','/tmp/tmpfile');
117
118						=cut
119
120						sub new_by_file {
121	0		0	1		my $self = {};
122	0					my $dummy = shift;
123	0					my $db = shift;
124	0					my $promo_type = shift;
125	0					my $subset_type = shift;
126	0					my $filename = shift;
127	0					my $tmp_filename = shift;
128	0					my @cluster_id_list;
129
130						# TODO use File::Temp module
131	0	0				if (!$tmp_filename) { $tmp_filename = "/tmp/fuzznuc_run.txt" }
	0
132
133	0	0				open CLUSTER_ID_FILE,$filename or cluck("No such file or directory!\n");
134	0	0				open TMP,">$tmp_filename" or cluck("Can't write to the temporary file!\n");
135	0					while(<CLUSTER_ID_FILE>){
136	0					chomp;
137	0					my $cl_id = $_;
138	0					push @cluster_id_list,$cl_id;
139	0					my $cl = Bio::DOOP::Cluster->new($db,,$cl_id,$promo_type);
140	0					my $subset = $cl->get_subset_by_type($subset_type);
141	0	0				if ($subset == -1) { next }
	0
142	0					my @seqs = @{$subset->get_all_seqs};
	0
143	0					for my $seq (@seqs){
144	0					print TMP ">",$seq->get_id,"\n";
145	0					print TMP $seq->get_raw_seq,"\n\n";
146						}
147						}
148	0					close CLUSTER_ID_FILE;
149	0					close TMP;
150
151	0					$self->{DB} = $db;
152	0					$self->{CLLIST} = \@cluster_id_list;
153	0					$self->{TMP_FILE} = $tmp_filename;
154
155	0					bless $self;
156	0					return($self);
157						}
158
159						=head2 new_by_tmp
160
161						Create new fuzznuc object from existing temporary file,
162						containing query sequences in fasta format.
163
164						Arguments:
165
166						1. Bio::DOOP::DBSQL object
167						2. file containing fasta sequences
168
169						$fuzznuc = Bio::DOOP::Util::Run::Fuzznuc->new_by_tmp($db,'/tmp/sequences.fasta');
170
171						=cut
172
173						sub new_by_tmp {
174	0		0	1		my $self = {};
175	0					my $dummy = shift;
176	0					my $db = shift;
177	0					my $tmp_filename = shift;
178
179	0					$self->{DB} = $db;
180	0					$self->{TMP_FILE} = $tmp_filename;
181
182	0					bless $self;
183	0					return($self);
184						}
185
186						=head2 get_tmp_file_name
187
188						Get the temporary file name.
189
190						$tempname = $fuzznuc->get_tmp_file_name;
191
192						=cut
193
194						sub get_tmp_file_name {
195	0		0	1		my $self = shift;
196	0					return($self->{TMP_FILE});
197						}
198
199						=head2 get_emboss_version
200
201						Get the installed emboss version.
202
203						$version = $fuzznuc->get_emboss_version;
204
205						=cut
206
207						sub get_emboss_version {
208	0		0	1		my $self = shift;
209	0					return($self->{EMBOSSVER});
210						}
211
212						=head2 run
213
214						Runs fuzznuc, returns 0 on success, otherwise -1.
215
216						Arguments :
217
218						1. query pattern
219						2. mismatch number
220						3. complement (0 or 1)
221
222						$fuzznuc_error = $fuzznuc->run('AACCAGGTT','1','1');
223
224						=cut
225
226						sub run {
227	0		0	1		my $self = shift;
228	0					my $pattern = shift;
229	0					my $mismatch = shift;
230	0					my $complement = shift;
231
232	0					my $file = $self->{TMP_FILE};
233
234	0					my @result = `fuzznuc $file -pattern='$pattern' -sformat=fasta -pmismatch=$mismatch -complement=$complement -stdout -auto`;
235
236	0					my $seq_id;
237						my $start;
238	0					my $end;
239	0					my $mism;
240	0					my $hitseq;
241	0					my @parsed;
242	0					my $strand;
243
244	0	0				if ($#result == -1) { return(-1) } #No results or an error happened.
	0
245	0					for my $line (@result){
246	0	0				if ($line =~ / Sequence: (\S+)/){
247	0					$seq_id = $1;
248						}
249	0	0				if ($line =~ /\s+(\d+)\s+(\d+)\s+(\w+)\s+([0123456789.]+)\s+(\w+)/){
250	0					$start = $1;
251	0					$end = $2;
252	0					$mism = $4;
253	0					$hitseq = $5;
254	0					$mism =~ s/\./0/;
255	0	0				$strand = $start < $end ? 1 : -1;
256	0					push @parsed, "$seq_id $start $end $mism $hitseq $strand";
257						}
258						}
259
260	0					$self->{RESULT} = \@parsed;
261	0					return(0);
262						}
263
264						=head2 run_background
265
266						Runs fuzznuc in background, returns the process id.
267
268						Arguments :
269
270						1. query pattern
271						2. mismatch number
272						3. complement (0 or 1)
273						4. output filename
274
275						$fuzznuc_pid = $fuzznuc->run_background('AACCAGGTT','1','1','/tmp/fuzznuc_result.txt');
276
277						=cut
278
279						sub run_background {
280	0		0	1		my $self = shift;
281	0					my $pattern = shift;
282	0					my $mismatch = shift;
283	0					my $complement = shift;
284	0					my $outfile = shift;
285	0					my $file = $self->{TMP_FILE};
286	0					my $pid;
287
288	0	0				unless($pid = fork){
289	0					`fuzznuc $file -pattern='$pattern' -pmismatch=$mismatch -sformat=fasta -complement=$complement -outfile=$outfile`;
290						}
291
292	0					return($pid);
293						}
294
295						=head2 get_raw_results
296
297						Returns an arrayref of arrays with the raw fuzznuc results, without Bio::DOOP objects.
298						This is much faster as it does not use the database.
299
300						The results contain the following:
301
302						1. sequence ID
303						2. hit start
304						3. hit end
305						4. mismatch number
306						5. hit sequence
307						6. hit strand
308
309						@result = @{$fuzznuc->get_raw_results};
310
311						=cut
312
313						sub get_raw_results {
314	0		0	1		my $self = shift;
315
316	0					my @fuzznuc_res;
317	0					my $res = $self->{RESULT};
318	0					my $seq_id;
319						my $start;
320	0					my $end;
321	0					my $mism;
322	0					my $hitseq;
323	0					my $strand;
324
325	0					for my $line (@{$res}){
	0
326	0					($seq_id,$start,$end,$mism,$hitseq,$strand) = split(/\s+/,$line);
327
328	0					push @fuzznuc_res,[$seq_id,$start,$end,$mism,$hitseq,$strand];
329						}
330
331	0					return(\@fuzznuc_res);
332						}
333
334						=head2 get_results
335
336						Returns an arrayref of arrays with sequence objects and other information of the results.
337
338						The results contain the following:
339
340						1. Bio::DOOP::Sequence object
341						2. hit start
342						3. hit end
343						4. mismatch number
344						5. hit sequence
345						6. hit strand
346
347						@result = @{$fuzznuc->get_results};
348
349						=cut
350
351						sub get_results {
352	0		0	1		my $self = shift;
353
354	0					my @fuzznuc_res;
355	0					my $res = $self->{RESULT};
356	0					my $seq_id;
357						my $start;
358	0					my $end;
359	0					my $mism;
360	0					my $hitseq;
361	0					my $strand;
362
363	0					for my $line (@{$res}){
	0
364	0					($seq_id,$start,$end,$mism,$hitseq,$strand) = split(/\s+/,$line);
365
366	0					my $seq = Bio::DOOP::Sequence->new_from_dbid($self->{DB},$seq_id);
367	0					push @fuzznuc_res,[$seq,$start,$end,$mism,$hitseq,$strand];
368						}
369
370	0					return(\@fuzznuc_res);
371						}
372
373						=head2 get_results_from_file
374
375						Returns an arrayref of arrays with sequence objects and other information of the results
376						from a results file. With this method you can fetch the results of different fuzznuc objects.
377
378						The results contain the following:
379
380						1. Bio::DOOP::Sequence object
381						2. hit start
382						3. hit end
383						4. mismatch number
384						5. hit sequence
385						6. hit strand
386
387						@result = @{$fuzznuc->get_results_from_file('/tmp/fuzznuc_result.txt')};
388
389						=cut
390
391						sub get_results_from_file {
392	0		0	1		my $self = shift;
393	0					my $filename = shift;
394
395	0					my $seq_id;
396						my $start;
397	0					my $end;
398	0					my $mism;
399	0					my $hitseq;
400	0					my @parsed;
401	0					my $strand;
402
403	0	0				open FILE, $filename or return(-1);
404	0					while(<FILE>){
405	0					chomp;
406	0					my $line = $_;
407	0	0				if ($line =~ / Sequence: (\S+)/){
408	0					$seq_id = $1;
409						}
410	0	0				if ($line =~ /\s+(\d+)\s+(\d+)\s+(\w+)\s+([0123456789.]+)\s+(\w+)/){
411	0					$start = $1;
412	0					$end = $2;
413	0					$mism = $4;
414	0					$hitseq = $5;
415	0					$mism =~ s/\./0/;
416	0	0				$strand = $start < $end ? 1 : -1;
417	0					my $seq = Bio::DOOP::Sequence->new($self->{DB},$seq_id);
418	0					push @parsed, [$seq,$start,$end,$mism,$hitseq,$strand];
419						}
420						}
421	0					close FILE;
422
423	0					$self->{RESULT} = \@parsed;
424	0					return(\@parsed);
425						}
426
427						1;