File Coverage

blib/lib/Bio/Tools/Run/TribeMCL.pm
Criterion Covered Total %
statement 69 440 15.6
branch 13 174 7.4
condition 3 44 6.8
subroutine 15 30 50.0
pod 4 5 80.0
total 104 693 15.0


line stmt bran cond sub pod time code
1             # BioPerl module for TribeMCL
2             #
3             # Please direct questions and support issues to
4             #
5             # Cared for by Shawn Hoon
6             #
7             # Copyright Shawn Hoon
8             #
9             # You may distribute this module under the same terms as perl itself
10              
11             # POD documentation - main docs before the code
12              
13             =head1 NAME
14              
15             Bio::Tools::Run::TribeMCL
16              
17             =head1 SYNOPSIS
18              
19             use Bio::Tools::Run::TribeMCL;
20             use Bio::SearchIO;
21              
22             # 3 methods to input the blast results
23              
24             # straight forward raw blast output (NCBI or WU-BLAST)
25             my @params = ('inputtype'=>'blastfile');
26              
27             # OR
28              
29             # markov program format
30             # protein_id1 protein_id2 evalue_magnitude evalue_factor
31             # for example:
32             # proteins ENSP00000257547 and ENSP00000261659
33             # with a blast score evalue of 1e-50
34             # and proteins O42187 and ENSP00000257547
35             # with a blast score evalue of 1e-119
36             # entry would be
37              
38             my @array = [[qw(ENSP00000257547 ENSP00000261659 1 50)],
39             [qw(O42187 ENSP00000257547 1 119)]];
40             my @params = ('pairs'=>\@array,I=>'2.0');
41              
42             # OR
43              
44             # pass in a searchio object
45             # slowest of the 3 methods as it does more rigourous parsing
46             # than required for us here
47              
48             my $sio = Bio::SearchIO->new(-format=>'blast',
49             -file=>'blast.out');
50             my @params=('inputtype'=>'searchio',I=>'2.0');
51              
52              
53             # you can specify the path to the executable manually in the following way
54             my @params=('inputtype'=>'blastfile',I=>'2.0',
55             'mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl',
56             'matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix');
57             my $fact = Bio::Tools::Run::TribeMCL->new(@params);
58              
59             # OR
60              
61             $fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix');
62             $fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl');
63              
64             # to run
65              
66             my $fact = Bio::Tools::Run::TribeMCL->new(@params);
67              
68             # Run the program
69             # returns an array reference to clusters where members are the ids
70             # for example :2 clusters with 3 members per cluster:
71             # $fam = [ [mem1 mem2 mem3],[mem1 mem2 mem3]]
72              
73             # pass in either the blastfile path/searchio obj/the array ref to scores
74             my $fam = $fact->run($sio);
75              
76             # print out your clusters
77              
78             for (my $i = 0; $i
79             print "Cluster $i \t ".scalar(@{$fam->[$i]})." members\n";
80             foreach my $member (@{$fam->[$i]}){
81             print "\t$member\n";
82             }
83             }
84              
85             =head1 DESCRIPTION
86              
87             TribeMCL is a method for clustering proteins into related groups,
88             which are termed 'protein families'. This clustering is achieved by
89             analysing similarity patterns between proteins in a given dataset, and
90             using these patterns to assign proteins into related groups. In many
91             cases, proteins in the same protein family will have similar
92             functional properties.
93              
94             TribeMCL uses a novel clustering method (Markov Clustering or MCL)
95             which solves problems which normally hinder protein sequence
96             clustering.
97              
98             Reference:
99              
100             Enright A.J., Van Dongen S., Ouzounis C.A; Nucleic Acids
101             Res. 30(7):1575-1584 (2002)
102              
103             You will need tribe-matrix (the program used to generate the matrix
104             for input into mcl) and mcl (the clustering software) available at:
105              
106             http://www.ebi.ac.uk/research/cgg/tribe/ or
107             http://micans.org/mcl/.
108              
109             Future Work in this module: Port the tribe-matrix program into perl so
110             that we can enable have a SearchIO kinda module for reading and
111             writing mcl data format
112              
113             =head1 FEEDBACK
114              
115             =head2 Mailing Lists
116              
117             User feedback is an integral part of the evolution of this and other
118             Bioperl modules. Send your comments and suggestions preferably to one
119             of the Bioperl mailing lists. Your participation is much appreciated.
120              
121             bioperl-l@bioperl.org - General discussion
122             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
123              
124             =head2 Support
125              
126             Please direct usage questions or support issues to the mailing list:
127              
128             I
129              
130             rather than to the module maintainer directly. Many experienced and
131             reponsive experts will be able look at the problem and quickly
132             address it. Please include a thorough description of the problem
133             with code and data examples if at all possible.
134              
135             =head2 Reporting Bugs
136              
137             Report bugs to the Bioperl bug tracking system to help us keep track
138             the bugs and their resolution. Bug reports can be submitted via the
139             web:
140              
141             http://redmine.open-bio.org/projects/bioperl/
142              
143             =head1 AUTHOR - Shawn Hoon
144              
145             Email shawnh@fugu-sg.org
146              
147             =head1 APPENDIX
148              
149             The rest of the documentation details each of the object
150             methods. Internal methods are usually preceded with a "_".
151              
152             =cut
153              
154              
155             # Let the code begin...
156             package Bio::Tools::Run::TribeMCL;
157 1         100 use vars qw($AUTOLOAD @ISA $PROGRAMDIR
158             @TRIBEMCL_PARAMS @MATRIXPROGRAM_PARAMS @MCLPROGRAM_PARAMS
159             @OTHER_SWITCHES %OK_FIELD
160             $MATRIXPROGRAM_NAME $MCLPROGRAM_NAME
161             $MCLPROGRAM $MATRIXPROGRAM
162 1     1   112194 );
  1         2  
163 1     1   5 use strict;
  1         2  
  1         18  
164              
165 1     1   469 use Bio::SeqIO;
  1         53357  
  1         48  
166 1     1   15 use Bio::Root::Root;
  1         3  
  1         32  
167 1     1   9 use Bio::Root::IO;
  1         3  
  1         34  
168 1     1   453 use Bio::Cluster::SequenceFamily;
  1         2826  
  1         42  
169 1     1   418 use Bio::Factory::ApplicationFactoryI;
  1         173  
  1         35  
170 1     1   433 use Bio::Tools::Run::WrapperBase;
  1         3  
  1         25  
171 1     1   269 use Bio::Annotation::DBLink;
  1         1757  
  1         25  
172 1     1   382 use Bio::Seq;
  1         24166  
  1         33  
173 1     1   320 use Bio::Species;
  1         14876  
  1         164  
174              
175             @ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
176              
177             # You will need to enable mcl and tribe-matrix to use this wrapper. This
178             # can be done in (at least) two ways:
179             #
180             # 1. define an environmental variable TRIBEDIR
181             # export TRIBEDIR =/usr/local/share/mclbin/
182             # where the tribe-matrix and mcl programs are located.
183             #you probably need to copy them individually to the same directory
184             #if the installation puts them in different directories. or simply put them in
185             #your standard /usr/local/bin
186             #
187             # 2. include a definition of an environmental variable TRIBEDIR in
188             # every script that will use TRIBEMCL.pm
189             # $ENV{TRIBEDIR} = '/usr/local/share/mclbin/;
190             #
191             #3. Manually set the path to the executabes in your code:
192             #
193              
194             #my @params=('inputtype'=>'blastfile',I=>'2.0','
195             # mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl','
196             # matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix');
197             #my $fact = Bio::Tools::Run::TribeMCL->new(@params);
198              
199             #or
200             #$fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix');
201             #$fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl');
202              
203              
204             BEGIN {
205 1     1   3 $MATRIXPROGRAM_NAME = 'tribe-matrix';
206 1         3 $MCLPROGRAM_NAME = 'mcl';
207 1 50       6 if (defined $ENV{TRIBEDIR}) {
208 0   0     0 $PROGRAMDIR = $ENV{TRIBEDIR} || '';
209 0 0       0 $MCLPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MCLPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':''));
210 0 0       0 $MATRIXPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MATRIXPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':''));
211             }
212              
213 1         4 @TRIBEMCL_PARAMS = qw(inputtype hsp hit scorefile blastfile description_file searchio pairs mcl matrix weight description family_tag use_db);
214              
215 1         2 @MATRIXPROGRAM_PARAMS = qw(ind out chunk);
216 1         3 @MCLPROGRAM_PARAMS = qw(I t P R pct o);
217              
218 1         2 @OTHER_SWITCHES = qw(verbose quiet);
219              
220             # Authorize attribute fields
221 1         3 foreach my $attr (@TRIBEMCL_PARAMS, @MATRIXPROGRAM_PARAMS,
222             @MCLPROGRAM_PARAMS, @OTHER_SWITCHES) {
223 25         3639 $OK_FIELD{$attr}++;
224             }
225             }
226              
227             sub new {
228 1     1 1 113 my ($class, @args) = @_;
229 1         14 my $self = $class->SUPER::new(@args);
230            
231 1         45 my ($attr, $value);
232 1         13 while (@args) {
233 2         5 $attr = shift @args;
234 2         7 $value = shift @args;
235 2 50       8 next if( $attr =~ /^-/ ); # don't want named parameters
236 2 50       5 if ($attr =~/MCL/i) {
237 0         0 $self->mcl_executable($value);
238 0         0 next;
239             }
240 2 50       6 if ($attr =~ /MATRIX/i){
241 0         0 $self->matrix_executable($value);
242 0         0 next;
243             }
244 2         15 $self->$attr($value);
245             }
246 1 50       6 defined($self->weight) || $self->weight(200);
247              
248 1         3 return $self;
249             }
250              
251             sub AUTOLOAD {
252 5     5   24 my $self = shift;
253 5         6 my $attr = $AUTOLOAD;
254 5         16 $attr =~ s/.*:://;
255 5 50       13 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
256 5 100       10 $self->{$attr} = shift if @_;
257 5         15 return $self->{$attr};
258             }
259              
260             =head2 mcl_executable
261              
262             Title : mcl_executable
263             Usage : $self->mcl_executable()
264             Function: get set for path to mcl executable
265             Returns : String or undef if not installed
266             Args : [optional] string of path to executable
267             [optional] boolean to warn on missing executable status
268              
269             =cut
270              
271             sub mcl_executable{
272 0     0 1 0 my ($self,$exe,$warn) = @_;
273            
274 0 0       0 if( defined $exe ) {
275 0         0 $self->{'_mcl_exe'} = $exe;
276             }
277 0 0       0 unless( defined $self->{'_mcl_exe'} ) {
278             # this would be the name set in the BEGIN block
279 0 0 0     0 if( $MCLPROGRAM && -e $MCLPROGRAM && -x $MCLPROGRAM ) {
      0        
280 0         0 $self->{'_mcl_exe'} = $MCLPROGRAM;
281             } else {
282 0         0 my $exe;
283 0 0 0     0 if( ( $exe = $self->io->exists_exe($MCLPROGRAM_NAME) ) &&
284             -x $exe ) {
285 0         0 $self->{'_mcl_exe'} = $exe;
286             } else {
287 0 0       0 $self->warn("Cannot find executable for $MCLPROGRAM_NAME") if $warn;
288 0         0 $self->{'_mcl_exe'} = undef;
289             }
290             }
291             }
292 0         0 $self->{'_mcl_exe'};
293             }
294              
295             =head2 matrix_executable
296              
297             Title : matrix_executable
298             Usage : $self->matrix_executable()
299             Function: get set for path to tribe-matrix executable
300             Returns : String or undef if not installed
301             Args : [optional] string of path to executable
302             [optional] boolean to warn on missing executable status
303              
304             =cut
305              
306             sub matrix_executable{
307 1     1 1 841 my ($self,$exe,$warn) = @_;
308            
309 1 50       4 if( defined $exe ) {
310 0         0 $self->{'_matrix_exe'} = $exe;
311             }
312 1 50       4 unless( defined $self->{'_matrix_exe'} ) {
313             # this would be the name set in the BEGIN block
314 1 50 33     5 if( $MATRIXPROGRAM && -e $MATRIXPROGRAM && -x $MATRIXPROGRAM ) {
      33        
315 0         0 $self->{'_matrix_exe'} = $MATRIXPROGRAM;
316             } else {
317 1         2 my $exe;
318 1 50 33     10 if( ( $exe = $self->io->exists_exe($MATRIXPROGRAM_NAME) ) &&
319             -x $exe ) {
320 0         0 $self->{'_matrix_exe'} = $exe;
321             } else {
322 1 50       292 $self->warn("Cannot find executable for $MATRIXPROGRAM_NAME")
323             if $warn;
324 1         3 $self->{'_matrix_exe'} = undef;
325             }
326             }
327             }
328 1         3 $self->{'_matrix_exe'};
329             }
330              
331             =head2 run
332              
333             Title : run
334             Usage : $self->run()
335             Function: runs the clustering
336             Returns : Array Ref of clustered Ids
337             Args :
338              
339             =cut
340              
341             sub run {
342 0     0 1   my ($self,$input) = @_;
343 0 0         if($self->description_file){
344 0           $self->_setup_description($self->description_file);
345             }
346 0           my $file = $self->_setup_input($input);
347 0 0         defined($file) || $self->throw("Error setting up input ");
348             #run tribe-matrix to generate matrix for mcl
349 0           my ($index_file, $mcl_infile) = $self->_run_matrix($file);
350 0 0         $self->throw("tribe-matrix not run properly as index file is missing")
351             unless (-e $index_file);
352 0 0         $self->throw("tribe-matrix not run properly as matrix file is missing")
353             unless (-e $mcl_infile);
354             #run mcl
355 0           my $clusters = $self->_run_mcl($index_file,$mcl_infile);
356 0           my $families;
357 0 0         if($self->description){
358 0           my %consensus = $self->_consensifier($clusters);
359 0           $families = $self->_generate_families($clusters,\%consensus);
360             }
361             else {
362 0           $families = $self->_generate_families($clusters);
363             }
364              
365 0           return @{$families};
  0            
366             }
367              
368             sub _generate_families {
369 0     0     my ($self,$clusters,$consensus) = @_;
370 0   0       my $family_tag = $self->family_tag || "TribeFamily";
371 0           my @fam;
372 0 0         if($consensus){
373 0           my %description = %{$self->description};
  0            
374 0           my %consensus = %{$consensus};
  0            
375 0           for(my $i = 0; $i < scalar(@{$clusters}); $i++){
  0            
376 0           my @mem;
377 0           foreach my $member (@{$clusters->[$i]}){
  0            
378             my $mem = Bio::Seq->new(-id=>$member,
379             -alphabet=>"protein",
380 0           -desc=>$description{$member}->[1]);
381 0           my $annot = Bio::Annotation::DBLink->new(-database=>$description{$member}->[0],
382             -primary_id=>$member);
383              
384 0           $mem->annotation->add_Annotation('dblink',$annot);
385              
386             #store species information
387 0           my $taxon_str = $description{$member}->[2];
388             #parse taxon info into hash
389 0           my %taxon;
390 0 0         $taxon_str=~s/=;/=undef;/g if $taxon_str;
391 0 0         %taxon = map{split '=',$_}split';',$taxon_str if $taxon_str;
  0            
392 0           my $name = $taxon{'taxon_common_name'};
393 0 0         my @classification = $taxon{'taxon_classification'} ? split(':',$taxon{'taxon_classification'}) : ();
394 0           my $tax_id = $taxon{'taxon_id'};
395 0           my $sub_species = $taxon{'taxon_sub_species'};
396              
397 0           my $species = Bio::Species->new();
398 0 0         $species->common_name($name) if $name; #*** should this actually be scientific_name() ?
399 0 0         $species->sub_species($sub_species) if $sub_species;
400 0 0         $species->ncbi_taxid($tax_id) if $tax_id;
401 0 0         $species->classification(@classification) if @classification;
402 0           $mem->species($species);
403              
404 0           push @mem, $mem;
405             }
406 0           my $id = $family_tag."_".$i;
407             my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id,
408             -description=>$consensus{$i}{desc},
409             -annotation_score=>$consensus{$i}{conf},
410 0           -members=>\@mem);
411 0           push @fam, $fam;
412             }
413 0           return \@fam;
414             }
415             else {
416 0           for(my $i = 0; $i < scalar(@{$clusters}); $i++){
  0            
417 0           my @mem;
418 0           foreach my $member (@{$clusters->[$i]}){
  0            
419 0           my $mem = Bio::Seq->new(-id=>$member,
420             -alphabet=>"protein");
421 0           push @mem, $mem;
422             }
423 0           my $id = $family_tag."_".$i;
424 0           my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id,
425             -members=>\@mem);
426 0           push @fam, $fam;
427             }
428 0           return \@fam;
429             }
430              
431             }
432              
433              
434             sub _consensifier {
435 0     0     my ($self,$clusters) = @_;
436 0           eval {
437 0           require "Algorithm/Diff.pm";
438             };
439 0 0         if($@){
440 0           $self->warn("Algorith::Diff is needed to run TribeMCL");
441 0           return undef;
442             }
443 0           my %description = %{$self->description};
  0            
444 0           my %consensus;
445             my $best_annotation;
446 0           my %use_db;
447 0 0         if($self->use_db){
448 0           foreach my $key(split(',',$self->use_db)){
449 0           $use_db{$key}++;
450             }
451             }
452             CLUSTER:
453 0           for(my $i = 0; $i < scalar(@{$clusters}); $i++){
  0            
454 0           my @desc;
455             my @orig_desc;
456 0           my $total_members = scalar(@{$clusters->[$i]});
  0            
457              
458 0           foreach my $member(@{$clusters->[$i]}){
  0            
459             #if specify which dbs to use for consensifying
460 0 0         if($self->use_db){
461 0 0         if($use_db{$description{$member}->[0]}){
462 0 0         push @desc, $description{$member}->[1] if $description{$member}->[1];
463 0 0         push @orig_desc, $description{$member}->[1] if $description{$member}->[1];
464             }
465             }
466             else {
467 0 0         push @desc, $description{$member}->[1] if $description{$member}->[1];
468 0 0         push @orig_desc, $description{$member}->[1] if $description{$member}->[1];
469             }
470              
471             }
472 0 0         if($#desc < 0){ #truly unknown
473 0           $consensus{$i}{desc} = "UNKNOWN";
474 0           $consensus{$i}{conf} = 0;
475 0           next CLUSTER;
476             }
477 0 0         if($#desc == 0){#only a single description
478 0           $consensus{$i}{desc} = grep(/S+/,@desc);
479 0   0       $consensus{$i}{desc} = $consensus{$i}{desc} || "UNKNOWN";
480 0 0         if ($consensus{$i}{desc} eq "UNKNOWN") {
481 0           $consensus{$i}{conf} = 0;
482             } else {
483 0           $consensus{$i}{conf} = 100 * int(1/$total_members);
484             }
485 0           next CLUSTER;
486             }
487              
488             #all the same desc
489 0           my %desc = ();
490 0           foreach my $desc (@desc) { $desc{$desc}++; }
  0            
491 0 0         if ( (keys %desc) == 1 ) {
492 0           my ($best_annotation,) = keys %desc;
493 0           my $n = grep($_ eq $best_annotation, @desc);
494 0           my $perc= int( 100*($n/$total_members) );
495 0           $consensus{$i}{desc} = $best_annotation;
496 0           $consensus{$i}{conf} = $perc;
497 0           next CLUSTER;
498             }
499            
500 0           my %lcshash = ();
501 0           my %lcnext = ();
502 0           while (@desc) {
503             # do an all-against-all LCS (longest commong substring) of the
504             # descriptions of all members; take the resulting strings, and
505             # again do an all-against-all LCS on them, until we have nothing
506             # left. The LCS's found along the way are in lcshash.
507             #
508             # Incidentally, longest common substring is a misnomer, since it
509             # is not guaranteed to occur in either of the original strings. It
510             # is more like the common parts of a Unix diff ...
511 0           for (my $i=0;$i<@desc;$i++) {
512 0           for (my $j=$i+1;$j<@desc;$j++){
513 0           my @list1=split(" ",$desc[$i]);
514 0           my @list2=split(" ",$desc[$j]);
515 0           my @lcs=Algorithm::Diff::LCS(\@list1,\@list2);
516 0           my $lcs=join(" ",@lcs);
517 0           $lcshash{$lcs}=1;
518 0           $lcnext{$lcs}=1;
519             }
520             }
521 0           @desc=keys(%lcnext);
522 0           undef %lcnext;
523             }
524 0           my ($best_score, $best_perc)=(0, 0);
525 0           my @all_cands=sort {length($b) <=>length($a)} keys %lcshash ;
  0            
526 0           foreach my $candidate_consensus (@all_cands) {
527 0           my @temp=split(" ",$candidate_consensus);
528 0           my $length=@temp; # num of words in annotation
529              
530             # see how many members of cluster contain this LCS:
531              
532 0           my ($lcs_count)=0;
533 0           foreach my $orig_desc (@orig_desc) {
534 0           my @list1=split(" ",$candidate_consensus);
535 0           my @list2=split(" ",$orig_desc);
536 0           my @lcs=Algorithm::Diff::LCS(\@list1,\@list2);
537 0           my $lcs=join(" ",@lcs);
538              
539 0 0 0       if ($lcs eq $candidate_consensus ||
540             index($orig_desc,$candidate_consensus) != -1 # addition;
541             # many good (single word) annotations fall out otherwise
542             ) {
543 0           $lcs_count++;
544              
545             # Following is occurs frequently, as LCS is _not_ the longest
546             # common substring ... so we can't use the shortcut either
547              
548             # if ( index($orig_desc,$candidate_consensus) == -1 ) {
549             # warn "lcs:'$lcs' eq cons:'$candidate_consensus' and
550             # orig:'$orig_desc', but index == -1\n"
551             # }
552             }
553             }
554 0           my $perc_with_desc=(($lcs_count/$total_members))*100;
555 0           my $perc=($lcs_count/$total_members)*100;
556 0           my $score=$perc + ($length*14); # take length into account as well
557 0 0         $score = 0 if $length==0;
558 0 0 0       if (($perc_with_desc >= 40) && ($length >= 1)) {
559 0 0         if ($score > $best_score) {
560 0           $best_score=$score;
561 0           $best_perc=$perc;
562 0           $best_annotation=$candidate_consensus;
563             }
564             }
565             }
566 0 0 0       if ($best_perc==0 || $best_perc >= 100 ) {
567 0           $best_perc='NA';
568             }
569 0 0         if ($best_annotation eq '') {
570 0           $best_annotation = 'AMBIGUOUS';
571             }
572 0           $consensus{$i}{desc} = $best_annotation;
573 0           $consensus{$i}{conf} = $best_perc;
574             }
575 0           return %consensus;
576             }
577              
578             sub _setup_description {
579 0     0     my ($self,$file) = @_;
580 0           my $goners='().-';
581 0           my $spaces= ' ' x length($goners);
582 0           my $filter = "tr '$goners' '$spaces' < $file";
583 0 0         open (FILE,"$filter | ") || die "$filter: $!";
584 0           my %description;
585 0           while(){
586 0           chomp;
587 0           my ($db,$acc,$description,$taxon_str) = split("\t",$_);
588 0 0         $description || $self->throw("Wrongly formated description file");
589 0           $description = $self->_apply_edits($description);
590              
591 0 0         if($description{$acc}){
592 0           $self->warn("Duplicated entry $acc in description file, overwriting..");
593             }
594 0           $description{$acc} = [$db,$description,$taxon_str];
595             }
596 0           $self->description(\%description);
597             }
598              
599             sub as_words {
600             #add ^ and $ to regexp
601 0     0 0   my (@words);
602 0           my @newwords=();
603              
604 0           foreach my $word (@words) { push @newwords, "^$word\$" };
  0            
605             }
606              
607             sub _apply_edits {
608 0     0     my ($self,$desc) = @_;
609 0           my @deletes = ( 'FOR\$', 'SIMILAR TO\$', 'SIMILAR TO PROTEIN\$',
610             'RIKEN.*FULL.*LENGTH.*ENRICHED.*LIBRARY',
611             '\w*\d{4,}','HYPOTHETICAL PROTEIN'
612             );
613 0           my @newwords = &as_words(qw(NOVEL PUTATIVE PREDICTED
614             UNNAMED UNNMAED ORF CLONE MRNA
615             CDNA EST RIKEN FIS KIAA\d+ \S+RIK IMAGE HSPC\d+
616             FOR HYPOTETICAL HYPOTHETICAL));
617 0           push @deletes, @newwords;
618              
619 0           foreach my $re ( @deletes ) { $desc=~s/$re//g; }
  0            
620              
621             #Apply some fixes to the annotation:
622 0           $desc=~s/EC (\d+) (\d+) (\d+) (\d+)/EC $1.$2.$3.$4/;
623 0           $desc=~s/EC (\d+) (\d+) (\d+)/EC $1.$2.$3.-/;
624 0           $desc=~s/EC (\d+) (\d+)/EC $1\.$2.-.-/;
625 0           $desc=~s/(\d+) (\d+) KDA/$1.$2 KDA/;
626 0           return $desc;
627             }
628              
629             =head2 _run_mcl
630              
631             Title : _run_mcl
632             Usage : $self->_run_mcl()
633             Function: internal function for running the mcl program
634             Returns : Array Ref of clustered Ids
635             Args : Index_file name, matrix input file name
636              
637             =cut
638              
639             sub _run_mcl {
640 0     0     my ($self,$ind_file,$infile) = @_;
641 0   0       my $exe = $self->mcl_executable || $self->throw("mcl not found.");
642 0           my $cmd = $exe . " $infile";
643 0 0         unless (defined $self->o) {
644 0           my ($fh,$o) = $self->io->tempfile(-dir=>$self->tempdir);
645 0           $self->o($o);
646             # file handle not use later so deleted
647 0           close($fh);
648 0           undef $fh;
649             }
650 0 0         unless (defined $self->I) {
651 0           $self->I(3.0);
652             }
653              
654 0           foreach my $param (@MCLPROGRAM_PARAMS) {
655 0 0         if (defined $self->$param) {
656 0           $cmd .= " -$param ".$self->$param;
657             }
658             }
659 0 0 0       if($self->quiet ||
660             ($self->verbose < 0)){
661 0           $cmd .= " -V all";
662 0 0         if( $^O !~ /Mac/) {
663 0 0         my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
664 0           $cmd .= " 2> $null";
665             }
666             }
667              
668 0           my $status = system($cmd);
669              
670 0 0         $self->throw( "mcl call ($cmd) crashed: $? \n") unless $status==0;
671 0           my $families = $self->_parse_mcl($ind_file,$self->o);
672 0           return $families;
673             }
674              
675             =head2 _run_matrix
676              
677             Title : _run_matrix
678             Usage : $self->_run_matrix()
679             Function: internal function for running the tribe-matrix program
680             Returns : index filepath and matrix file path
681             Args : filepath of parsed ids and scores
682              
683             =cut
684              
685             sub _run_matrix {
686 0     0     my ($self,$parse_file) = @_;
687 0   0       my $exe = $self->matrix_executable || $self->throw("tribe-matrix not found.");
688 0           my $cmd = $exe . " $parse_file";
689 0 0         unless (defined $self->ind) {
690 0           my ($fh,$indexfile) = $self->io->tempfile(-dir=>$self->tempdir);
691 0           $self->ind($indexfile);
692             # file handle not use later so deleted
693 0           close($fh);
694 0           undef $fh;
695             }
696 0 0         unless (defined $self->out) {
697 0           my ($fh,$matrixfile) = $self->io->tempfile(-dir=>$self->tempdir);
698 0           $self->out($matrixfile);
699             # file handle not use later so deleted
700 0           close($fh);
701 0           undef $fh;
702             }
703 0           foreach my $param (@MATRIXPROGRAM_PARAMS) {
704 0 0         if (defined $self->$param) {
705 0           $cmd .= " -$param ".$self->$param;
706             }
707             }
708 0 0         my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
709 0           $cmd .= " > $null";
710 0           my $status = system($cmd);
711              
712 0 0         $self->throw( "tribe-matrix call ($cmd) crashed: $? \n") unless $status==0;
713              
714 0           return ($self->ind,$self->out);
715             }
716              
717             =head2 _setup_input
718              
719             Title : _setup_input
720             Usage : $self->_setup_input()
721             Function: internal function for running setting up the inputs
722             needed for running mcl
723             Returns : filepath of parsed ids and scores
724             Args :
725              
726             =cut
727              
728             sub _setup_input {
729 0     0     my ($self,$input) = @_;
730 0           my ($type,$rc);
731              
732 0           my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir);
733              
734 0           $type = $self->inputtype();
735 0 0         if($type=~/scorefile/i){
736 0 0         -e $self->scorefile ||
737             $self->throw("Scorefile doesn't seem to exist or accessible");
738 0           return $self->scorefile;
739             }
740 0 0         if($type =~/blastfile/i){
    0          
    0          
    0          
    0          
741 0           $self->blastfile($input);
742 0           $rc = $self->_parse_blastfile($self->blastfile,$tfh);
743             }
744             elsif($type=~/searchio/i){
745 0           $self->searchio($input);
746 0           $rc = $self->_get_from_searchio($self->searchio,$tfh);
747             }
748             elsif($type=~/pairs/i) {
749 0           $self->pairs($input);
750 0           for my $line (@{ $self->pairs }){
  0            
751 0           print $tfh join("\t",@{$line}), "\n";
  0            
752 0           $rc++;
753             }
754             }
755             elsif($type =~/hsp/i) {
756 0           $self->hsp($input);
757 0           $rc = $self->_get_from_hsp($self->hsp,$tfh);
758             }
759             elsif($type=~/hit/i){
760 0           $self->hit($input);
761 0           $rc = $self->_get_from_hit($self->hit,$tfh);
762             }
763             else {
764 0           $self->throw("Must set inputtype to either blastfile,searchio or ".
765             "paris using \$fact->blastfile |\$fact->searchio| \$fact->pairs");
766             }
767 0 0         unless ( $rc ) {
768 0           $self->throw("Need inputs for running tribe mcl, nothing provided");
769             }
770 0           close($tfh);
771 0           $tfh= undef;
772 0           return $outfile;
773             }
774              
775             =head2 _get_from_hsp
776              
777             Title : _get_from_hsp
778             Usage : $self->_get_from_hsp()
779             Function: internal function for getting blast scores from hsp
780             Returns : array ref to ids and score [protein1 protein2 magnitude factor]
781             Args : L
782              
783             =cut
784              
785             sub _get_from_hsp {
786 0     0     my ($self,$hsp,$tfh) = @_;
787 0           my @array;
788             my $count;
789 0           foreach my $pair (@{$hsp}){
  0            
790 0           my $sig = $pair->score;
791 0           $sig =~ s/^e-/1e-/g;
792 0           my $expect=sprintf("%e",$sig);
793 0 0         if ($expect==0){
794 0           my $wt = $self->weight;
795 0           $expect=sprintf("%e","1e-$wt");
796             }
797 0           my $first=(split("e-",$expect))[0];
798 0           my $second=(split("e-",$expect))[1];
799              
800 0           print $tfh join("\t", $pair->feature1->seq_id,
801             $pair->feature2->seq_id,int($first),
802             int($second) ), "\n";
803 0           $count++;
804             }
805 0           return $count;
806             }
807              
808             sub _get_from_hit {
809 0     0     my ($self,$hit,$tfh) = @_;
810 0           my $count;
811 0           foreach my $pair(@{$hit}){
  0            
812 0           my $sig = $pair->raw_score;
813 0           $sig =~s/^e-/1e-/g;
814 0           my $expect = sprintf("%e",$sig);
815 0 0         if ($expect==0){
816 0           my $wt = $self->weight;
817 0           $expect=sprintf("%e","1e-$wt");
818             }
819 0           my $first=(split("e-",$expect))[0];
820 0           my $second=(split("e-",$expect))[1];
821 0           print $tfh join("\t",$pair->name,$pair->description,int($first),int($second)),"\n";
822 0           $count++;
823             }
824 0           return $count;
825             }
826              
827              
828             =head2 _get_from_searchio
829              
830             Title : _get_from_searchio
831             Usage : $self->_get_from_searchio()
832             Function: internal function for parsing blast scores from searchio object
833             Returns : array ref to ids and score [protein1 protein2 magnitude factor]
834             Args : L
835              
836             =cut
837              
838             sub _get_from_searchio {
839 0     0     my ($self,$sio,$tfh) = @_;
840 0           my @array;
841             my $count;
842 0           while( my $result = $sio->next_result ) {
843 0           while( my $hit = $result->next_hit ) {
844 0           while( my $hsp = $hit->next_hsp ) {
845 0           my $sig = $hsp->evalue;
846 0           $sig =~ s/^e-/1e-/g;
847 0           my $expect=sprintf("%e",$sig);
848 0 0         if ($expect==0){
849 0           my $wt = $self->weight;
850 0           $expect=sprintf("%e","1e-$wt");
851             }
852 0           my $first=(split("e-",$expect))[0];
853 0           my $second=(split("e-",$expect))[1];
854 0           print $tfh join("\t",
855             $hsp->feature1->seq_id,
856             $hsp->feature2->seq_id,
857             int($first),
858             int($second) ), "\n";
859              
860 0           $count++;
861             }
862             }
863             }
864 0           return $count;
865             }
866              
867             =head2 _parse_blastfile
868              
869             Title : _parse_blastfile
870             Usage : $self->_parse_blastfile()
871             Function: internal function for quickly parsing blast evalue
872             scores from raw blast output file
873             Returns : array ref to ids and score [protein1 protein2 magnitude factor]
874             Args : file path
875              
876             =cut
877              
878             sub _parse_blastfile {
879 0     0     my ($self, $file,$tfh) = @_;
880 0 0         open(FILE,$file) || $self->throw("Cannot open Blast Output File");
881 0           my ($query,$reference,$first,$second);
882 0           my @array;
883 0           my $count;
884 0           my $weight = $self->weight;
885 0           while(){
886 0 0         if(/Query=\s+(\S+)/){
887 0           $query = $1;
888             }
889 0 0         if(/^>(\S+)/){
890 0           $reference = $1;
891             }
892 0 0         if (/Expect = (\S+)/){
893 0           my $raw=$1;
894 0           $raw=~s/^e-/1e-/g;
895 0           my $expect=sprintf("%e",$raw);
896 0 0         if ($expect==0){
897 0           $expect=sprintf("%e","1e-$weight");
898             }
899 0           $first=(split("e-",$expect))[0];
900 0           $second=(split("e-",$expect))[1];
901 0           print $tfh join("\t", $query,
902             $reference,
903             int($first),
904             int($second)), "\n";
905 0           $count++;
906             }
907             }
908 0           return $count;
909             }
910              
911             =head2 _parse_mcl
912              
913             Title : _parse_mcl
914             Usage : $self->_parse_mcl()
915             Function: internal function for quickly parsing mcl output and
916             generating the array of clusters
917             Returns : Array Ref of clustered Ids
918             Args : index file path, mcl output file path
919              
920             =cut
921              
922             sub _parse_mcl {
923 0     0     my ($self,$ind,$mcl) = @_;
924 0 0         open (MCL,$mcl) || $self->throw("Error, cannot open $mcl for parsing");
925 0           my $i =-1;
926 0           my $start;
927 0           my (@cluster,@out);
928 0           while() {
929 0 0         if ($start) {
930 0           chomp($_);
931 0           $cluster[$i] = join(" ",$cluster[$i],"$_");
932             }
933 0 0         if(/^\d+/){
934 0           $start = 1;
935 0           $i++;
936 0   0       $cluster[$i] = join(" ",$cluster[$i] || '',"$_");
937             }
938 0 0         if (/\$$/){
939 0           $start = 0;
940             }
941 0 0         last if /^\(mclruninfo/;
942             }
943 0 0         open (IND,$ind) || $self->throw("Cannot open $ind for parsing");
944 0           my %hash;
945 0           while(){
946 0           /^(\S+)\s+(\S+)/;
947 0           $hash{$1}=$2;
948             }
949              
950 0           for (my $j=0;$j<$i+1;$j++) {
951 0           my @array=split(" ",$cluster[$j]);
952 0           for (my $p=1;$p<$#array;$p++){
953 0           push @{$out[$array[0]]}, $hash{$array[$p]};
  0            
954             }
955             }
956 0           return \@out;
957             }
958              
959              
960             1;