File Coverage

Bio/Matrix/PSM/IO/mast.pm
Criterion Covered Total %
statement 136 151 90.0
branch 50 62 80.6
condition 3 3 100.0
subroutine 8 9 88.8
pod 3 3 100.0
total 200 228 87.7


line stmt bran cond sub pod time code
1              
2             =head1 NAME
3              
4             Bio::Matrix::PSM::IO::mast - PSM mast parser implementation
5              
6             =head1 SYNOPSIS
7              
8             See Bio::Matrix::PSM::IO for detailed documentation on how to
9             use PSM parsers
10              
11             =head1 DESCRIPTION
12              
13             Parser for mast. This driver unlike meme or transfac for example is
14             dedicated more to PSM sequence matches, than to PSM themselves.
15              
16             =head1 TO DO
17              
18             Section III should be parsed too, otherwise no real sequence is
19             available, so we supply 'NNNNN....' as a seq which is not right.
20              
21             =head1 FEEDBACK
22              
23             =head2 Mailing Lists
24              
25             User feedback is an integral part of the evolution of this
26             and other Bioperl modules. Send your comments and suggestions preferably
27             to one of the Bioperl mailing lists. Your participation is much appreciated.
28              
29             bioperl-l@bioperl.org - General discussion
30             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
31              
32             =head2 Support
33              
34             Please direct usage questions or support issues to the mailing list:
35              
36             I
37              
38             rather than to the module maintainer directly. Many experienced and
39             reponsive experts will be able look at the problem and quickly
40             address it. Please include a thorough description of the problem
41             with code and data examples if at all possible.
42              
43             =head2 Reporting Bugs
44              
45             Report bugs to the Bioperl bug tracking system to help us keep track
46             the bugs and their resolution. Bug reports can be submitted via the
47             web:
48              
49             https://github.com/bioperl/bioperl-live/issues
50              
51             =head1 AUTHOR - Stefan Kirov
52              
53             Email skirov@utk.edu
54              
55             =head1 APPENDIX
56              
57             The rest of the documentation details each of the object
58             methods. Internal methods are usually preceded with a _
59              
60             =cut
61              
62             # Let the code begin...
63             package Bio::Matrix::PSM::IO::mast;
64 1     1   4 use Bio::Matrix::PSM::InstanceSite;
  1         2  
  1         36  
65 1     1   4 use Bio::Matrix::PSM::Psm;
  1         0  
  1         15  
66 1     1   3 use Bio::Root::Root;
  1         1  
  1         16  
67 1     1   4 use strict;
  1         1  
  1         24  
68              
69 1     1   3 use base qw(Bio::Matrix::PSM::PsmHeader Bio::Matrix::PSM::IO);
  1         1  
  1         1477  
70              
71             =head2 new
72              
73             Title : new
74             Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=>'mast',
75             -file=>$file);
76             Function: Associates a file with the appropriate parser
77             Throws : Throws if the file passed is in HTML format or if
78             some criteria for the file
79             format are not met.
80             Example :
81             Returns : psm object, associated with a file with matrix file
82             Args : hash
83             return : "Bio::Matrix::PSM::$format"->new(@args);
84              
85             =cut
86              
87              
88             sub new {
89 2     2 1 5 my($class, @args)=@_;
90 2         10 my $self = $class->SUPER::new(@args);
91 2         2 my (%instances,@header,$n);
92 2         9 my ($file)=$self->_rearrange(['FILE'], @args);
93 2         4 $self->{file} = $file;
94 2         2 $self->{_factor}=1;
95 2 50       10 $self->_initialize_io(@args) || warn "Did you intend to use STDIN?"; #Read only for now
96 2         6 $self->{_end}=0;
97 2         2 undef $self->{hid};
98 2 50       6 return $self if ($file=~/^>/);#Just writing
99 2         9 my $buf=$self->_readline;
100 2 50       7 $self->throw('Cannot parse HTML format yet') if ($buf =~/^/);
101             # this should probably be moved to its own function
102 2         4 while ( defined($buf=$self->_readline)) {
103 90         74 chomp($buf);
104 90 100       120 if ($buf=~/DATABASE AND MOTIFS/) {
105 2         5 while ($buf=$self->_readline) {
106 16 100       22 if ($buf=~/DATABASE/) {
107 2         6 $buf=~s/^[\s\t]+//;
108 2         2 chomp $buf;
109 2         11 ($n,$self->{_dbname},$self->{_dbtype})=split(/\s/,$buf);
110 2         6 $self->{_dbtype}=~s/[\(\)]//g;
111             }
112 16 100       30 if ($buf=~/MOTIFS/) {
113 2         5 $buf=~s/^[\s\t]+//;
114 2         3 chomp $buf;
115 2         7 ($n,$self->{_mrsc},$self->{_msrctype})=split(/\s/,$buf);
116 2         6 $self->{_msrctype}=~s/[\(\)]//g;
117 2         3 last;
118             }
119             }
120 2 100       5 if ($self->{_msrctype} ne $self->{_dbtype}) {#Assume we have protein motifs, nuc DB (not handling opp.)
121 1         2 $self->{_factor}=3;
122 1         1 $self->{_mixquery}=1;
123             }
124             }
125 90 100       101 if ($buf=~m/MOTIF WIDTH BEST POSSIBLE MATCH/) {
126 2         3 $self->_readline;
127 2         4 while (defined($buf=$self->_readline)) {
128 10 100       23 last if ($buf!~/\w/);
129 8         25 $buf=~s/\t+//g;
130 8         17 $buf=~s/^\s+//g;
131 8         22 my ($id,$width,$seq)=split(/\s+/,$buf);
132 8         8 push @{$self->{hid}},$id;
  8         11  
133 8         12 $self->{length}->{$id}=$width;
134 8         16 $self->{seq}->{$id}=$seq;
135             }
136 2         4 next;
137             }
138 88 100       123 if ($buf=~m/section i:/i) {
139 2         4 $self->_readline;
140 2         4 $self->_readline;
141 2         3 $self->_readline;
142 2         4 %instances=_get_genes($self);
143 2         12 $self->{instances}=\%instances;
144 2 50       18 if (!(%instances)) {
145 0         0 $self->warn ("Your MAST analysis did not find any matches satisfying the current threshold.\nSee MAST documentation for more information.\n");
146 0         0 return $self; #The header might be useful so we return the object, not undef
147             }
148 2         5 next;
149             }
150 86 100       102 if ($buf=~m/section ii:/i) {
151 2         5 $self->_readline;
152 2         4 $self->_readline;
153 2         5 $self->_readline;
154 2         3 last;
155             }
156 84         214 $buf=~s/[\t+\s+]/ /g;
157 84 100 100     359 push @header,$buf unless (($buf=~/\*{10,}/)||($buf!~/\w/));
158             }
159 2 50       6 $self->throw('Could not read Section I, probably wrong format, make sure it is not HTML, giving up...') if !(%instances);
160 2 50       23 $self->warn( "This file might be an unreadable version, proceed with caution!\n") if (!grep(/\s+MAST\s+version\s+3/,@header));
161              
162 2         4 $self->{unstructured} = \@header;
163 2         16 $self->_initialize;
164 2         15 return $self;
165             }
166              
167              
168             # Get the file header and put store it as a hash, which later we'll use to create
169             # the header for each Psm. See Bio::Matrix::PSM::PsmI for header function.
170             sub _get_genes {
171 2     2   2 my $self=shift;
172 2         4 my %llid;
173 2         1 my $ok=0;
174 2         3 my $i=0;
175 2         2 my %instances;
176 2         3 while (my $line=$self->_readline) {
177 108 100       222 last if ($line=~/^[\s\t*]/); # Well, ids can be nearly anything...???
178 106         102 chomp($line);
179 106         77 $i++;
180 106 50       124 next if ($line eq '');
181 106         441 $line=~s/\s+/,/g;
182 106         249 my ($id,$key,$eval,$len)=split(/,/,$line);
183 106 50       156 unless ($len) {
184 0         0 warn "Malformed data found: $line\n";
185 0         0 next;
186             }
187 106         339 $instances{$id}=Bio::Matrix::PSM::InstanceSite->new(-id=>$id,
188             -desc=>$key,
189             -score=>$eval,
190             -width=>$len,
191             -seq=>'ACGT');
192             }
193 2         73 return %instances;
194             }
195              
196              
197             =head2 next_psm
198              
199             Title : next_psm
200             Usage : my $psm=$psmIO->next_psm();
201             Function: Reads the next PSM from the input file, associated with this object
202             Throws : Throws if there ara format violations in the input file (checking is not
203             very strict with all drivers).
204             Example :
205             Returns : Bio::Matrix::PSM::Psm object
206             Args : none
207              
208             =cut
209              
210              
211             sub next_psm {
212 2     2 1 626 my $self=shift;
213 2 50       8 return if ($self->{_end}==1);
214 2         1 my (@lmotifsm,%index,$eval,$scheme,$sid);
215 2         3 %index= %{$self->{length}};
  2         9  
216 2         2 my (@instances,%instances);
217 2         5 my $line=$self->_readline;
218 2         8 $line=~s/[\t\n]//;
219 2 50       7 if ($line =~ /\*{10,}/) { #Endo of Section II if we do only section II
220 0         0 $self->{_end}=1;
221 0         0 return ;
222             }
223 2         2 do {
224 36 100       40 if ($line!~/^\s/) {
225 2         9 ($sid,$eval,$scheme)=split(/\s+/,$line,3);
226             }
227             else
228 34         31 { $scheme .=$line; }
229 36         44 $line=$self->_readline;
230 36         117 $line=~s/[\t\n]//;
231             } until ($line!~/^\s/);
232 2         3 my $pos=1;
233 2         18 $scheme=~s/\s+//g;
234 2         4 $scheme=~s/\n//g;
235 2         33 my @motifs=split(/_/,$scheme);
236 2         5 while (@motifs) {
237 184         158 my $next=shift(@motifs);
238 184 100       351 if (!($next=~/\D/)) {
239 93 100       109 last if (!@motifs);
240 91         109 $pos+=$next;
241 91         129 next;
242             }
243 91         68 my $id=$next;
244 91 100       134 my $score= $id=~m/\[/ ? 'strong' : 'weak' ;
245 91         58 my $frame;
246 91 100       155 my $strand = $id =~ m/\-\d/ ? -1 : 1 ;
247 91 100       106 if ($self->{_mixquery}) {
248 87 100       154 $frame = 0 if $id =~ m/\d+a/ ;
249 87 100       142 $frame = 1 if $id =~ m/\d+b/ ;
250 87 100       157 $frame = 2 if $id =~ m/\d+c/ ;
251             }
252 91         196 $id=~s/\D+//g;
253              
254 91         71 my @s;
255 91         87 my $width=$index{$id};
256             #We don't know the sequence, but we know the length
257 91         152 my $seq='N' x ($width*$self->{_factor}); #Future version will have to parse Section tree nad get the real seq
258 91         463 my $instance=Bio::Matrix::PSM::InstanceSite->new
259             ( -id=>"$id\@$sid",
260             -mid=>$id,
261             -accession_number=>$sid,
262             -desc=>"Motif $id occurrance in $sid",
263             -score=>$score,
264             -seq=>$seq,
265             -alphabet => 'dna',
266             -start=>$pos,
267             -strand=>$strand);
268 91 100       268 $instance->frame($frame) if ($self->{_mixquery});
269 91         111 push @instances,$instance;
270 91         210 $pos+=$index{$id}*$self->{_factor};
271             }
272 2         29 my $psm= Bio::Matrix::PSM::Psm->new(-instances=> \@instances,
273             -e_val => $eval,
274             -id => $sid);
275 2         15 $self->_pushback($line);
276 2         9 return $psm;
277             }
278              
279              
280             =head2 write_psm
281              
282             Title : write_psm
283             Usage : #Get SiteMatrix object somehow (see Bio::Matrix::PSM::SiteMatrix)
284             my $matrix=$psmin->next_matrix;
285             #Create the stream
286             my $psmio=new(-file=>">psms.mast",-format=>'mast');
287             $psmio->write_psm($matrix);
288             #Will warn if only PFM data is contained in $matrix, recalculate the PWM
289             #based on normal distribution (A=>0.25, C=>0.25, etc)
290             Function: writes pwm in mast format
291             Throws :
292             Example :
293             Args : SiteMatrix object
294             Returns :
295              
296             =cut
297              
298             sub write_psm {
299 0     0 1   my ($self,$matrix)=@_;
300             # my $idline=">". $matrix->id . "\n";
301 0           my $w=$matrix->width;
302 0           my $header="ALPHABET= ACGT\nlog-odds matrix: alength= 4 w= $w\n";
303 0           $self->_print($header);
304 0 0         unless ($matrix->get_logs_array('A')) {
305 0           warn "No log-odds data, available, using normal distribution to recalculate the PWM";
306 0           $matrix->calc_weight({A=>0.25, C=>0.25, G=>0.25,T=>0.25});
307             }
308 0           while (my %h=$matrix->next_pos) {
309 0           $self->_print (join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"));
310             }
311             }
312              
313             1;