File Coverage

Bio/Matrix/PSM/IO/mast.pm
Criterion Covered Total %
statement 136 151 90.0
branch 50 62 80.6
condition 3 3 100.0
subroutine 8 9 88.8
pod 3 3 100.0
total 200 228 87.7


line stmt bran cond sub pod time code
1              
2             =head1 NAME
3              
4             Bio::Matrix::PSM::IO::mast - PSM mast parser implementation
5              
6             =head1 SYNOPSIS
7              
8             See Bio::Matrix::PSM::IO for detailed documentation on how to
9             use PSM parsers
10              
11             =head1 DESCRIPTION
12              
13             Parser for mast. This driver unlike meme or transfac for example is
14             dedicated more to PSM sequence matches, than to PSM themselves.
15              
16             =head1 TO DO
17              
18             Section III should be parsed too, otherwise no real sequence is
19             available, so we supply 'NNNNN....' as a seq which is not right.
20              
21             =head1 FEEDBACK
22              
23             =head2 Mailing Lists
24              
25             User feedback is an integral part of the evolution of this
26             and other Bioperl modules. Send your comments and suggestions preferably
27             to one of the Bioperl mailing lists. Your participation is much appreciated.
28              
29             bioperl-l@bioperl.org - General discussion
30             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
31              
32             =head2 Support
33              
34             Please direct usage questions or support issues to the mailing list:
35              
36             I
37              
38             rather than to the module maintainer directly. Many experienced and
39             reponsive experts will be able look at the problem and quickly
40             address it. Please include a thorough description of the problem
41             with code and data examples if at all possible.
42              
43             =head2 Reporting Bugs
44              
45             Report bugs to the Bioperl bug tracking system to help us keep track
46             the bugs and their resolution. Bug reports can be submitted via the
47             web:
48              
49             https://github.com/bioperl/bioperl-live/issues
50              
51             =head1 AUTHOR - Stefan Kirov
52              
53             Email skirov@utk.edu
54              
55             =head1 APPENDIX
56              
57             The rest of the documentation details each of the object
58             methods. Internal methods are usually preceded with a _
59              
60             =cut
61              
62             # Let the code begin...
63             package Bio::Matrix::PSM::IO::mast;
64 1     1   5 use Bio::Matrix::PSM::InstanceSite;
  1         2  
  1         25  
65 1     1   4 use Bio::Matrix::PSM::Psm;
  1         2  
  1         18  
66 1     1   4 use Bio::Root::Root;
  1         1  
  1         14  
67 1     1   3 use strict;
  1         1  
  1         24  
68              
69 1     1   4 use base qw(Bio::Matrix::PSM::PsmHeader Bio::Matrix::PSM::IO);
  1         1  
  1         1385  
70              
71             =head2 new
72              
73             Title : new
74             Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=>'mast',
75             -file=>$file);
76             Function: Associates a file with the appropriate parser
77             Throws : Throws if the file passed is in HTML format or if
78             some criteria for the file
79             format are not met.
80             Example :
81             Returns : psm object, associated with a file with matrix file
82             Args : hash
83             return : "Bio::Matrix::PSM::$format"->new(@args);
84              
85             =cut
86              
87              
88             sub new {
89 2     2 1 7 my($class, @args)=@_;
90 2         8 my $self = $class->SUPER::new(@args);
91 2         4 my (%instances,@header,$n);
92 2         8 my ($file)=$self->_rearrange(['FILE'], @args);
93 2         4 $self->{file} = $file;
94 2         4 $self->{_factor}=1;
95 2 50       7 $self->_initialize_io(@args) || warn "Did you intend to use STDIN?"; #Read only for now
96 2         5 $self->{_end}=0;
97 2         4 undef $self->{hid};
98 2 50       5 return $self if ($file=~/^>/);#Just writing
99 2         9 my $buf=$self->_readline;
100 2 50       5 $self->throw('Cannot parse HTML format yet') if ($buf =~/^/);
101             # this should probably be moved to its own function
102 2         9 while ( defined($buf=$self->_readline)) {
103 90         104 chomp($buf);
104 90 100       122 if ($buf=~/DATABASE AND MOTIFS/) {
105 2         13 while ($buf=$self->_readline) {
106 16 100       26 if ($buf=~/DATABASE/) {
107 2         6 $buf=~s/^[\s\t]+//;
108 2         4 chomp $buf;
109 2         10 ($n,$self->{_dbname},$self->{_dbtype})=split(/\s/,$buf);
110 2         7 $self->{_dbtype}=~s/[\(\)]//g;
111             }
112 16 100       37 if ($buf=~/MOTIFS/) {
113 2         6 $buf=~s/^[\s\t]+//;
114 2         3 chomp $buf;
115 2         8 ($n,$self->{_mrsc},$self->{_msrctype})=split(/\s/,$buf);
116 2         6 $self->{_msrctype}=~s/[\(\)]//g;
117 2         4 last;
118             }
119             }
120 2 100       6 if ($self->{_msrctype} ne $self->{_dbtype}) {#Assume we have protein motifs, nuc DB (not handling opp.)
121 1         1 $self->{_factor}=3;
122 1         2 $self->{_mixquery}=1;
123             }
124             }
125 90 100       115 if ($buf=~m/MOTIF WIDTH BEST POSSIBLE MATCH/) {
126 2         6 $self->_readline;
127 2         4 while (defined($buf=$self->_readline)) {
128 10 100       22 last if ($buf!~/\w/);
129 8         22 $buf=~s/\t+//g;
130 8         23 $buf=~s/^\s+//g;
131 8         25 my ($id,$width,$seq)=split(/\s+/,$buf);
132 8         9 push @{$self->{hid}},$id;
  8         13  
133 8         17 $self->{length}->{$id}=$width;
134 8         20 $self->{seq}->{$id}=$seq;
135             }
136 2         6 next;
137             }
138 88 100       143 if ($buf=~m/section i:/i) {
139 2         4 $self->_readline;
140 2         4 $self->_readline;
141 2         3 $self->_readline;
142 2         4 %instances=_get_genes($self);
143 2         10 $self->{instances}=\%instances;
144 2 50       7 if (!(%instances)) {
145 0         0 $self->warn ("Your MAST analysis did not find any matches satisfying the current threshold.\nSee MAST documentation for more information.\n");
146 0         0 return $self; #The header might be useful so we return the object, not undef
147             }
148 2         7 next;
149             }
150 86 100       109 if ($buf=~m/section ii:/i) {
151 2         5 $self->_readline;
152 2         5 $self->_readline;
153 2         6 $self->_readline;
154 2         3 last;
155             }
156 84         247 $buf=~s/[\t+\s+]/ /g;
157 84 100 100     297 push @header,$buf unless (($buf=~/\*{10,}/)||($buf!~/\w/));
158             }
159 2 50       5 $self->throw('Could not read Section I, probably wrong format, make sure it is not HTML, giving up...') if !(%instances);
160 2 50       23 $self->warn( "This file might be an unreadable version, proceed with caution!\n") if (!grep(/\s+MAST\s+version\s+3/,@header));
161              
162 2         6 $self->{unstructured} = \@header;
163 2         16 $self->_initialize;
164 2         21 return $self;
165             }
166              
167              
168             # Get the file header and put store it as a hash, which later we'll use to create
169             # the header for each Psm. See Bio::Matrix::PSM::PsmI for header function.
170             sub _get_genes {
171 2     2   2 my $self=shift;
172 2         4 my %llid;
173 2         2 my $ok=0;
174 2         2 my $i=0;
175 2         3 my %instances;
176 2         4 while (my $line=$self->_readline) {
177 108 100       265 last if ($line=~/^[\s\t*]/); # Well, ids can be nearly anything...???
178 106         123 chomp($line);
179 106         108 $i++;
180 106 50       147 next if ($line eq '');
181 106         470 $line=~s/\s+/,/g;
182 106         311 my ($id,$key,$eval,$len)=split(/,/,$line);
183 106 50       167 unless ($len) {
184 0         0 warn "Malformed data found: $line\n";
185 0         0 next;
186             }
187 106         310 $instances{$id}=Bio::Matrix::PSM::InstanceSite->new(-id=>$id,
188             -desc=>$key,
189             -score=>$eval,
190             -width=>$len,
191             -seq=>'ACGT');
192             }
193 2         64 return %instances;
194             }
195              
196              
197             =head2 next_psm
198              
199             Title : next_psm
200             Usage : my $psm=$psmIO->next_psm();
201             Function: Reads the next PSM from the input file, associated with this object
202             Throws : Throws if there ara format violations in the input file (checking is not
203             very strict with all drivers).
204             Example :
205             Returns : Bio::Matrix::PSM::Psm object
206             Args : none
207              
208             =cut
209              
210              
211             sub next_psm {
212 2     2 1 969 my $self=shift;
213 2 50       7 return if ($self->{_end}==1);
214 2         3 my (@lmotifsm,%index,$eval,$scheme,$sid);
215 2         3 %index= %{$self->{length}};
  2         10  
216 2         5 my (@instances,%instances);
217 2         6 my $line=$self->_readline;
218 2         9 $line=~s/[\t\n]//;
219 2 50       5 if ($line =~ /\*{10,}/) { #Endo of Section II if we do only section II
220 0         0 $self->{_end}=1;
221 0         0 return ;
222             }
223 2         3 do {
224 36 100       57 if ($line!~/^\s/) {
225 2         8 ($sid,$eval,$scheme)=split(/\s+/,$line,3);
226             }
227             else
228 34         55 { $scheme .=$line; }
229 36         55 $line=$self->_readline;
230 36         137 $line=~s/[\t\n]//;
231             } until ($line!~/^\s/);
232 2         3 my $pos=1;
233 2         21 $scheme=~s/\s+//g;
234 2         5 $scheme=~s/\n//g;
235 2         33 my @motifs=split(/_/,$scheme);
236 2         5 while (@motifs) {
237 184         219 my $next=shift(@motifs);
238 184 100       422 if (!($next=~/\D/)) {
239 93 100       139 last if (!@motifs);
240 91         141 $pos+=$next;
241 91         139 next;
242             }
243 91         97 my $id=$next;
244 91 100       156 my $score= $id=~m/\[/ ? 'strong' : 'weak' ;
245 91         88 my $frame;
246 91 100       162 my $strand = $id =~ m/\-\d/ ? -1 : 1 ;
247 91 100       141 if ($self->{_mixquery}) {
248 87 100       164 $frame = 0 if $id =~ m/\d+a/ ;
249 87 100       159 $frame = 1 if $id =~ m/\d+b/ ;
250 87 100       170 $frame = 2 if $id =~ m/\d+c/ ;
251             }
252 91         261 $id=~s/\D+//g;
253              
254 91         128 my @s;
255 91         111 my $width=$index{$id};
256             #We don't know the sequence, but we know the length
257 91         183 my $seq='N' x ($width*$self->{_factor}); #Future version will have to parse Section tree nad get the real seq
258 91         439 my $instance=Bio::Matrix::PSM::InstanceSite->new
259             ( -id=>"$id\@$sid",
260             -mid=>$id,
261             -accession_number=>$sid,
262             -desc=>"Motif $id occurrance in $sid",
263             -score=>$score,
264             -seq=>$seq,
265             -alphabet => 'dna',
266             -start=>$pos,
267             -strand=>$strand);
268 91 100       336 $instance->frame($frame) if ($self->{_mixquery});
269 91         129 push @instances,$instance;
270 91         236 $pos+=$index{$id}*$self->{_factor};
271             }
272 2         23 my $psm= Bio::Matrix::PSM::Psm->new(-instances=> \@instances,
273             -e_val => $eval,
274             -id => $sid);
275 2         17 $self->_pushback($line);
276 2         25 return $psm;
277             }
278              
279              
280             =head2 write_psm
281              
282             Title : write_psm
283             Usage : #Get SiteMatrix object somehow (see Bio::Matrix::PSM::SiteMatrix)
284             my $matrix=$psmin->next_matrix;
285             #Create the stream
286             my $psmio=new(-file=>">psms.mast",-format=>'mast');
287             $psmio->write_psm($matrix);
288             #Will warn if only PFM data is contained in $matrix, recalculate the PWM
289             #based on normal distribution (A=>0.25, C=>0.25, etc)
290             Function: writes pwm in mast format
291             Throws :
292             Example :
293             Args : SiteMatrix object
294             Returns :
295              
296             =cut
297              
298             sub write_psm {
299 0     0 1   my ($self,$matrix)=@_;
300             # my $idline=">". $matrix->id . "\n";
301 0           my $w=$matrix->width;
302 0           my $header="ALPHABET= ACGT\nlog-odds matrix: alength= 4 w= $w\n";
303 0           $self->_print($header);
304 0 0         unless ($matrix->get_logs_array('A')) {
305 0           warn "No log-odds data, available, using normal distribution to recalculate the PWM";
306 0           $matrix->calc_weight({A=>0.25, C=>0.25, G=>0.25,T=>0.25});
307             }
308 0           while (my %h=$matrix->next_pos) {
309 0           $self->_print (join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"));
310             }
311             }
312              
313             1;