File Coverage

Bio/Matrix/PSM/IO/masta.pm
Criterion Covered Total %
statement 139 142 97.8
branch 36 46 78.2
condition 11 20 55.0
subroutine 12 12 100.0
pod 4 4 100.0
total 202 224 90.1


line stmt bran cond sub pod time code
1             #---------------------------------------------------------
2              
3             =head1 NAME
4              
5             Bio::Matrix::PSM::IO::masta - motif fasta format parser
6              
7             =head1 SYNOPSIS
8              
9             MASTA is a position frequency matrix format similar to
10             fasta. It contains one ID row just like fasta and then the actual
11             data, which is tab delimited:
12              
13             0.1 0.62 .017 0.11
14             0.22 0.13 0.54 0.11
15              
16             Or A,C,G and T could be horizontally positioned (positioning is
17             automatically detected). Please note masta will parse only DNA at the
18             moment.
19              
20             It will also convert a set of aligned sequences:
21             ACATGCAT
22             ACAGGGAT
23             ACAGGCAT
24             ACCGGCAT
25              
26             to a PFM (SiteMatrix object). When writing if you supply SEQ it will
27             write 10 random instances, which represent correctly the frequency and
28             can be used as an input for weblogo creation purposes.
29              
30             See Bio::Matrix::PSM::IO for detailed documentation on how to use masta parser
31              
32             =head1 DESCRIPTION
33              
34             Parser for meme.
35              
36             =head1 FEEDBACK
37              
38             =head2 Mailing Lists
39              
40             User feedback is an integral part of the evolution of this
41             and other Bioperl modules. Send your comments and suggestions preferably
42             to one of the Bioperl mailing lists.
43             Your participation is much appreciated.
44              
45             bioperl-l@bioperl.org - General discussion
46             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47              
48             =head2 Support
49              
50             Please direct usage questions or support issues to the mailing list:
51              
52             I
53              
54             rather than to the module maintainer directly. Many experienced and
55             reponsive experts will be able look at the problem and quickly
56             address it. Please include a thorough description of the problem
57             with code and data examples if at all possible.
58              
59             =head2 Reporting Bugs
60              
61             Report bugs to the Bioperl bug tracking system to help us keep track
62             the bugs and their resolution. Bug reports can be submitted via the
63             web:
64              
65             https://github.com/bioperl/bioperl-live/issues
66              
67             =head1 AUTHOR - Stefan Kirov
68              
69             Email skirov@utk.edu
70              
71             =head1 APPENDIX
72              
73             =cut
74              
75              
76             # Let the code begin...
77             package Bio::Matrix::PSM::IO::masta;
78 1     1   350 use Bio::Matrix::PSM::SiteMatrix;
  1         2  
  1         36  
79 1     1   9 use vars qw(@HEADER);
  1         1  
  1         54  
80 1     1   4 use strict;
  1         1  
  1         24  
81              
82 1     1   3 use base qw(Bio::Matrix::PSM::IO Bio::Root::Root);
  1         2  
  1         1341  
83              
84              
85              
86             =head2 new
87              
88             Title : new
89             Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=> 'masta',
90             -file => $file,
91             -mtype => 'PWM');
92             Function: Associates a file with the appropriate parser
93             Throws :
94             Example :
95             Args : hash
96             Returns : "Bio::Matrix::PSM::$format"->new(@args);
97              
98             =cut
99              
100             sub new {
101 3     3 1 6 my($class, @args)=@_;
102 3         14 my $self = $class->SUPER::new(@args);
103 3         9 my ($file)=$self->_rearrange(['FILE'], @args);
104 3         10 my ($query,$tr1)=split(/\./,$file,2);
105 3         5 $self->{file} = $file;
106 3         4 $self->{_end} = 0;
107 3   50     9 $self->{mtype} = uc($self->_rearrange(['MTYPE'], @args) || "PFM");
108 3 50       9 $self->_initialize_io(@args) || $self->warn("Did you intend to use STDIN?"); #Read only for now
109 3         14 return $self;
110             }
111              
112             =head2 write_psm
113              
114             Title : write_psm
115             Usage :
116             Function: writes a pfm/pwm/raw sequence in a simple masta format
117             Throws :
118             Example :
119             Args : SiteMatrix object, type (optional string: PWM, SEQ or PFM)
120             Returns :
121              
122             =cut
123              
124             sub write_psm {
125 3     3 1 1184 my ($self,$matrix,$type)=@_;
126 3 50       9 $self->{mtype} = uc($type) if ($type);
127 3         6 my $idline=">". $matrix->id . "\n";
128 3         11 $self->_print($idline);
129 3 100       8 unless ($self->{mtype} eq 'SEQ') {
130 2         5 while (my %h=$matrix->next_pos) {
131 50 100       175 my $row=$self->{mtype} eq 'PWM' ? join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"):join("\t",$h{pA},$h{pC},$h{pG},$h{pT},"\n");
132 50         72 $self->_print ($row);
133             }
134             } else {
135 1         2 my @seq;
136 1         3 while (my %h=$matrix->next_pos) {
137 25         40 my ($a,$c,$g,$t)=_freq_to_count(\%h);
138 25 50       41 $self->throw("Could not convert from frequency to count\n") if (($a+$c+$g+$t) !=10);
139 25         37 for my $i (0..$a-1) {$seq[$i].='A';}
  97         75  
140 25         19 my $m=$a+$c;
141 25         23 for my $i ($a..$m-1) {$seq[$i].='C';}
  110         78  
142 25         22 my $n=$a+$c+$g;
143 25         24 for my $i ($m..$n-1) {$seq[$i].='G';}
  20         16  
144 25         72 for my $i ($n..9) {$seq[$i].='T';}
  23         25  
145             }
146 1         3 foreach my $s (@seq) {
147 10         9 $s.="\n";
148 10         13 $self->_print ($s);
149             }
150             }
151             }
152              
153             =head2 next_matrix
154              
155             Title : next_matrix
156             Usage : my $matrix = $psmio->next_matrix;
157             Function: Alias of next_psm function
158              
159             =cut
160              
161             sub next_matrix {
162 7     7 1 895 shift->next_psm(@_);
163             }
164              
165             =head2 next_psm
166              
167             Title : next_psm
168             Usage : my $matrix=$psmio->next_psm;
169             Function: returns the next matrix in the stream
170             Throws : If there is you mix different types, for example weights and
171             frequencies occur in the same entry You can mix weights, but these
172             should be designated by different ID lines
173             Example :
174             Args :
175             Returns : Bio::Matrix::PSM::SiteMatrix
176              
177             =cut
178              
179             sub next_psm {
180 7     7 1 7 my $self=shift;
181 7 100       22 return if ($self->{_end});
182 6         22 my $line=$self->_readline;
183 6 50       26 $self->throw("No ID line- wrong format\n") unless ($line=~/^>/);
184 6         24 my ($id,$desc)=split(/[\t\s]+/,$line,2);
185 6         91 $id=~s/>//;
186 6         8 my ($mtype,$format,@mdata,$len);
187 6         13 $self->{_mtype} = 0;
188 6         15 while ($line=$self->_readline) {
189 123 100       314 next if $line =~ /^\s+$/;# There should not be empty lines, but just in case...
190 122         94 chomp $line;
191 122 100       152 if ($line =~ /^>/) {
192 4         13 $self->_pushback($line);
193 4         4 last;
194             }
195              
196 118 100       207 if ($line !~ /[^ACGTacgt]/g) {
197             # This is a set of aligned sequences
198             $self->throw("Mixing between types is not allowed or a parsing error occured\n")
199 18 50 66     35 if (($self->{_mtype} != 3) && ($mtype)) ;
200 18 50 66     46 $self->throw("Bad sequence- different length: $line\n")
201             if (($len) && ($len!=length($line)));
202 18 100       22 $len=length($line) unless ($len);
203 18         19 push @mdata,$line;
204 18         29 $self->{_mtype}=3;
205             } else {
206             # do not strip 'e's since they are part of number notation for small/big numbers
207 100         94 $line=~s/[a-df-zA-DF-Z]//g; #Well we may wanna do a hash and auto check for letter order if there is a really boring talk...
208 100         133 $line=~s/^[\s\t]+//;
209 100         306 $line=~s/[\s\t]+/\t/g;
210 100         291 my @data=split(/[\s\t]+/,$line);
211 100 50       121 if ($#data==3) {
212 100 50 66     289 $self->throw("Mixing between types is not allowed or a parsing error occured\n") if (($mtype)&&($self->{_mtype} !=1)) ;
213 100         80 $self->{_mtype}=1;
214 100         73 $mtype=1;
215             }
216             else {
217 0 0 0     0 $self->throw("Mixing between types is not allowedor a parsing error occured\n") if (($mtype)&&($self->{_mtype} !=2)) ;
218 0         0 $self->{_mtype}=2;
219 0         0 $mtype=1;
220             }
221 100         283 push @mdata,\@data;
222             }
223             }
224 6 100 66     26 $self->{_end} = 1 if (!defined $line || $line !~ /^>/);
225 6         16 return _make_matrix(\@mdata,$self->{_mtype},$id,$desc);
226             }
227              
228             sub _make_matrix {
229 6     6   10 my ($mdata,$type,$id,$desc)=@_;
230 6 100       15 if ($type==1) {
231 4         11 my @rearr=_rearrange_matrix($mdata);
232 4         9 $mdata=\@rearr;
233             }
234             #Auto recognition for what type is this entry (PFM, PWM or simple count)
235             #A bit dangerous, I hate too much auto stuff, but I want to be able to mix different
236             #types in a single file
237 6         8 my $mformat='count';
238 6         59 my ($a,$c,$g,$t);
239 6 100       14 if ($type == 3 ) {
240 2         6 ($a,$c,$g,$t)= &_count_positions($mdata);
241             } else {
242 4         6 ($a,$c,$g,$t)=@{$mdata};
  4         8  
243 4         21 my $k=$a->[0]+$c->[0]+$g->[0]+$t->[0];
244 4         14 my $l= ($a->[0]+$c->[0]+$g->[0]+$t->[0]) -
245             (abs($a->[0])+abs($c->[0])+abs($g->[0])+abs($t->[0]));
246 4 100 66     19 $mformat='freq' if (($k==1) && ($l==0));
247 4 100       15 $mformat='pwm' if ($l!=0);
248             }
249 6         10 my (@fa,@fc,@fg,@ft,%mparam);
250              
251 6 100       15 if ($mformat eq 'pwm') {
252 2         4 foreach my $i (0..$#{$a}) {
  2         8  
253 50         64 my $ca=exp $a->[$i];
254 50         52 my $cc=exp $c->[$i];
255 50         42 my $cg=exp $g->[$i];
256 50         41 my $ct=exp $t->[$i];
257 50         44 my $all=$ca+$cc+$cg+$ct;
258 50         55 push @fa,($ca/$all)*100;
259 50         38 push @fc,($cc/$all)*100;
260 50         39 push @fg,($cg/$all)*100;
261 50         43 push @ft,($ct/$all)*100;
262             }
263             }
264 6         14 $desc.=", source is $mformat";
265 6 100       12 if ($mformat eq 'pwm') {
266 2         4 $desc=~s/^pwm//;
267 2         23 %mparam=(-pA=>\@fa,-pC=>\@fc,-pG=>\@fg,-pT=>\@ft,-id=>$id,-desc=>$desc,
268             -lA=>$a,-lC=>$c,-lG=>$g,-lT=>$t);
269             }
270             else {
271 4         22 %mparam=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,-id=>$id,-desc=>$desc);
272             }
273 6         47 return new Bio::Matrix::PSM::SiteMatrix(%mparam);
274             }
275              
276             sub _rearrange_matrix {
277 4     4   6 my $mdata=shift;
278 4         6 my (@a,@c,@g,@t);
279 4         4 foreach my $entry (@{$mdata}) {
  4         9  
280 100         158 my ($a,$c,$g,$t)=@$entry;
281 100         99 push @a,$a;
282 100         80 push @c,$c;
283 100         67 push @g,$g;
284 100         104 push @t,$t;
285             }
286 4         19 return \@a,\@c,\@g,\@t;
287             }
288              
289              
290             sub _count_positions {
291 2     2   3 my $seq=shift;
292 2         3 my %pos;
293 2         4 my $l=length($seq->[0])-1;
294 2         6 for( my $i = 0; $i <= $l; $i++ ) {
295 50         43 for ( qw(A C G T) ) {
296 200         182 $pos{$_}->[$i] = 0;
297             }
298             }
299 2         2 foreach my $sequence (@{$seq}) {
  2         5  
300 18         53 my @let= split(//,$sequence);
301 18         24 for my $i (0..$#let) {
302 450         366 $pos{uc($let[$i])}->[$i]++;
303             }
304             }
305 2         9 return $pos{A},$pos{C},$pos{G},$pos{T};
306             }
307              
308              
309             sub _freq_to_count {
310 25     25   20 my $h=shift;
311 25         35 my $a=int(10*$h->{pA}+0.5);
312 25         22 my $c=int(10*$h->{pC}+0.5);
313 25         20 my $g=int(10*$h->{pG}+0.5);
314 25         19 my $t=int(10*$h->{pT}+0.5);
315 25         33 return ($a,$c,$g,$t);
316             }
317              
318             1;