File Coverage

Bio/Matrix/PSM/IO/masta.pm
Criterion Covered Total %
statement 139 142 97.8
branch 36 46 78.2
condition 11 20 55.0
subroutine 12 12 100.0
pod 4 4 100.0
total 202 224 90.1


line stmt bran cond sub pod time code
1             #---------------------------------------------------------
2              
3             =head1 NAME
4              
5             Bio::Matrix::PSM::IO::masta - motif fasta format parser
6              
7             =head1 SYNOPSIS
8              
9             MASTA is a position frequency matrix format similar to
10             fasta. It contains one ID row just like fasta and then the actual
11             data, which is tab delimited:
12              
13             0.1 0.62 .017 0.11
14             0.22 0.13 0.54 0.11
15              
16             Or A,C,G and T could be horizontally positioned (positioning is
17             automatically detected). Please note masta will parse only DNA at the
18             moment.
19              
20             It will also convert a set of aligned sequences:
21             ACATGCAT
22             ACAGGGAT
23             ACAGGCAT
24             ACCGGCAT
25              
26             to a PFM (SiteMatrix object). When writing if you supply SEQ it will
27             write 10 random instances, which represent correctly the frequency and
28             can be used as an input for weblogo creation purposes.
29              
30             See Bio::Matrix::PSM::IO for detailed documentation on how to use masta parser
31              
32             =head1 DESCRIPTION
33              
34             Parser for meme.
35              
36             =head1 FEEDBACK
37              
38             =head2 Mailing Lists
39              
40             User feedback is an integral part of the evolution of this
41             and other Bioperl modules. Send your comments and suggestions preferably
42             to one of the Bioperl mailing lists.
43             Your participation is much appreciated.
44              
45             bioperl-l@bioperl.org - General discussion
46             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47              
48             =head2 Support
49              
50             Please direct usage questions or support issues to the mailing list:
51              
52             I
53              
54             rather than to the module maintainer directly. Many experienced and
55             reponsive experts will be able look at the problem and quickly
56             address it. Please include a thorough description of the problem
57             with code and data examples if at all possible.
58              
59             =head2 Reporting Bugs
60              
61             Report bugs to the Bioperl bug tracking system to help us keep track
62             the bugs and their resolution. Bug reports can be submitted via the
63             web:
64              
65             https://github.com/bioperl/bioperl-live/issues
66              
67             =head1 AUTHOR - Stefan Kirov
68              
69             Email skirov@utk.edu
70              
71             =head1 APPENDIX
72              
73             =cut
74              
75              
76             # Let the code begin...
77             package Bio::Matrix::PSM::IO::masta;
78 1     1   285 use Bio::Matrix::PSM::SiteMatrix;
  1         3  
  1         43  
79 1     1   8 use vars qw(@HEADER);
  1         3  
  1         56  
80 1     1   7 use strict;
  1         1  
  1         20  
81              
82 1     1   4 use base qw(Bio::Matrix::PSM::IO Bio::Root::Root);
  1         1  
  1         1351  
83              
84              
85              
86             =head2 new
87              
88             Title : new
89             Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=> 'masta',
90             -file => $file,
91             -mtype => 'PWM');
92             Function: Associates a file with the appropriate parser
93             Throws :
94             Example :
95             Args : hash
96             Returns : "Bio::Matrix::PSM::$format"->new(@args);
97              
98             =cut
99              
100             sub new {
101 3     3 1 7 my($class, @args)=@_;
102 3         10 my $self = $class->SUPER::new(@args);
103 3         6 my ($file)=$self->_rearrange(['FILE'], @args);
104 3         9 my ($query,$tr1)=split(/\./,$file,2);
105 3         4 $self->{file} = $file;
106 3         4 $self->{_end} = 0;
107 3   50     7 $self->{mtype} = uc($self->_rearrange(['MTYPE'], @args) || "PFM");
108 3 50       7 $self->_initialize_io(@args) || $self->warn("Did you intend to use STDIN?"); #Read only for now
109 3         13 return $self;
110             }
111              
112             =head2 write_psm
113              
114             Title : write_psm
115             Usage :
116             Function: writes a pfm/pwm/raw sequence in a simple masta format
117             Throws :
118             Example :
119             Args : SiteMatrix object, type (optional string: PWM, SEQ or PFM)
120             Returns :
121              
122             =cut
123              
124             sub write_psm {
125 3     3 1 1704 my ($self,$matrix,$type)=@_;
126 3 50       9 $self->{mtype} = uc($type) if ($type);
127 3         7 my $idline=">". $matrix->id . "\n";
128 3         10 $self->_print($idline);
129 3 100       5 unless ($self->{mtype} eq 'SEQ') {
130 2         6 while (my %h=$matrix->next_pos) {
131 50 100       173 my $row=$self->{mtype} eq 'PWM' ? join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"):join("\t",$h{pA},$h{pC},$h{pG},$h{pT},"\n");
132 50         68 $self->_print ($row);
133             }
134             } else {
135 1         7 my @seq;
136 1         2 while (my %h=$matrix->next_pos) {
137 25         33 my ($a,$c,$g,$t)=_freq_to_count(\%h);
138 25 50       35 $self->throw("Could not convert from frequency to count\n") if (($a+$c+$g+$t) !=10);
139 25         31 for my $i (0..$a-1) {$seq[$i].='A';}
  97         88  
140 25         22 my $m=$a+$c;
141 25         27 for my $i ($a..$m-1) {$seq[$i].='C';}
  110         102  
142 25         25 my $n=$a+$c+$g;
143 25         27 for my $i ($m..$n-1) {$seq[$i].='G';}
  20         20  
144 25         62 for my $i ($n..9) {$seq[$i].='T';}
  23         27  
145             }
146 1         1 foreach my $s (@seq) {
147 10         10 $s.="\n";
148 10         11 $self->_print ($s);
149             }
150             }
151             }
152              
153             =head2 next_matrix
154              
155             Title : next_matrix
156             Usage : my $matrix = $psmio->next_matrix;
157             Function: Alias of next_psm function
158              
159             =cut
160              
161             sub next_matrix {
162 7     7 1 1304 shift->next_psm(@_);
163             }
164              
165             =head2 next_psm
166              
167             Title : next_psm
168             Usage : my $matrix=$psmio->next_psm;
169             Function: returns the next matrix in the stream
170             Throws : If there is you mix different types, for example weights and
171             frequencies occur in the same entry You can mix weights, but these
172             should be designated by different ID lines
173             Example :
174             Args :
175             Returns : Bio::Matrix::PSM::SiteMatrix
176              
177             =cut
178              
179             sub next_psm {
180 7     7 1 7 my $self=shift;
181 7 100       14 return if ($self->{_end});
182 6         20 my $line=$self->_readline;
183 6 50       22 $self->throw("No ID line- wrong format\n") unless ($line=~/^>/);
184 6         23 my ($id,$desc)=split(/[\t\s]+/,$line,2);
185 6         16 $id=~s/>//;
186 6         6 my ($mtype,$format,@mdata,$len);
187 6         10 $self->{_mtype} = 0;
188 6         11 while ($line=$self->_readline) {
189 123 100       275 next if $line =~ /^\s+$/;# There should not be empty lines, but just in case...
190 122         147 chomp $line;
191 122 100       175 if ($line =~ /^>/) {
192 4         12 $self->_pushback($line);
193 4         4 last;
194             }
195              
196 118 100       227 if ($line !~ /[^ACGTacgt]/g) {
197             # This is a set of aligned sequences
198             $self->throw("Mixing between types is not allowed or a parsing error occurred\n")
199 18 50 66     27 if (($self->{_mtype} != 3) && ($mtype)) ;
200 18 50 66     40 $self->throw("Bad sequence- different length: $line\n")
201             if (($len) && ($len!=length($line)));
202 18 100       46 $len=length($line) unless ($len);
203 18         25 push @mdata,$line;
204 18         30 $self->{_mtype}=3;
205             } else {
206             # do not strip 'e's since they are part of number notation for small/big numbers
207 100         137 $line=~s/[a-df-zA-DF-Z]//g; #Well we may wanna do a hash and auto check for letter order if there is a really boring talk...
208 100         156 $line=~s/^[\s\t]+//;
209 100         367 $line=~s/[\s\t]+/\t/g;
210 100         315 my @data=split(/[\s\t]+/,$line);
211 100 50       160 if ($#data==3) {
212 100 50 66     232 $self->throw("Mixing between types is not allowed or a parsing error occurred\n") if (($mtype)&&($self->{_mtype} !=1)) ;
213 100         102 $self->{_mtype}=1;
214 100         83 $mtype=1;
215             }
216             else {
217 0 0 0     0 $self->throw("Mixing between types is not allowedor a parsing error occurred\n") if (($mtype)&&($self->{_mtype} !=2)) ;
218 0         0 $self->{_mtype}=2;
219 0         0 $mtype=1;
220             }
221 100         219 push @mdata,\@data;
222             }
223             }
224 6 100 66     19 $self->{_end} = 1 if (!defined $line || $line !~ /^>/);
225 6         14 return _make_matrix(\@mdata,$self->{_mtype},$id,$desc);
226             }
227              
228             sub _make_matrix {
229 6     6   11 my ($mdata,$type,$id,$desc)=@_;
230 6 100       9 if ($type==1) {
231 4         7 my @rearr=_rearrange_matrix($mdata);
232 4         4 $mdata=\@rearr;
233             }
234             #Auto recognition for what type is this entry (PFM, PWM or simple count)
235             #A bit dangerous, I hate too much auto stuff, but I want to be able to mix different
236             #types in a single file
237 6         7 my $mformat='count';
238 6         32 my ($a,$c,$g,$t);
239 6 100       11 if ($type == 3 ) {
240 2         3 ($a,$c,$g,$t)= &_count_positions($mdata);
241             } else {
242 4         4 ($a,$c,$g,$t)=@{$mdata};
  4         5  
243 4         15 my $k=$a->[0]+$c->[0]+$g->[0]+$t->[0];
244 4         8 my $l= ($a->[0]+$c->[0]+$g->[0]+$t->[0]) -
245             (abs($a->[0])+abs($c->[0])+abs($g->[0])+abs($t->[0]));
246 4 100 66     13 $mformat='freq' if (($k==1) && ($l==0));
247 4 100       6 $mformat='pwm' if ($l!=0);
248             }
249 6         8 my (@fa,@fc,@fg,@ft,%mparam);
250              
251 6 100       12 if ($mformat eq 'pwm') {
252 2         2 foreach my $i (0..$#{$a}) {
  2         5  
253 50         96 my $ca=exp $a->[$i];
254 50         60 my $cc=exp $c->[$i];
255 50         60 my $cg=exp $g->[$i];
256 50         56 my $ct=exp $t->[$i];
257 50         53 my $all=$ca+$cc+$cg+$ct;
258 50         69 push @fa,($ca/$all)*100;
259 50         59 push @fc,($cc/$all)*100;
260 50         53 push @fg,($cg/$all)*100;
261 50         69 push @ft,($ct/$all)*100;
262             }
263             }
264 6         10 $desc.=", source is $mformat";
265 6 100       10 if ($mformat eq 'pwm') {
266 2         3 $desc=~s/^pwm//;
267 2         13 %mparam=(-pA=>\@fa,-pC=>\@fc,-pG=>\@fg,-pT=>\@ft,-id=>$id,-desc=>$desc,
268             -lA=>$a,-lC=>$c,-lG=>$g,-lT=>$t);
269             }
270             else {
271 4         17 %mparam=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,-id=>$id,-desc=>$desc);
272             }
273 6         30 return new Bio::Matrix::PSM::SiteMatrix(%mparam);
274             }
275              
276             sub _rearrange_matrix {
277 4     4   4 my $mdata=shift;
278 4         5 my (@a,@c,@g,@t);
279 4         5 foreach my $entry (@{$mdata}) {
  4         5  
280 100         127 my ($a,$c,$g,$t)=@$entry;
281 100         97 push @a,$a;
282 100         106 push @c,$c;
283 100         94 push @g,$g;
284 100         125 push @t,$t;
285             }
286 4         11 return \@a,\@c,\@g,\@t;
287             }
288              
289              
290             sub _count_positions {
291 2     2   20 my $seq=shift;
292 2         3 my %pos;
293 2         2 my $l=length($seq->[0])-1;
294 2         5 for( my $i = 0; $i <= $l; $i++ ) {
295 50         63 for ( qw(A C G T) ) {
296 200         302 $pos{$_}->[$i] = 0;
297             }
298             }
299 2         5 foreach my $sequence (@{$seq}) {
  2         4  
300 18         60 my @let= split(//,$sequence);
301 18         25 for my $i (0..$#let) {
302 450         451 $pos{uc($let[$i])}->[$i]++;
303             }
304             }
305 2         8 return $pos{A},$pos{C},$pos{G},$pos{T};
306             }
307              
308              
309             sub _freq_to_count {
310 25     25   23 my $h=shift;
311 25         34 my $a=int(10*$h->{pA}+0.5);
312 25         31 my $c=int(10*$h->{pC}+0.5);
313 25         26 my $g=int(10*$h->{pG}+0.5);
314 25         26 my $t=int(10*$h->{pT}+0.5);
315 25         41 return ($a,$c,$g,$t);
316             }
317              
318             1;