File Coverage

Bio/AlignIO/msf.pm
Criterion Covered Total %
statement 88 98 89.8
branch 20 28 71.4
condition 5 9 55.5
subroutine 8 8 100.0
pod 2 2 100.0
total 123 145 84.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::AlignIO::msf
3             # based on the Bio::SeqIO::msf module
4             # by Ewan Birney
5             # and Lincoln Stein
6             #
7             # and the SimpleAlign.pm module of Ewan Birney
8             #
9             # Copyright Peter Schattner
10             #
11             # You may distribute this module under the same terms as perl itself
12             # _history
13             # September 5, 2000
14             # POD documentation - main docs before the code
15              
16             =head1 NAME
17              
18             Bio::AlignIO::msf - msf sequence input/output stream
19              
20             =head1 SYNOPSIS
21              
22             Do not use this module directly. Use it via the L class.
23              
24             =head1 DESCRIPTION
25              
26             This object can transform L objects to and from msf
27             flat file databases.
28              
29             =head1 FEEDBACK
30              
31             =head2 Support
32              
33             Please direct usage questions or support issues to the mailing list:
34              
35             I
36              
37             rather than to the module maintainer directly. Many experienced and
38             reponsive experts will be able look at the problem and quickly
39             address it. Please include a thorough description of the problem
40             with code and data examples if at all possible.
41              
42             =head2 Reporting Bugs
43              
44             Report bugs to the Bioperl bug tracking system to help us keep track
45             the bugs and their resolution. Bug reports can be submitted via the
46             web:
47              
48             https://github.com/bioperl/bioperl-live/issues
49              
50             =head1 AUTHORS - Peter Schattner
51              
52             Email: schattner@alum.mit.edu
53              
54              
55             =head1 APPENDIX
56              
57             The rest of the documentation details each of the object
58             methods. Internal methods are usually preceded with a _
59              
60             =cut
61              
62             # Let the code begin...
63              
64             package Bio::AlignIO::msf;
65 3     3   412 use vars qw(%valid_type);
  3         6  
  3         131  
66 3     3   14 use strict;
  3         3  
  3         60  
67              
68 3     3   494 use Bio::SeqIO::gcg; # for GCG_checksum()
  3         6  
  3         99  
69 3     3   439 use Bio::SimpleAlign;
  3         4  
  3         82  
70              
71 3     3   15 use base qw(Bio::AlignIO);
  3         4  
  3         483  
72              
73             BEGIN {
74 3     3   1977 %valid_type = qw( dna N rna N protein P );
75             }
76              
77             =head2 next_aln
78              
79             Title : next_aln
80             Usage : $aln = $stream->next_aln()
81             Function: returns the next alignment in the stream. Tries to read *all* MSF
82             It reads all non whitespace characters in the alignment
83             area. For MSFs with weird gaps (eg ~~~) map them by using
84             $aln->map_chars('~','-')
85             Returns : Bio::Align::AlignI object
86             Args : NONE
87              
88             =cut
89              
90             sub next_aln {
91 3     3 1 9 my $self = shift;
92 3         4 my $entry;
93 3         6 my (%hash,$name,$str,@names,$seqname,$start,$end,$count,$seq);
94              
95 3         44 my $aln = Bio::SimpleAlign->new(-source => 'gcg' );
96              
97 3         25 while( $entry = $self->_readline) {
98 63 100       94 $entry =~ m{//} && last; # move to alignment section
99 60 100       150 $entry =~ /Name:\s+(\S+)/ && do { $name = $1;
  48         72  
100 48         77 $hash{$name} = ""; # blank line
101 48         85 push(@names,$name); # we need it ordered!
102             };
103             # otherwise - skip
104             }
105              
106             # alignment section
107              
108 3         8 while( $entry = $self->_readline) {
109 276 50       482 next if ( $entry =~ /^\s+(\d+)/ ) ;
110 276 100       644 $entry =~ /^\s*(\S+)\s+(.*)$/ && do {
111 240         315 $name = $1;
112 240         230 $str = $2;
113 240 50       329 if( ! exists $hash{$name} ) {
114 0         0 $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
115             }
116 240         787 $str =~ s/\s//g;
117 240         282 $str =~ s/~/-/g;
118 240         477 $hash{$name} .= $str;
119             };
120             }
121              
122 3 50       12 return if @names < 1;
123              
124             # now got this as a name - sequence hash. Let's make some sequences!
125              
126 3         6 for $name ( @names ) {
127 48 50       264 if( $name =~ m{(\S+)/(\d+)-(\d+)} ) {
128 48         103 $seqname = $1;
129 48         60 $start = $2;
130 48         57 $end = $3;
131             } else {
132 0         0 $seqname = $name;
133 0         0 $start = 1;
134 0         0 $str = $hash{$name};
135 0         0 $str =~ s/[^0-9A-Za-z$Bio::LocatableSeq::OTHER_SYMBOLS]//g;
136              
137 0         0 $end = length($str);
138             }
139              
140 48         126 $seq = Bio::LocatableSeq->new('-seq' => $hash{$name},
141             '-display_id' => $seqname,
142             '-start' => $start,
143             '-end' => $end,
144             '-alphabet' => $self->alphabet,
145             );
146 48         123 $aln->add_seq($seq);
147              
148             # If $end <= 0, we have either reached the end of
149             # file in <> or we have encountered some other error
150              
151             }
152 3 50       14 return $aln if $aln->num_sequences;
153 0         0 return;
154             }
155              
156              
157             =head2 write_aln
158              
159             Title : write_aln
160             Usage : $stream->write_aln(@aln)
161             Function: writes the $aln object into the stream in MSF format
162             Sequence type of the alignment is determined by the first sequence.
163             Returns : 1 for success and 0 for error
164             Args : Bio::Align::AlignI object
165              
166              
167             =cut
168              
169             sub write_aln {
170 2     2 1 9 my ($self,@aln) = @_;
171 2         4 my $msftag;
172             my $type;
173 2         5 my $count = 0;
174 2         3 my $maxname;
175 2         4 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index);
176 2         4 foreach my $aln (@aln) {
177 2 50 33     16 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
178 0         0 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
179 0         0 next;
180             }
181 2         96 $date = localtime(time);
182 2         5 $msftag = "MSF";
183 2         10 $type = $valid_type{$aln->get_seq_by_pos(1)->alphabet};
184 2         8 $maxname = $aln->maxdisplayname_length();
185 2         6 $length = $aln->length();
186 2         6 $name = $aln->id();
187 2 50       6 if( !defined $name ) {
188 0         0 $name = "Align";
189             }
190              
191 2         5 $self->_print (sprintf("\n%s MSF: %d Type: %s %s Check: 00 ..\n\n",
192             $name, $aln->num_sequences, $type, $date));
193              
194 2 100       10 my $seqCountFormat = "%".($maxname > 20 ? $maxname + 2: 22)."s%-27d%27d\n";
195 2 100       8 my $seqNameFormat = "%-".($maxname > 20 ? $maxname : 20)."s ";
196            
197 2         6 foreach $seq ( $aln->each_seq() ) {
198 22         44 $name = $aln->displayname($seq->get_nse());
199 22         31 $miss = $maxname - length ($name);
200 22         24 $miss += 2;
201 22         32 $pad = " " x $miss;
202              
203 22         41 $self->_print (sprintf(" Name: %s%sLen: %d Check: %d Weight: 1.00\n",$name,$pad,length $seq->seq(), Bio::SeqIO::gcg->GCG_checksum($seq)));
204              
205 22         42 $hash{$name} = $seq->seq();
206 22         39 push(@arr,$name);
207             }
208             # ok - heavy handed, but there you go.
209             #
210 2         9 $self->_print ("\n//\n\n\n");
211              
212 2         6 while( $count < $length ) {
213             # there is another block to go!
214 14         50 $self->_print (sprintf($seqCountFormat,' ',$count+1,$count+50));
215 14         15 foreach $name ( @arr ) {
216 134         316 $self->_print (sprintf($seqNameFormat,$name));
217              
218 134         116 $tempcount = $count;
219 134         121 $index = 0;
220 134   100     281 while( ($tempcount + 10 < $length) && ($index < 5) ) {
221              
222 636         1858 $self->_print (sprintf("%s ",substr($hash{$name},
223             $tempcount,10)));
224              
225 636         699 $tempcount += 10;
226 636         1296 $index++;
227             } #
228             # ok, could be the very last guy ;)
229             #
230 134 100       152 if( $index < 5) {
231             # space to print!
232             #
233 22         65 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
234 22         26 $tempcount += 10;
235             }
236 134         180 $self->_print ("\n");
237             }
238 14         27 $self->_print ("\n\n");
239 14         22 $count = $tempcount;
240             }
241             }
242 2 50 33     8 $self->flush if $self->_flush_on_write && defined $self->_fh;
243 2         13 return 1;
244             }
245              
246             1;