File Coverage

Bio/AlignIO/nexus.pm
Criterion Covered Total %
statement 172 188 91.4
branch 100 130 76.9
condition 25 41 60.9
subroutine 9 10 90.0
pod 3 3 100.0
total 309 372 83.0


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::AlignIO::nexus
3             #
4             # Copyright Heikki Lehvaslaiho
5             #
6              
7             =head1 NAME
8              
9             Bio::AlignIO::nexus - NEXUS format sequence input/output stream
10              
11             =head1 SYNOPSIS
12              
13             Do not use this module directly. Use it via the L class.
14              
15             use Bio::AlignIO;
16              
17             my $in = Bio::AlignIO->new(-format => 'nexus',
18             -file => 'aln.nexus');
19             while( my $aln = $in->next_aln ) {
20             # do something with the alignment
21             }
22              
23             =head1 DESCRIPTION
24              
25             This object can transform L objects to and from NEXUS
26             data blocks. See method documentation for supported NEXUS features.
27              
28             =head1 ACKNOWLEDGEMENTS
29              
30             Will Fisher has written an excellent standalone NEXUS format parser in
31             Perl, readnexus. A number of tricks were adapted from it.
32              
33             =head1 FEEDBACK
34              
35             =head2 Support
36              
37             Please direct usage questions or support issues to the mailing list:
38              
39             I
40              
41             rather than to the module maintainer directly. Many experienced and
42             reponsive experts will be able look at the problem and quickly
43             address it. Please include a thorough description of the problem
44             with code and data examples if at all possible.
45              
46             =head2 Reporting Bugs
47              
48             Report bugs to the Bioperl bug tracking system to help us keep track
49             the bugs and their resolution. Bug reports can be submitted via the
50             web:
51              
52             https://github.com/bioperl/bioperl-live/issues
53              
54             =head1 AUTHORS - Heikki Lehvaslaiho
55              
56             Email: heikki-at-bioperl-dot-org
57              
58             =head1 APPENDIX
59              
60             The rest of the documentation details each of the object
61             methods. Internal methods are usually preceded with a _
62              
63             =cut
64              
65             # Let the code begin...
66              
67             package Bio::AlignIO::nexus;
68 3     3   403 use vars qw(%valid_type);
  3         4  
  3         123  
69 3     3   10 use strict;
  3         4  
  3         59  
70 3     3   7 no strict "refs";
  3         3  
  3         73  
71              
72              
73 3     3   8 use base qw(Bio::AlignIO);
  3         4  
  3         517  
74              
75             BEGIN {
76 3     3   7 %valid_type = map {$_, 1} qw( dna rna protein standard );
  12         5334  
77             # standard throws error: inherited from Bio::PrimarySeq
78             }
79              
80             =head2 new
81              
82             Title : new
83             Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
84             Function: returns a new Bio::AlignIO object to handle clustalw files
85             Returns : Bio::AlignIO::clustalw object
86             Args : -verbose => verbosity setting (-1,0,1,2)
87             -file => name of file to read in or with ">" - writeout
88             -fh => alternative to -file param - provide a filehandle
89             to read from/write to
90             -format => type of Alignment Format to process or produce
91              
92             Customization of nexus flavor output
93              
94             -show_symbols => print the symbols="ATGC" in the data definition
95             (MrBayes does not like this)
96             boolean [default is 1]
97             -show_endblock => print an 'endblock;' at the end of the data
98             (MyBayes does not like this)
99             boolean [default is 1]
100              
101             =cut
102              
103             sub _initialize {
104 25     25   43 my ($self, @args) = @_;
105 25         77 $self->SUPER::_initialize(@args);
106 25         89 my ($show_symbols, $endblock) =
107             $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
108 25         53 my @names = qw(symbols endblock);
109 25         50 for my $v ( $show_symbols, $endblock ) {
110 50 50       85 $v = 1 unless defined $v; # default value is 1
111 50         53 my $n = shift @names;
112 50         88 $self->flag($n, $v);
113             }
114             }
115              
116              
117             =head2 next_aln
118              
119             Title : next_aln
120             Usage : $aln = $stream->next_aln()
121             Function: Returns the next alignment in the stream.
122              
123             Supports the following NEXUS format features:
124             - The file has to start with '#NEXUS'
125             - Reads in the name of the alignment from a comment
126             (anything after 'TITLE: ') .
127             - Sequence names can be given in a taxa block, too.
128             - If matchchar notation is used, converts
129             them back to sequence characters.
130             - Does character conversions specified in the
131             NEXUS equate command.
132             - Sequence names of type 'Homo sapiens' and
133             Homo_sapiens are treated identically.
134              
135             Returns : L object
136             Args :
137              
138              
139             =cut
140              
141              
142             sub next_aln {
143 22     22 1 784 my $self = shift;
144 22         24 my $entry;
145 22         25 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
146             $match, $gap, $missing, $equate, $interleave,
147             $name,$str,@names,$seqname,$start,$end,$count,$seq);
148 22         35 local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.';
149 22         28 local $Bio::LocatableSeq::GAP_SYMBOLS = '\-';
150 22         154 my $aln = Bio::SimpleAlign->new(-source => 'nexus');
151              
152             # file starts with '#NEXUS' but we allow white space only lines before it
153 22         78 $entry = $self->_readline;
154 22   33     129 $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
155              
156 22 50       40 return unless $entry;
157 22 50 33     118 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
158             unless ($entry && $entry =~ /^#NEXUS/i);
159              
160             # skip anything before either the taxa or data block
161             # but read in the optional title in a comment
162 22         42 while (defined($entry = $self->_readline)) {
163 64         72 local ($_) = $entry;
164 64 100       118 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
165 64 100 100     292 last if /^begin +data/i || /^begin +taxa/i;
166             }
167 22 100 50     61 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
168              
169             # data and taxa blocks
170 22         21 my $incomment;
171 22         45 while (defined ($entry = $self->_readline)) {
172 402         410 local ($_) = $entry;
173 402 100       1232 next if s/\[[^\]]+\]//g; # remove comments
174 220 100       543 if( s/\[[^\]]+$// ) {
    100          
    100          
175 6         6 $incomment = 1;
176             # skip line if it is now empty or contains only whitespace
177 6 50       25 next if /^\s*$/;
178             } elsif($incomment) {
179 39 100       52 if( s/^[^\]]*\]// ) {
180 6         12 $incomment = 0;
181             } else {
182 33         45 next;
183             }
184             } elsif( /taxlabels/i ) {
185             # doesn't deal with taxlabels adequately and can mess things up!
186             # @names = $self->_read_taxlabels;
187             } else {
188              
189 162 100       304 /ntax\s*=\s*(\d+)/i and $seqcount = $1;
190 162 100       266 /nchar\s*=\s*(\d+)/i and $residuecount = $1;
191 162 100       230 /matchchar\s*=\s*(.)/i and $match = $1;
192 162 100       252 /gap\s*=\s*(.)/i and $gap = $1;
193 162 100       252 /missing\s*=\s*(.)/i and $missing = $1;
194 162 50       203 /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
195 162 100       245 /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
196 162 100       214 /interleave/i and $interleave = 1 ;
197 162 100       385 last if /matrix/io;
198             }
199             }
200 22 50       42 $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
201             unless $alphabet;
202             $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
203 22 50       65 unless $valid_type{$alphabet};
204 22 50 33     100 $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
205             if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
206 22 50 66     83 $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
207             if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
208              
209 22         77 $aln->gap_char($gap);
210 22         52 $aln->missing_char($missing);
211              
212             #
213             # if data is not right after the matrix line
214             # read the empty lines out
215             #
216 22         42 while ($entry = $self->_readline) {
217 25 100       96 unless ($entry =~ /^\s+$/) {
218 22         61 $self->_pushback($entry);
219 22         24 last;
220             }
221             }
222              
223             #
224             # matrix command
225             #
226             # first alignment section
227 22 50       54 if (@names == 0) { # taxa block did not exist
228 22         42 while ($entry = $self->_readline) {
229 467         485 local ($_) = $entry;
230 467 100       1171 if( s/\[[^\]]+\]//g ) { #] remove comments
231 205 100       361 next if /^\s*$/;
232             # skip line if it is now empty or contains only whitespace
233             }
234 459 100 100     1007 if ($interleave && defined$count && ($count <= $seqcount)) {
      66        
235 78 100       172 /^\s+$/ and last;
236             } else {
237 381 100       671 /^\s+$/ and next;
238             }
239 448 100       752 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
240             #/^\s*;\s*$/ and last;
241 442 50       1247 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
242             # get single and double quoted names, or all the first
243             # nonwhite word as the name, and remained is seq
244             #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
245 442   66     963 $name = ($2 || $3);
246 442 100       702 if ($4) {
247             # seq is on same line as name
248             # this is the usual NEXUS format
249 432         434 $str = $4;
250             } else {
251             # otherwise get seq from following lines. No comments allowed
252             # a less common matrix format, usually used for very long seqs
253 10         11 $str='';
254 10         17 while (local ($_) = $self->_readline) {
255 1860         1345 my $str_tmp = $_;
256 1860         3466 $str_tmp =~ s/[\s;]//g;
257 1860         1375 $str .= $str_tmp;
258 1860 100       3595 last if length$str == $residuecount;
259             }
260             }
261 442         347 $name =~ s/ /_/g;
262 442         380 push @names, $name;
263              
264 442         3883 $str =~ s/[\s;]//g;
265 442         357 $count = @names;
266 442         703 $hash{$count} = $str;
267             }
268 442 50       561 $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
269 442 100       1153 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
270             }
271             }
272              
273             # interleaved sections
274 22         35 $count = 0;
275 22 100       67 if ( $interleave ) { # only read next section if file is interleaved
276 7         17 while( $entry = $self->_readline) {
277 1003         975 local ($_) = $entry;
278 1003 100       2691 if( s/\[[^\]]+\]//g ) { #] remove comments
279 826 100       1453 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
280             }
281 945 100       1251 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
282 939 100       1358 $count = 0, next if $entry =~ /^\s*$/;
283 872 50       1939 if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
284 872         1069 $str = $4;
285 872         2483 $str =~ s/[\s;]//g;
286 872         645 $count++;
287 872         1154 $hash{$count} .= $str;
288             };
289 872 50       1078 $self->throw("Not a valid interleaved NEXUS file!
290             seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
291 872 100       2004 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
292             }
293             }
294            
295 22 50       55 return if @names < 1;
296            
297             # sequence creation
298 22         34 $count = 0;
299 22         44 foreach $name ( @names ) {
300 442         360 $count++;
301 442 50       760 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
302 0         0 ($seqname,$start,$end) = ($1,$2,$3);
303             } else {
304 442         1006 ($seqname,$start,$str) = ($name,1,$hash{$count});
305 442         7458 $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
306 442         463 $end = length($str);
307             }
308            
309             # consistency test
310             $self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count}))
311 442 50       726 unless CORE::length($hash{$count}) == $residuecount;
312            
313 442         1008 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
314             '-display_id' => $seqname,
315             '-start' => $start,
316             '-end' => $end,
317             '-alphabet' => $alphabet
318             );
319 442         909 $aln->add_seq($seq);
320             }
321              
322             # if matchchar is used
323 22 100       54 $aln->unmatch($match) if $match;
324              
325             # if equate ( e.g. equate="T=C G=A") is used
326 22 50       43 if ($equate) {
327 0         0 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
328             }
329              
330 22   100     111 while (defined $entry &&
331             $entry !~ /endblock/i) {
332 453         638 $entry = $self->_readline;
333             }
334              
335 22 50       81 return $aln if $aln->num_sequences;
336 0         0 return;
337             }
338              
339             sub _read_taxlabels {
340 0     0   0 my ($self) = @_;
341 0         0 my ($name, @names);
342 0         0 while (my $entry = $self->_readline) {
343 0 0       0 last if $entry =~ m/^\s*(END)?;/i;
344 0 0       0 if( $entry =~ m/\s*(\S+)\s+/ ) {
345 0         0 ($name) = ($1);
346 0         0 $name =~ s/\[[^\[]+\]//g;
347 0         0 $name =~ s/\W/_/g;
348 0         0 push @names, $name;
349             }
350             }
351 0         0 return @names;
352             }
353              
354             =head2 write_aln
355              
356             Title : write_aln
357             Usage : $stream->write_aln(@aln)
358             Function: Writes the $aln object into the stream in interleaved NEXUS
359             format. Everything is written into a data block.
360             SimpleAlign methods match_char, missing_char and gap_char must be set
361             if you want to see them in the output.
362             Returns : 1 for success and 0 for error
363             Args : L object
364              
365             =cut
366              
367             sub write_aln {
368 2     2 1 9 my ($self,@aln) = @_;
369 2         5 my $count = 0;
370 2         4 my $wrapped = 0;
371 2         2 my $maxname;
372 2         4 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
373 2         5 my ($match, $missing, $gap,$symbols) = ('', '', '','');
374              
375 2         4 foreach my $aln (@aln) {
376 2 50 33     16 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
377 0         0 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
378 0         0 next;
379             }
380 2 50       8 $self->throw("All sequences in the alignment must be the same length")
381             unless $aln->is_flush($self->verbose);
382              
383 2         9 $length = $aln->length();
384              
385 2         11 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
386             $aln->id, $aln->num_sequences, $length));
387 2 50       9 $match = "match=". $aln->match_char if $aln->match_char;
388 2 100       6 $missing = "missing=". $aln->missing_char if $aln->missing_char;
389 2 50       9 $gap = "gap=". $aln->gap_char if $aln->gap_char;
390              
391 2 50 33     11 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
392             if( $self->flag('symbols') && $aln->symbol_chars);
393 2         12 $self->_print
394             (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
395             $aln->get_seq_by_pos(1)->alphabet, $match,
396             $missing, $gap, $symbols));
397              
398             # account for single quotes round names
399 2         10 my $indent = $aln->maxdisplayname_length+2;
400              
401 2         9 $aln->set_displayname_flat();
402 2         5 foreach $seq ( $aln->each_seq() ) {
403 10         15 my $nmid = $aln->displayname($seq->get_nse());
404 10 100       30 if( $nmid =~ /[^\w\d\.]/ ) {
405             # put name in single quotes incase it contains any of
406             # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
407             # allowed in PAUP* and possible other software
408              
409 1         5 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
410             } else {
411 9         26 $name = sprintf("%-${indent}s", $nmid);
412             }
413 10         16 $hash{$name} = $seq->seq;
414 10         17 push(@arr,$name);
415             }
416              
417 2         9 while( $count < $length ) {
418             # there is another block to go!
419 10         9 foreach $name ( @arr ) {
420 58         42 my $dispname = $name;
421             # $dispname = '' if $wrapped;
422 58         112 $self->_print (sprintf("%${indent}s ",$dispname));
423 58         43 $tempcount = $count;
424 58         30 $index = 0;
425 58   100     164 while( ($tempcount + 10 < $length) && ($index < 5) ) {
426 268         615 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
427 268         227 $tempcount += 10;
428 268         678 $index++;
429             }
430             # last
431 58 100       71 if( $index < 5) {
432             # space to print!
433 10         29 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
434 10         14 $tempcount += 10;
435             }
436 58         63 $self->_print ("\n");
437             }
438 10         17 $self->_print ("\n\n");
439 10         6 $count = $tempcount;
440 10         20 $wrapped = 1;
441             }
442 2 50       8 if( $self->flag('endblock') ) {
443 2         8 $self->_print (";\n\nendblock;\n");
444             } else {
445 0         0 $self->_print (";\n\nend;\n");
446             }
447             }
448 2 50 33     12 $self->flush if $self->_flush_on_write && defined $self->_fh;
449 2         20 return 1;
450             }
451              
452             =head2 flag
453              
454             Title : flag
455             Usage : $obj->flag($name,$value)
456             Function: Get/Set a flag value
457             Returns : value of flag (a scalar)
458             Args : on set, new value (a scalar or undef, optional)
459              
460              
461             =cut
462              
463             sub flag{
464 54     54 1 65 my ($self,$name,$val) = @_;
465 54 100       195 return $self->{'flag'}->{$name} = $val if defined $val;
466 4         25 return $self->{'flag'}->{$name};
467             }
468              
469             1;