File Coverage

Bio/AlignIO/nexus.pm
Criterion Covered Total %
statement 172 188 91.4
branch 100 130 76.9
condition 25 41 60.9
subroutine 9 10 90.0
pod 3 3 100.0
total 309 372 83.0


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::AlignIO::nexus
3             #
4             # Copyright Heikki Lehvaslaiho
5             #
6              
7             =head1 NAME
8              
9             Bio::AlignIO::nexus - NEXUS format sequence input/output stream
10              
11             =head1 SYNOPSIS
12              
13             Do not use this module directly. Use it via the L class.
14              
15             use Bio::AlignIO;
16              
17             my $in = Bio::AlignIO->new(-format => 'nexus',
18             -file => 'aln.nexus');
19             while( my $aln = $in->next_aln ) {
20             # do something with the alignment
21             }
22              
23             =head1 DESCRIPTION
24              
25             This object can transform L objects to and from NEXUS
26             data blocks. See method documentation for supported NEXUS features.
27              
28             =head1 ACKNOWLEDGEMENTS
29              
30             Will Fisher has written an excellent standalone NEXUS format parser in
31             Perl, readnexus. A number of tricks were adapted from it.
32              
33             =head1 FEEDBACK
34              
35             =head2 Support
36              
37             Please direct usage questions or support issues to the mailing list:
38              
39             I
40              
41             rather than to the module maintainer directly. Many experienced and
42             reponsive experts will be able look at the problem and quickly
43             address it. Please include a thorough description of the problem
44             with code and data examples if at all possible.
45              
46             =head2 Reporting Bugs
47              
48             Report bugs to the Bioperl bug tracking system to help us keep track
49             the bugs and their resolution. Bug reports can be submitted via the
50             web:
51              
52             https://github.com/bioperl/bioperl-live/issues
53              
54             =head1 AUTHORS - Heikki Lehvaslaiho
55              
56             Email: heikki-at-bioperl-dot-org
57              
58             =head1 APPENDIX
59              
60             The rest of the documentation details each of the object
61             methods. Internal methods are usually preceded with a _
62              
63             =cut
64              
65             # Let the code begin...
66              
67             package Bio::AlignIO::nexus;
68 3     3   493 use vars qw(%valid_type);
  3         3  
  3         129  
69 3     3   10 use strict;
  3         2  
  3         51  
70 3     3   9 no strict "refs";
  3         3  
  3         63  
71              
72              
73 3     3   8 use base qw(Bio::AlignIO);
  3         3  
  3         447  
74              
75             BEGIN {
76 3     3   5 %valid_type = map {$_, 1} qw( dna rna protein standard );
  12         5301  
77             # standard throws error: inherited from Bio::PrimarySeq
78             }
79              
80             =head2 new
81              
82             Title : new
83             Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
84             Function: returns a new Bio::AlignIO object to handle clustalw files
85             Returns : Bio::AlignIO::clustalw object
86             Args : -verbose => verbosity setting (-1,0,1,2)
87             -file => name of file to read in or with ">" - writeout
88             -fh => alternative to -file param - provide a filehandle
89             to read from/write to
90             -format => type of Alignment Format to process or produce
91              
92             Customization of nexus flavor output
93              
94             -show_symbols => print the symbols="ATGC" in the data definition
95             (MrBayes does not like this)
96             boolean [default is 1]
97             -show_endblock => print an 'endblock;' at the end of the data
98             (MyBayes does not like this)
99             boolean [default is 1]
100              
101             =cut
102              
103             sub _initialize {
104 25     25   43 my ($self, @args) = @_;
105 25         63 $self->SUPER::_initialize(@args);
106 25         92 my ($show_symbols, $endblock) =
107             $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
108 25         51 my @names = qw(symbols endblock);
109 25         43 for my $v ( $show_symbols, $endblock ) {
110 50 50       83 $v = 1 unless defined $v; # default value is 1
111 50         49 my $n = shift @names;
112 50         82 $self->flag($n, $v);
113             }
114             }
115              
116              
117             =head2 next_aln
118              
119             Title : next_aln
120             Usage : $aln = $stream->next_aln()
121             Function: Returns the next alignment in the stream.
122              
123             Supports the following NEXUS format features:
124             - The file has to start with '#NEXUS'
125             - Reads in the name of the alignment from a comment
126             (anything after 'TITLE: ') .
127             - Sequence names can be given in a taxa block, too.
128             - If matchchar notation is used, converts
129             them back to sequence characters.
130             - Does character conversions specified in the
131             NEXUS equate command.
132             - Sequence names of type 'Homo sapiens' and
133             Homo_sapiens are treated identically.
134              
135             Returns : L object
136             Args :
137              
138              
139             =cut
140              
141              
142             sub next_aln {
143 22     22 1 472 my $self = shift;
144 22         25 my $entry;
145 22         28 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
146             $match, $gap, $missing, $equate, $interleave,
147             $name,$str,@names,$seqname,$start,$end,$count,$seq);
148 22         34 local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.';
149 22         21 local $Bio::LocatableSeq::GAP_SYMBOLS = '\-';
150 22         130 my $aln = Bio::SimpleAlign->new(-source => 'nexus');
151              
152             # file starts with '#NEXUS' but we allow white space only lines before it
153 22         68 $entry = $self->_readline;
154 22   33     121 $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
155              
156 22 50       34 return unless $entry;
157 22 50 33     132 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
158             unless ($entry && $entry =~ /^#NEXUS/i);
159              
160             # skip anything before either the taxa or data block
161             # but read in the optional title in a comment
162 22         48 while (defined($entry = $self->_readline)) {
163 64         82 local ($_) = $entry;
164 64 100       122 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
165 64 100 100     293 last if /^begin +data/i || /^begin +taxa/i;
166             }
167 22 100 50     68 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
168              
169             # data and taxa blocks
170 22         18 my $incomment;
171 22         48 while (defined ($entry = $self->_readline)) {
172 402         433 local ($_) = $entry;
173 402 100       1290 next if s/\[[^\]]+\]//g; # remove comments
174 220 100       541 if( s/\[[^\]]+$// ) {
    100          
    100          
175 6         7 $incomment = 1;
176             # skip line if it is now empty or contains only whitespace
177 6 50       27 next if /^\s*$/;
178             } elsif($incomment) {
179 39 100       58 if( s/^[^\]]*\]// ) {
180 6         8 $incomment = 0;
181             } else {
182 33         50 next;
183             }
184             } elsif( /taxlabels/i ) {
185             # doesn't deal with taxlabels adequately and can mess things up!
186             # @names = $self->_read_taxlabels;
187             } else {
188              
189 162 100       299 /ntax\s*=\s*(\d+)/i and $seqcount = $1;
190 162 100       285 /nchar\s*=\s*(\d+)/i and $residuecount = $1;
191 162 100       221 /matchchar\s*=\s*(.)/i and $match = $1;
192 162 100       249 /gap\s*=\s*(.)/i and $gap = $1;
193 162 100       255 /missing\s*=\s*(.)/i and $missing = $1;
194 162 50       223 /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
195 162 100       235 /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
196 162 100       209 /interleave/i and $interleave = 1 ;
197 162 100       402 last if /matrix/io;
198             }
199             }
200 22 50       44 $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
201             unless $alphabet;
202             $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
203 22 50       61 unless $valid_type{$alphabet};
204 22 50 33     96 $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
205             if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
206 22 50 66     91 $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
207             if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
208              
209 22         74 $aln->gap_char($gap);
210 22         51 $aln->missing_char($missing);
211              
212             #
213             # if data is not right after the matrix line
214             # read the empty lines out
215             #
216 22         45 while ($entry = $self->_readline) {
217 25 100       81 unless ($entry =~ /^\s+$/) {
218 22         58 $self->_pushback($entry);
219 22         23 last;
220             }
221             }
222              
223             #
224             # matrix command
225             #
226             # first alignment section
227 22 50       46 if (@names == 0) { # taxa block did not exist
228 22         32 while ($entry = $self->_readline) {
229 467         487 local ($_) = $entry;
230 467 100       1173 if( s/\[[^\]]+\]//g ) { #] remove comments
231 205 100       371 next if /^\s*$/;
232             # skip line if it is now empty or contains only whitespace
233             }
234 459 100 100     1005 if ($interleave && defined$count && ($count <= $seqcount)) {
      66        
235 78 100       159 /^\s+$/ and last;
236             } else {
237 381 100       671 /^\s+$/ and next;
238             }
239 448 100       719 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
240             #/^\s*;\s*$/ and last;
241 442 50       1262 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
242             # get single and double quoted names, or all the first
243             # nonwhite word as the name, and remained is seq
244             #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
245 442   66     1008 $name = ($2 || $3);
246 442 100       732 if ($4) {
247             # seq is on same line as name
248             # this is the usual NEXUS format
249 432         425 $str = $4;
250             } else {
251             # otherwise get seq from following lines. No comments allowed
252             # a less common matrix format, usually used for very long seqs
253 10         12 $str='';
254 10         22 while (local ($_) = $self->_readline) {
255 1860         1682 my $str_tmp = $_;
256 1860         3664 $str_tmp =~ s/[\s;]//g;
257 1860         1508 $str .= $str_tmp;
258 1860 100       3807 last if length$str == $residuecount;
259             }
260             }
261 442         322 $name =~ s/ /_/g;
262 442         419 push @names, $name;
263              
264 442         4022 $str =~ s/[\s;]//g;
265 442         345 $count = @names;
266 442         705 $hash{$count} = $str;
267             }
268 442 50       572 $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
269 442 100       1196 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
270             }
271             }
272              
273             # interleaved sections
274 22         27 $count = 0;
275 22 100       41 if ( $interleave ) { # only read next section if file is interleaved
276 7         14 while( $entry = $self->_readline) {
277 1003         1023 local ($_) = $entry;
278 1003 100       2700 if( s/\[[^\]]+\]//g ) { #] remove comments
279 826 100       1417 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
280             }
281 945 100       1330 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
282 939 100       1388 $count = 0, next if $entry =~ /^\s*$/;
283 872 50       1913 if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
284 872         1092 $str = $4;
285 872         2558 $str =~ s/[\s;]//g;
286 872         628 $count++;
287 872         1205 $hash{$count} .= $str;
288             };
289 872 50       1071 $self->throw("Not a valid interleaved NEXUS file!
290             seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
291 872 100       2036 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
292             }
293             }
294            
295 22 50       47 return if @names < 1;
296            
297             # sequence creation
298 22         22 $count = 0;
299 22         37 foreach $name ( @names ) {
300 442         365 $count++;
301 442 50       806 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
302 0         0 ($seqname,$start,$end) = ($1,$2,$3);
303             } else {
304 442         998 ($seqname,$start,$str) = ($name,1,$hash{$count});
305 442         7897 $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
306 442         521 $end = length($str);
307             }
308            
309             # consistency test
310             $self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count}))
311 442 50       800 unless CORE::length($hash{$count}) == $residuecount;
312            
313 442         1072 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
314             '-display_id' => $seqname,
315             '-start' => $start,
316             '-end' => $end,
317             '-alphabet' => $alphabet
318             );
319 442         861 $aln->add_seq($seq);
320             }
321              
322             # if matchchar is used
323 22 100       57 $aln->unmatch($match) if $match;
324              
325             # if equate ( e.g. equate="T=C G=A") is used
326 22 50       38 if ($equate) {
327 0         0 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
328             }
329              
330 22   100     105 while (defined $entry &&
331             $entry !~ /endblock/i) {
332 453         615 $entry = $self->_readline;
333             }
334              
335 22 50       73 return $aln if $aln->num_sequences;
336 0         0 return;
337             }
338              
339             sub _read_taxlabels {
340 0     0   0 my ($self) = @_;
341 0         0 my ($name, @names);
342 0         0 while (my $entry = $self->_readline) {
343 0 0       0 last if $entry =~ m/^\s*(END)?;/i;
344 0 0       0 if( $entry =~ m/\s*(\S+)\s+/ ) {
345 0         0 ($name) = ($1);
346 0         0 $name =~ s/\[[^\[]+\]//g;
347 0         0 $name =~ s/\W/_/g;
348 0         0 push @names, $name;
349             }
350             }
351 0         0 return @names;
352             }
353              
354             =head2 write_aln
355              
356             Title : write_aln
357             Usage : $stream->write_aln(@aln)
358             Function: Writes the $aln object into the stream in interleaved NEXUS
359             format. Everything is written into a data block.
360             SimpleAlign methods match_char, missing_char and gap_char must be set
361             if you want to see them in the output.
362             Returns : 1 for success and 0 for error
363             Args : L object
364              
365             =cut
366              
367             sub write_aln {
368 2     2 1 7 my ($self,@aln) = @_;
369 2         3 my $count = 0;
370 2         2 my $wrapped = 0;
371 2         3 my $maxname;
372 2         3 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
373 2         5 my ($match, $missing, $gap,$symbols) = ('', '', '','');
374              
375 2         5 foreach my $aln (@aln) {
376 2 50 33     14 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
377 0         0 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
378 0         0 next;
379             }
380 2 50       4 $self->throw("All sequences in the alignment must be the same length")
381             unless $aln->is_flush($self->verbose);
382              
383 2         29 $length = $aln->length();
384              
385 2         7 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
386             $aln->id, $aln->num_sequences, $length));
387 2 50       8 $match = "match=". $aln->match_char if $aln->match_char;
388 2 100       6 $missing = "missing=". $aln->missing_char if $aln->missing_char;
389 2 50       6 $gap = "gap=". $aln->gap_char if $aln->gap_char;
390              
391 2 50 33     6 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
392             if( $self->flag('symbols') && $aln->symbol_chars);
393 2         7 $self->_print
394             (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
395             $aln->get_seq_by_pos(1)->alphabet, $match,
396             $missing, $gap, $symbols));
397              
398             # account for single quotes round names
399 2         23 my $indent = $aln->maxdisplayname_length+2;
400              
401 2         8 $aln->set_displayname_flat();
402 2         6 foreach $seq ( $aln->each_seq() ) {
403 10         14 my $nmid = $aln->displayname($seq->get_nse());
404 10 100       22 if( $nmid =~ /[^\w\d\.]/ ) {
405             # put name in single quotes incase it contains any of
406             # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
407             # allowed in PAUP* and possible other software
408              
409 1         3 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
410             } else {
411 9         22 $name = sprintf("%-${indent}s", $nmid);
412             }
413 10         19 $hash{$name} = $seq->seq;
414 10         14 push(@arr,$name);
415             }
416              
417 2         8 while( $count < $length ) {
418             # there is another block to go!
419 10         10 foreach $name ( @arr ) {
420 58         38 my $dispname = $name;
421             # $dispname = '' if $wrapped;
422 58         106 $self->_print (sprintf("%${indent}s ",$dispname));
423 58         42 $tempcount = $count;
424 58         31 $index = 0;
425 58   100     154 while( ($tempcount + 10 < $length) && ($index < 5) ) {
426 268         607 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
427 268         204 $tempcount += 10;
428 268         649 $index++;
429             }
430             # last
431 58 100       66 if( $index < 5) {
432             # space to print!
433 10         48 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
434 10         9 $tempcount += 10;
435             }
436 58         64 $self->_print ("\n");
437             }
438 10         12 $self->_print ("\n\n");
439 10         8 $count = $tempcount;
440 10         14 $wrapped = 1;
441             }
442 2 50       5 if( $self->flag('endblock') ) {
443 2         4 $self->_print (";\n\nendblock;\n");
444             } else {
445 0         0 $self->_print (";\n\nend;\n");
446             }
447             }
448 2 50 33     7 $self->flush if $self->_flush_on_write && defined $self->_fh;
449 2         12 return 1;
450             }
451              
452             =head2 flag
453              
454             Title : flag
455             Usage : $obj->flag($name,$value)
456             Function: Get/Set a flag value
457             Returns : value of flag (a scalar)
458             Args : on set, new value (a scalar or undef, optional)
459              
460              
461             =cut
462              
463             sub flag{
464 54     54 1 56 my ($self,$name,$val) = @_;
465 54 100       192 return $self->{'flag'}->{$name} = $val if defined $val;
466 4         17 return $self->{'flag'}->{$name};
467             }
468              
469             1;