File Coverage

Bio/AlignIO/nexus.pm
Criterion Covered Total %
statement 172 188 91.4
branch 100 130 76.9
condition 25 41 60.9
subroutine 9 10 90.0
pod 3 3 100.0
total 309 372 83.0


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::AlignIO::nexus
3             #
4             # Copyright Heikki Lehvaslaiho
5             #
6              
7             =head1 NAME
8              
9             Bio::AlignIO::nexus - NEXUS format sequence input/output stream
10              
11             =head1 SYNOPSIS
12              
13             Do not use this module directly. Use it via the L class.
14              
15             use Bio::AlignIO;
16              
17             my $in = Bio::AlignIO->new(-format => 'nexus',
18             -file => 'aln.nexus');
19             while( my $aln = $in->next_aln ) {
20             # do something with the alignment
21             }
22              
23             =head1 DESCRIPTION
24              
25             This object can transform L objects to and from NEXUS
26             data blocks. See method documentation for supported NEXUS features.
27              
28             =head1 ACKNOWLEDGEMENTS
29              
30             Will Fisher has written an excellent standalone NEXUS format parser in
31             Perl, readnexus. A number of tricks were adapted from it.
32              
33             =head1 FEEDBACK
34              
35             =head2 Support
36              
37             Please direct usage questions or support issues to the mailing list:
38              
39             I
40              
41             rather than to the module maintainer directly. Many experienced and
42             reponsive experts will be able look at the problem and quickly
43             address it. Please include a thorough description of the problem
44             with code and data examples if at all possible.
45              
46             =head2 Reporting Bugs
47              
48             Report bugs to the Bioperl bug tracking system to help us keep track
49             the bugs and their resolution. Bug reports can be submitted via the
50             web:
51              
52             https://github.com/bioperl/bioperl-live/issues
53              
54             =head1 AUTHORS - Heikki Lehvaslaiho
55              
56             Email: heikki-at-bioperl-dot-org
57              
58             =head1 APPENDIX
59              
60             The rest of the documentation details each of the object
61             methods. Internal methods are usually preceded with a _
62              
63             =cut
64              
65             # Let the code begin...
66              
67             package Bio::AlignIO::nexus;
68 3     3   457 use vars qw(%valid_type);
  3         6  
  3         169  
69 3     3   21 use strict;
  3         8  
  3         80  
70 3     3   14 no strict "refs";
  3         6  
  3         90  
71              
72              
73 3     3   16 use base qw(Bio::AlignIO);
  3         6  
  3         578  
74              
75             BEGIN {
76 3     3   12 %valid_type = map {$_, 1} qw( dna rna protein standard );
  12         5888  
77             # standard throws error: inherited from Bio::PrimarySeq
78             }
79              
80             =head2 new
81              
82             Title : new
83             Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename');
84             Function: returns a new Bio::AlignIO object to handle clustalw files
85             Returns : Bio::AlignIO::clustalw object
86             Args : -verbose => verbosity setting (-1,0,1,2)
87             -file => name of file to read in or with ">" - writeout
88             -fh => alternative to -file param - provide a filehandle
89             to read from/write to
90             -format => type of Alignment Format to process or produce
91              
92             Customization of nexus flavor output
93              
94             -show_symbols => print the symbols="ATGC" in the data definition
95             (MrBayes does not like this)
96             boolean [default is 1]
97             -show_endblock => print an 'endblock;' at the end of the data
98             (MyBayes does not like this)
99             boolean [default is 1]
100              
101             =cut
102              
103             sub _initialize {
104 25     25   81 my ($self, @args) = @_;
105 25         118 $self->SUPER::_initialize(@args);
106 25         99 my ($show_symbols, $endblock) =
107             $self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args);
108 25         69 my @names = qw(symbols endblock);
109 25         63 for my $v ( $show_symbols, $endblock ) {
110 50 50       116 $v = 1 unless defined $v; # default value is 1
111 50         82 my $n = shift @names;
112 50         121 $self->flag($n, $v);
113             }
114             }
115              
116              
117             =head2 next_aln
118              
119             Title : next_aln
120             Usage : $aln = $stream->next_aln()
121             Function: Returns the next alignment in the stream.
122              
123             Supports the following NEXUS format features:
124             - The file has to start with '#NEXUS'
125             - Reads in the name of the alignment from a comment
126             (anything after 'TITLE: ') .
127             - Sequence names can be given in a taxa block, too.
128             - If matchchar notation is used, converts
129             them back to sequence characters.
130             - Does character conversions specified in the
131             NEXUS equate command.
132             - Sequence names of type 'Homo sapiens' and
133             Homo_sapiens are treated identically.
134              
135             Returns : L object
136             Args :
137              
138              
139             =cut
140              
141              
142             sub next_aln {
143 22     22 1 7051 my $self = shift;
144 22         40 my $entry;
145 22         64 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
146             $match, $gap, $missing, $equate, $interleave,
147             $name,$str,@names,$seqname,$start,$end,$count,$seq);
148 22         56 local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.';
149 22         44 local $Bio::LocatableSeq::GAP_SYMBOLS = '\-';
150 22         161 my $aln = Bio::SimpleAlign->new(-source => 'nexus');
151              
152             # file starts with '#NEXUS' but we allow white space only lines before it
153 22         108 $entry = $self->_readline;
154 22   33     154 $entry = $self->_readline while defined $entry && $entry =~ /^\s+$/;
155              
156 22 50       56 return unless $entry;
157 22 50 33     148 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
158             unless ($entry && $entry =~ /^#NEXUS/i);
159              
160             # skip anything before either the taxa or data block
161             # but read in the optional title in a comment
162 22         73 while (defined($entry = $self->_readline)) {
163 64         102 local ($_) = $entry;
164 64 100       146 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
165 64 100 100     348 last if /^begin +data/i || /^begin +taxa/i;
166             }
167 22 100 50     97 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
168              
169             # data and taxa blocks
170 22         31 my $incomment;
171 22         50 while (defined ($entry = $self->_readline)) {
172 402         570 local ($_) = $entry;
173 402 100       1444 next if s/\[[^\]]+\]//g; # remove comments
174 220 100       592 if( s/\[[^\]]+$// ) {
    100          
    100          
175 6         9 $incomment = 1;
176             # skip line if it is now empty or contains only whitespace
177 6 50       36 next if /^\s*$/;
178             } elsif($incomment) {
179 39 100       68 if( s/^[^\]]*\]// ) {
180 6         13 $incomment = 0;
181             } else {
182 33         55 next;
183             }
184             } elsif( /taxlabels/i ) {
185             # doesn't deal with taxlabels adequately and can mess things up!
186             # @names = $self->_read_taxlabels;
187             } else {
188              
189 162 100       350 /ntax\s*=\s*(\d+)/i and $seqcount = $1;
190 162 100       325 /nchar\s*=\s*(\d+)/i and $residuecount = $1;
191 162 100       263 /matchchar\s*=\s*(.)/i and $match = $1;
192 162 100       291 /gap\s*=\s*(.)/i and $gap = $1;
193 162 100       299 /missing\s*=\s*(.)/i and $missing = $1;
194 162 50       271 /equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
195 162 100       313 /datatype\s*=\s*(\w+)/i and $alphabet = lc $1;
196 162 100       249 /interleave/i and $interleave = 1 ;
197 162 100       721 last if /matrix/io;
198             }
199             }
200 22 50       61 $self->throw("Not a valid NEXUS sequence file. Datatype not specified.")
201             unless $alphabet;
202             $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
203 22 50       98 unless $valid_type{$alphabet};
204 22 50 33     100 $self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^")
205             if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
206 22 50 66     123 $self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^")
207             if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/;
208              
209 22         106 $aln->gap_char($gap);
210 22         83 $aln->missing_char($missing);
211              
212             #
213             # if data is not right after the matrix line
214             # read the empty lines out
215             #
216 22         52 while ($entry = $self->_readline) {
217 25 100       108 unless ($entry =~ /^\s+$/) {
218 22         103 $self->_pushback($entry);
219 22         35 last;
220             }
221             }
222              
223             #
224             # matrix command
225             #
226             # first alignment section
227 22 50       65 if (@names == 0) { # taxa block did not exist
228 22         56 while ($entry = $self->_readline) {
229 467         697 local ($_) = $entry;
230 467 100       1359 if( s/\[[^\]]+\]//g ) { #] remove comments
231 205 100       464 next if /^\s*$/;
232             # skip line if it is now empty or contains only whitespace
233             }
234 459 100 100     988 if ($interleave && defined$count && ($count <= $seqcount)) {
      66        
235 78 100       211 /^\s+$/ and last;
236             } else {
237 381 100       776 /^\s+$/ and next;
238             }
239 448 100       771 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
240             #/^\s*;\s*$/ and last;
241 442 50       1599 if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) {
242             # get single and double quoted names, or all the first
243             # nonwhite word as the name, and remained is seq
244             #if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #'
245 442   66     1458 $name = ($2 || $3);
246 442 100       850 if ($4) {
247             # seq is on same line as name
248             # this is the usual NEXUS format
249 432         549 $str = $4;
250             } else {
251             # otherwise get seq from following lines. No comments allowed
252             # a less common matrix format, usually used for very long seqs
253 10         50 $str='';
254 10         28 while (local ($_) = $self->_readline) {
255 1860         2345 my $str_tmp = $_;
256 1860         5955 $str_tmp =~ s/[\s;]//g;
257 1860         2612 $str .= $str_tmp;
258 1860 100       4371 last if length$str == $residuecount;
259             }
260             }
261 442         558 $name =~ s/ /_/g;
262 442         582 push @names, $name;
263              
264 442         4024 $str =~ s/[\s;]//g;
265 442         569 $count = @names;
266 442         931 $hash{$count} = $str;
267             }
268 442 50       697 $self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
269 442 100       1210 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
270             }
271             }
272              
273             # interleaved sections
274 22         39 $count = 0;
275 22 100       59 if ( $interleave ) { # only read next section if file is interleaved
276 7         21 while( $entry = $self->_readline) {
277 1003         1676 local ($_) = $entry;
278 1003 100       4306 if( s/\[[^\]]+\]//g ) { #] remove comments
279 826 100       2296 next if /^\s*$/; # skip line if it is now empty or contains only whitespace
280             }
281 945 100       1781 /^\s*;/ and last; # stop if colon at end of matrix is on it's own line
282 939 100       2053 $count = 0, next if $entry =~ /^\s*$/;
283 872 50       2738 if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) {
284 872         1830 $str = $4;
285 872         3909 $str =~ s/[\s;]//g;
286 872         1292 $count++;
287 872         2100 $hash{$count} .= $str;
288             };
289 872 50       1535 $self->throw("Not a valid interleaved NEXUS file!
290             seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
291 872 100       2743 /;/ and last; # stop if colon at end of matrix is on the same line as the last seq
292             }
293             }
294            
295 22 50       64 return if @names < 1;
296            
297             # sequence creation
298 22         52 $count = 0;
299 22         50 foreach $name ( @names ) {
300 442         555 $count++;
301 442 50       858 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
302 0         0 ($seqname,$start,$end) = ($1,$2,$3);
303             } else {
304 442         1271 ($seqname,$start,$str) = ($name,1,$hash{$count});
305 442         8489 $str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g;
306 442         743 $end = length($str);
307             }
308            
309             # consistency test
310             $self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count}))
311 442 50       929 unless CORE::length($hash{$count}) == $residuecount;
312            
313 442         1256 $seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
314             '-display_id' => $seqname,
315             '-start' => $start,
316             '-end' => $end,
317             '-alphabet' => $alphabet
318             );
319 442         1167 $aln->add_seq($seq);
320             }
321              
322             # if matchchar is used
323 22 100       73 $aln->unmatch($match) if $match;
324              
325             # if equate ( e.g. equate="T=C G=A") is used
326 22 50       66 if ($equate) {
327 0         0 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
328             }
329              
330 22   100     165 while (defined $entry &&
331             $entry !~ /endblock/i) {
332 453         803 $entry = $self->_readline;
333             }
334              
335 22 50       103 return $aln if $aln->num_sequences;
336 0         0 return;
337             }
338              
339             sub _read_taxlabels {
340 0     0   0 my ($self) = @_;
341 0         0 my ($name, @names);
342 0         0 while (my $entry = $self->_readline) {
343 0 0       0 last if $entry =~ m/^\s*(END)?;/i;
344 0 0       0 if( $entry =~ m/\s*(\S+)\s+/ ) {
345 0         0 ($name) = ($1);
346 0         0 $name =~ s/\[[^\[]+\]//g;
347 0         0 $name =~ s/\W/_/g;
348 0         0 push @names, $name;
349             }
350             }
351 0         0 return @names;
352             }
353              
354             =head2 write_aln
355              
356             Title : write_aln
357             Usage : $stream->write_aln(@aln)
358             Function: Writes the $aln object into the stream in interleaved NEXUS
359             format. Everything is written into a data block.
360             SimpleAlign methods match_char, missing_char and gap_char must be set
361             if you want to see them in the output.
362             Returns : 1 for success and 0 for error
363             Args : L object
364              
365             =cut
366              
367             sub write_aln {
368 2     2 1 13 my ($self,@aln) = @_;
369 2         4 my $count = 0;
370 2         6 my $wrapped = 0;
371 2         5 my $maxname;
372 2         5 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
373 2         8 my ($match, $missing, $gap,$symbols) = ('', '', '','');
374              
375 2         7 foreach my $aln (@aln) {
376 2 50 33     20 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
377 0         0 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
378 0         0 next;
379             }
380 2 50       9 $self->throw("All sequences in the alignment must be the same length")
381             unless $aln->is_flush($self->verbose);
382              
383 2         9 $length = $aln->length();
384              
385 2         8 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
386             $aln->id, $aln->num_sequences, $length));
387 2 50       9 $match = "match=". $aln->match_char if $aln->match_char;
388 2 100       7 $missing = "missing=". $aln->missing_char if $aln->missing_char;
389 2 50       7 $gap = "gap=". $aln->gap_char if $aln->gap_char;
390              
391 2 50 33     7 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"'
392             if( $self->flag('symbols') && $aln->symbol_chars);
393 2         10 $self->_print
394             (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
395             $aln->get_seq_by_pos(1)->alphabet, $match,
396             $missing, $gap, $symbols));
397              
398             # account for single quotes round names
399 2         9 my $indent = $aln->maxdisplayname_length+2;
400              
401 2         8 $aln->set_displayname_flat();
402 2         6 foreach $seq ( $aln->each_seq() ) {
403 10         18 my $nmid = $aln->displayname($seq->get_nse());
404 10 100       52 if( $nmid =~ /[^\w\d\.]/ ) {
405             # put name in single quotes incase it contains any of
406             # the following chars: ()[]{}/\,;:=*'"`+-<> that are not
407             # allowed in PAUP* and possible other software
408              
409 1         4 $name = sprintf("%-${indent}s", "\'" . $nmid . "\'");
410             } else {
411 9         28 $name = sprintf("%-${indent}s", $nmid);
412             }
413 10         21 $hash{$name} = $seq->seq;
414 10         20 push(@arr,$name);
415             }
416              
417 2         7 while( $count < $length ) {
418             # there is another block to go!
419 10         14 foreach $name ( @arr ) {
420 58         59 my $dispname = $name;
421             # $dispname = '' if $wrapped;
422 58         140 $self->_print (sprintf("%${indent}s ",$dispname));
423 58         50 $tempcount = $count;
424 58         51 $index = 0;
425 58   100     131 while( ($tempcount + 10 < $length) && ($index < 5) ) {
426 268         791 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
427 268         288 $tempcount += 10;
428 268         550 $index++;
429             }
430             # last
431 58 100       84 if( $index < 5) {
432             # space to print!
433 10         51 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
434 10         17 $tempcount += 10;
435             }
436 58         76 $self->_print ("\n");
437             }
438 10         17 $self->_print ("\n\n");
439 10         10 $count = $tempcount;
440 10         18 $wrapped = 1;
441             }
442 2 50       6 if( $self->flag('endblock') ) {
443 2         7 $self->_print (";\n\nendblock;\n");
444             } else {
445 0         0 $self->_print (";\n\nend;\n");
446             }
447             }
448 2 50 33     9 $self->flush if $self->_flush_on_write && defined $self->_fh;
449 2         14 return 1;
450             }
451              
452             =head2 flag
453              
454             Title : flag
455             Usage : $obj->flag($name,$value)
456             Function: Get/Set a flag value
457             Returns : value of flag (a scalar)
458             Args : on set, new value (a scalar or undef, optional)
459              
460              
461             =cut
462              
463             sub flag{
464 54     54 1 104 my ($self,$name,$val) = @_;
465 54 100       214 return $self->{'flag'}->{$name} = $val if defined $val;
466 4         24 return $self->{'flag'}->{$name};
467             }
468              
469             1;