File Coverage

blib/lib/Bio/Phylo/Parsers/Newick.pm
Criterion Covered Total %
statement 181 188 96.2
branch 73 80 91.2
condition 107 128 83.5
subroutine 13 14 92.8
pod n/a
total 374 410 91.2


line stmt bran cond sub pod time code
1             package Bio::Phylo::Parsers::Newick;
2 24     24   453 use warnings;
  24         45  
  24         695  
3 24     24   108 use strict;
  24         42  
  24         459  
4 24     24   100 use base 'Bio::Phylo::Parsers::Abstract';
  24         42  
  24         6200  
5 24     24   155 no warnings 'recursion';
  24         43  
  24         42759  
6              
7             =head1 NAME
8              
9             Bio::Phylo::Parsers::Newick - Parser used by Bio::Phylo::IO, no serviceable parts inside
10              
11             =head1 DESCRIPTION
12              
13             This module parses tree descriptions in parenthetical format. It is called by the
14             L<Bio::Phylo::IO> facade, don't call it directly. Several additional flags can be
15             passed to the Bio::Phylo::IO parse and parse_tree functions to influence how to deal
16             with complex newick strings:
17              
18             -keep => [ ...list of taxa names... ]
19              
20             The C<-keep> flag allows you to only retain certain taxa of interest, ignoring others
21             while building the tree object.
22              
23             -ignore_comments => 1,
24              
25             This will treat comments in square brackets as if they are a normal taxon name character,
26             this so that names such as C<Choristoneura diversana|BC ZSM Lep 23401[05/*> are parsed
27             "successfully". (Note: square brackets should NOT be used in this way as it will break
28             many parsers).
29              
30             -keep_whitespace => 1,
31              
32             This will treat unescaped whitespace as if it is a normal taxon name character. Normally,
33             whitespace is only retained inside quoted strings (e.g. C<'Homo sapiens'>), otherwise it
34             is the convention to use underscores (C<Homo_sapiens>). This is because some programs
35             introduce whitespace to prettify a newick string, e.g. to indicate indentation/depth,
36             in which case you almost certainly want to ignore it. This is the default behaviour. The
37             option to keep it is provided for dealing with incorrectly formatted data.
38              
39             =cut
40              
41 83     83   293 sub _return_is_scalar { 1 }
42              
43              
44             sub _simplify {
45             # Simplify a Newick tree string by removing unneeded nodes. The leaves to
46             # keep are given as $ids, an arrayref of terminal node IDs. Note that only
47             # cherries are simplified to keep the function fast. Ternary or higher order
48             # branches are left alone. Quoted strings should be handled properly.
49 56     56   132 my ($string, $ids) = @_;
50 56         103 my %id_hash = map { $_ => undef } @$ids;
  106         239  
51              
52             # Setup some regular expressions:
53             # 1/ ID is anything but these characters (except when quoted): , ; : ( ) " '
54 56         167 my $id_re_simple = qr/[^)(,:"';]+/;
55 56         103 my $id_re_squote = qr/[^']+/;
56 56         97 my $id_re_dquote = qr/[^']+/;
57 56         288 my $id_re = qr/ (?: $id_re_simple | '$id_re_squote' | "$id_re_dquote" ) /x;
58             # 2/ Distance is a real number (regexp taken from Regexp::Common $RE{num}{real})
59 56         101 my $dist_re = qr/(?:(?i)(?:[+-]?)(?:(?=[.]?[0123456789])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))/;
60             # 3/ A pair of ID and distance (both optional)
61 56         241 my $pair_re = qr/ ($id_re)? (?: \: ($dist_re) )? /x;
62             # 4/ Cherry
63 56         334 my $cherry_re = qr/ ( \( $pair_re , $pair_re \) $pair_re ) /x;
64             # 5/ Whitespaces
65 56         102 my $ws_re = qr/ \s+ /msx;
66              
67             # Remove spaces and newlines (no spaces allowed in node names)
68 56         193 $string =~ s/$ws_re//g;
69              
70             # Prune cherries
71 56         85 my $prev_string = '';
72 56         121 while (not $string eq $prev_string) {
73 134         205 $prev_string = $string;
74 134         1307 $string =~ s/ $cherry_re / _prune_cherry($1, $2, $3, $4, $5, $6, $7, \%id_hash) /gex;
  108         256  
75             }
76 56         159 __PACKAGE__->_logger->debug("simplified string by removing unneeded nodes");
77 56         373 return $string;
78             }
79              
80              
81             sub _prune_cherry {
82 108     108   441 my ($match, $id1, $dist1, $id2, $dist2, $idp, $distp, $id_hash) = @_;
83 108         153 my $repl;
84 108   100     286 my $id1_exists = defined $id1 && exists $id_hash->{$id1};
85 108   100     225 my $id2_exists = defined $id2 && exists $id_hash->{$id2};
86 108 100 100     245 if ( $id1_exists && $id2_exists ) {
87             # Keep both leaves
88 27         44 $repl = $match;
89             } else {
90             # There are from zero to one leaves to keep. Delete one of them.
91 81 100       139 my ($id, $dist) = $id1_exists ? ($id1, $dist1) : ($id2, $dist2);
92 81 100 100     179 if ( defined($dist) || defined($distp) ) {
93 64   100     304 $dist = ':'.(($dist||0) + ($distp||0));
      100        
94             }
95 81 100 50     200 $id ||= '' if not defined $id;
96 81 100 50     154 $dist ||= '' if not defined $dist;
97 81         129 $repl = $id.$dist;
98             }
99 108         442 return $repl;
100             }
101              
102              
103             sub _parse {
104 84     84   177 my $self = shift;
105 84         376 my $fh = $self->_handle;
106 84         296 my $forest = $self->_factory->create_forest;
107              
108 84         176 my $string;
109 84         595 while (<$fh>) {
110 106         233 chomp;
111 106         408 $string .= $_;
112             }
113              
114 84         411 my $ids = $self->_args->{'-keep'};
115 84         220 my $ignore = $self->_args->{'-ignore_comments'};
116 84         205 my $whitespace = $self->_args->{'-keep_whitespace'};
117 84         189 my $quotes = $self->_args->{'-ignore_quotes'};
118              
119             # remove comments, split on tree descriptions
120 84         157 my $counter = 1;
121              
122 84         290 for my $newick ( $self->_split($string,$ignore,$whitespace,$quotes) ) {
123 104         401 $self->_logger->debug("going to process newick string " . $counter++);
124             # simplify tree
125 104 100       305 if ($ids) {
126 1         3 $newick = _simplify($string, $ids);
127             }
128            
129             # parse trees
130 104         353 my $tree = $self->_parse_string($newick);
131              
132             # adding labels to untagged nodes
133 104 50       412 if ( $self->_args->{'-label'} ) {
134 0         0 my $i = 1;
135             $tree->visit(
136             sub {
137 0     0   0 my $n = shift;
138 0 0       0 $n->set_name( 'n' . $i++ ) unless $n->get_name;
139             }
140 0         0 );
141             }
142 104         547 $forest->insert($tree);
143             }
144 84         337 return $forest;
145             }
146              
147             =begin comment
148              
149             Type : Parser
150             Title : _split($string)
151             Usage : my @strings = $newick->_split($string);
152             Function: Creates an array of (decommented) tree descriptions
153             Returns : A Bio::Phylo::Forest::Tree object.
154             Args : $string = concatenated tree descriptions
155              
156             =end comment
157              
158             =cut
159              
160             sub _split {
161 84     84   252 my ( $self, $string, $ignore, $whitespace, $quotes ) = @_;
162 84         281 my $log = $self->_logger;
163 84         211 my ( $QUOTED, $COMMENTED ) = ( 0, 0 );
164 84         144 my $decommented = '';
165 84         145 my @trees;
166 84         320 TOKEN: for my $i ( 0 .. length($string) ) {
167 85317         108684 my $token = substr( $string, $i, 1 );
168              
169             # detect apostrophe as ' between two letters
170 85317 100       121081 my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
171 85317 100       123455 my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
172 85317   33     143991 my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
173 85317 50       114419 $log->debug("detected apostrophe") if $apostr;
174              
175 85317 100 100     612818 if ( !$QUOTED && !$COMMENTED && $token eq "'" && ! $quotes && ! $apostr ) {
    100 100        
    100 66        
    100 66        
      100        
      100        
      100        
      100        
      100        
      66        
      66        
      100        
      66        
      66        
      33        
176 3         5 $QUOTED++;
177             }
178             elsif ( !$QUOTED && !$COMMENTED && $token eq "[" && ! $ignore ) {
179 2         3 $COMMENTED++;
180 2         10 $log->debug("quote level changed to $COMMENTED");
181 2         6 next TOKEN;
182             }
183             elsif ( !$QUOTED && $COMMENTED && $token eq "]" && ! $ignore ) {
184 2         4 $COMMENTED--;
185 2         6 next TOKEN;
186             }
187             elsif ($QUOTED
188             && !$COMMENTED
189             && $token eq "'"
190             && substr( $string, $i, 2 ) ne "''" && ! $quotes && ! $apostr )
191             {
192 3         5 $QUOTED--;
193             }
194 85313 100 100     195225 if ( !$QUOTED && $token eq ' ' && ! $whitespace ) {
      66        
195 18         25 next TOKEN;
196             }
197 85295 100       121982 $decommented .= $token unless $COMMENTED;
198 85295 100 100     270849 if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq ';' ) {
      100        
199 104         373 push @trees, $decommented;
200 104         234 $decommented = '';
201             }
202              
203             }
204 84         568 $log->debug("removed comments, split on tree descriptions");
205 84         411 $log->debug("found ".scalar(@trees)." tree descriptions");
206 84         307 return @trees;
207             }
208              
209             =begin comment
210              
211             Type : Parser
212             Title : _parse_string($string)
213             Usage : my $tree = $newick->_parse_string($string);
214             Function: Creates a populated Bio::Phylo::Forest::Tree object from a newick
215             string.
216             Returns : A Bio::Phylo::Forest::Tree object.
217             Args : $string = a newick tree description
218              
219             =end comment
220              
221             =cut
222              
223             sub _parse_string {
224 104     104   247 my ( $self, $string ) = @_;
225 104         321 my $fac = $self->_factory;
226 104         294 $self->_logger->debug("going to parse tree string '$string'");
227 104         801 my $tree = $fac->create_tree;
228 104         237 my $remainder = $string;
229 104         203 my $token;
230             my @tokens;
231 104         392 while ( ( $token, $remainder ) = $self->_next_token($remainder) ) {
232 18523 100 66     56037 last if ( !defined $token || !defined $remainder );
233 18419         36752 $self->_logger->debug("fetched token '$token'");
234              
235 18419         44712 push @tokens, $token;
236             }
237 104         217 my $i;
238 104         420 for ( $i = $#tokens ; $i >= 0 ; $i-- ) {
239 104 50       340 last if $tokens[$i] eq ';';
240             }
241 104         982 my $root = $fac->create_node;
242 104         692 $tree->insert($root);
243 104         1975 $self->_parse_node_data( $root, @tokens[ 0 .. ( $i - 1 ) ] );
244 104         1286 $self->_parse_clade( $tree, $root, @tokens[ 0 .. ( $i - 1 ) ] );
245 104         3645 return $tree;
246             }
247              
248             sub _parse_clade {
249 4442     4442   35341 my ( $self, $tree, $root, @tokens ) = @_;
250 4442         11214 my $fac = $self->_factory;
251 4442         9732 $self->_logger->debug("recursively parsing clade '@tokens'");
252 4442         8271 my ( @clade, $depth, @remainder );
253 4442         11006 TOKEN: for my $i ( 0 .. $#tokens ) {
254 316654 100 100     716953 if ( $tokens[$i] eq '(' ) {
    100          
    100          
255 34793 100       47046 if ( not defined $depth ) {
256 2106         3199 $depth = 1;
257 2106         3996 next TOKEN;
258             }
259             else {
260 32687         35416 $depth++;
261             }
262             }
263             elsif ( $tokens[$i] eq ',' && $depth == 1 ) {
264 2232         13320 my $node = $fac->create_node;
265 2232         8163 $root->set_child($node);
266 2232         7307 $tree->insert($node);
267 2232         8438 $self->_parse_node_data( $node, @clade );
268 2232         10595 $self->_parse_clade( $tree, $node, @clade );
269 2232         12624 @clade = ();
270 2232         4620 next TOKEN;
271             }
272             elsif ( $tokens[$i] eq ')' ) {
273 34793         36607 $depth--;
274 34793 100       48287 if ( $depth == 0 ) {
275 2106         7211 @remainder = @tokens[ ( $i + 1 ) .. $#tokens ];
276 2106         13298 my $node = $fac->create_node;
277 2106         7505 $root->set_child($node);
278 2106         6777 $tree->insert($node);
279 2106         8448 $self->_parse_node_data( $node, @clade );
280 2106         8983 $self->_parse_clade( $tree, $node, @clade );
281 2106         34363 last TOKEN;
282             }
283             }
284 310210         440971 push @clade, $tokens[$i];
285             }
286             }
287              
288             sub _parse_node_data {
289 4227     4227   56980 my ( $self, $node, @clade ) = @_;
290 4227         12000 $self->_logger->debug("parsing name and branch length for node");
291 4227         6184 my @tail;
292 4227         11043 PARSE_TAIL: for ( my $i = $#clade ; $i >= 0 ; $i-- ) {
293 13225 100       29688 if ( $clade[$i] eq ')' ) {
    100          
294 1999         6968 @tail = @clade[ ( $i + 1 ) .. $#clade ];
295 1999         4443 last PARSE_TAIL;
296             }
297             elsif ( $i == 0 ) {
298 2228         6506 @tail = @clade;
299             }
300             }
301            
302 4227 50 100     18621 if ( defined($tail[-1]) and $tail[-1] =~ /(\[.+\])$/ and scalar @tail != 1 ) {
      66        
303 0         0 my $anno = $1;
304 0         0 $self->_logger->info("discarding branch comment $anno");
305 0         0 $tail[-1] =~ s/\Q$anno\E//;
306             }
307              
308             # name only
309 4227 100       13054 if ( scalar @tail == 1 ) {
    100          
    100          
310 317         986 $node->set_name( $tail[0] );
311             }
312             elsif ( scalar @tail == 2 ) {
313 245         674 $node->set_branch_length( $tail[-1] );
314             }
315             elsif ( scalar @tail == 3 ) {
316 3473         13133 $node->set_name( $tail[0] );
317 3473         11128 $node->set_branch_length( $tail[-1] );
318             }
319             }
320              
321             sub _next_token {
322 18523     18523   28911 my ( $self, $string ) = @_;
323 18523         31344 $self->_logger->debug("tokenizing string '$string'");
324 18523         41069 my $ignore = $self->_args->{'-ignore_comments'};
325 18523         24019 my $QUOTED = 0;
326 18523         21813 my $COMMENTED = 0;
327 18523         22104 my $token = '';
328 18523         41896 my $TOKEN_DELIMITER = qr/[():,;]/;
329 18523         38898 TOKEN: for my $i ( 0 .. length($string) ) {
330 93180         137919 $token .= substr( $string, $i, 1 );
331 93180         169601 $self->_logger->debug("growing token: '$token'");
332            
333             # detect apostrophe as ' between two letters
334 93180 100       184040 my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
335 93180 100       163328 my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
336 93180   33     177277 my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
337 93180 50       133121 $self->_logger->debug("detected apostrophe") if $apostr;
338              
339             # if -ignore_comments was specified the string can still contain comments
340             # that can contain token delimiters, so we still need to track
341             # whether we are inside a comment
342 93180 100 100     166848 if ( $ignore && $token =~ /\[$/ ) {
343 267         390 $COMMENTED++;
344             }
345 93180 100 100     164600 if ( $ignore && $token =~ /\]$/ ) {
346 267         383 $COMMENTED--;
347 267         543 next TOKEN;
348             }
349 92913 100 100     406501 if ( !$QUOTED && !$COMMENTED && $token =~ $TOKEN_DELIMITER ) {
      100        
350 18419         26563 my $length = length($token);
351 18419 100       27330 if ( $length == 1 ) {
352 10481         20687 $self->_logger->debug("single char token: '$token'");
353 10481         51884 return $token, substr( $string, ( $i + 1 ) );
354             }
355             else {
356 7938         16293 $self->_logger->debug(
357             sprintf( "range token: %s",
358             substr( $token, 0, $length - 1 ) )
359             );
360 7938         56400 return substr( $token, 0, $length - 1 ),
361             substr( $token, $length - 1, 1 )
362             . substr( $string, ( $i + 1 ) );
363             }
364             }
365 74494 100 100     320521 if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq "'" && ! $apostr ) {
    100 100        
      66        
      66        
      100        
      66        
      66        
366 3         7 $QUOTED++;
367             }
368             elsif ($QUOTED && !$COMMENTED
369             && substr( $string, $i, 1 ) eq "'"
370             && substr( $string, $i, 2 ) ne "''" && ! $apostr)
371             {
372 3         7 $QUOTED--;
373             }
374             }
375             }
376              
377             # podinherit_insert_token
378              
379             =head1 SEE ALSO
380              
381             There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo>
382             for any user or developer questions and discussions.
383              
384             =over
385              
386             =item L<Bio::Phylo::IO>
387              
388             The newick parser is called by the L<Bio::Phylo::IO> object.
389             Look there to learn how to parse newick strings.
390              
391             =item L<Bio::Phylo::Manual>
392              
393             Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>.
394              
395             =back
396              
397             =head1 CITATION
398              
399             If you use Bio::Phylo in published research, please cite it:
400              
401             B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
402             and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
403             I<BMC Bioinformatics> B<12>:63.
404             L<http://dx.doi.org/10.1186/1471-2105-12-63>
405              
406             =cut
407              
408             1;