File Coverage

blib/lib/Bio/Phylo/Parsers/Newick.pm

Criterion	Covered	Total	%
statement	181	188	96.2
branch	73	80	91.2
condition	107	128	83.5
subroutine	13	14	92.8
pod			n/a
total	374	410	91.2

line	stmt	bran	cond	sub	time	code
1						package Bio::Phylo::Parsers::Newick;
2	24			24	975	use warnings;
	24				54
	24				796
3	24			24	123	use strict;
	24				44
	24				549
4	24			24	108	use base 'Bio::Phylo::Parsers::Abstract';
	24				45
	24				6489
5	24			24	167	no warnings 'recursion';
	24				47
	24				45451
6
7						=head1 NAME
8
9						Bio::Phylo::Parsers::Newick - Parser used by Bio::Phylo::IO, no serviceable parts inside
10
11						=head1 DESCRIPTION
12
13						This module parses tree descriptions in parenthetical format. It is called by the
14						L<Bio::Phylo::IO> facade, don't call it directly. Several additional flags can be
15						passed to the Bio::Phylo::IO parse and parse_tree functions to influence how to deal
16						with complex newick strings:
17
18						-keep => [ ...list of taxa names... ]
19
20						The C<-keep> flag allows you to only retain certain taxa of interest, ignoring others
21						while building the tree object.
22
23						-ignore_comments => 1,
24
25						This will treat square brackets as if they are normal taxon name characters rather
26						than comment delimiters, so that names such as C<Choristoneura diversana\|BC ZSM Lep 23401[05/*> are parsed
27						"successfully". (Note: square brackets should NOT be used in this way as it will break
28						many parsers).
29
30						-keep_whitespace => 1,
31
32						This will treat unescaped whitespace as if it is a normal taxon name character. Normally,
33						whitespace is only retained inside quoted strings (e.g. C<'Homo sapiens'>), otherwise it
34						is the convention to use underscores (C<Homo_sapiens>). This is because some programs
35						introduce whitespace to prettify a newick string, e.g. to indicate indentation/depth,
36						in which case you almost certainly want to ignore it. This is the default behaviour. The
37						option to keep it is provided for dealing with incorrectly formatted data.
38
39						=cut
40
41	83			83	235	sub _return_is_scalar { 1 } # always returns 1; presumably tells the abstract parser base class that _parse yields a single object - TODO confirm
42
43
44						sub _simplify {
45						# Simplify a Newick tree string by removing unneeded nodes. The leaves to
46						# keep are given as $ids, an arrayref of terminal node IDs. Note that only
47						# cherries are simplified to keep the function fast. Ternary or higher order
48						# branches are left alone. Quoted strings should be handled properly.
						#
						# Args    : $string - the newick string to simplify
						#           $ids    - arrayref of terminal node IDs to retain
						# Returns : the simplified newick string (all whitespace removed)
						# Note    : called as a plain function, not as a method
49	56			56	135	my ($string, $ids) = @_;
						# lookup table of the IDs to keep; only key existence is tested later
50	56				107	my %id_hash = map { $_ => undef } @$ids;
	106				280
51
52						# Setup some regular expressions:
53						# 1/ ID is anything but these characters (except when quoted): , ; : ( ) " '
54	56				204	my $id_re_simple = qr/[^)(,:"';]+/;
55	56				124	my $id_re_squote = qr/[^']+/;
						# NOTE(review): this pattern is identical to $id_re_squote (it excludes '
						# rather than "), although it is used between double quotes in $id_re
						# below - looks like it was meant to be [^"]+ ; confirm
56	56				119	my $id_re_dquote = qr/[^']+/;
						# NOTE(review): the "\|" sequences in this listing are presumably an
						# escaping artefact of the report for a plain "|" alternation - verify
						# against Newick.pm
57	56				316	my $id_re = qr/ (?: $id_re_simple \| '$id_re_squote' \| "$id_re_dquote" ) /x;
58						# 2/ Distance is a real number (regexp taken from Regexp::Common $RE{num}{real})
59	56				125	my $dist_re = qr/(?:(?i)(?:[+-]?)(?:(?=[.]?[0123456789])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))\|))/;
60						# 3/ A pair of ID and distance (both optional)
61	56				293	my $pair_re = qr/ ($id_re)? (?: \: ($dist_re) )? /x;
62						# 4/ Cherry
						# The outer group is capture 1 (the whole cherry); the three $pair_re
						# copies supply captures 2-7 (id/distance of leaf 1, leaf 2 and parent).
						# NOTE(review): the bare "$" characters are presumably extraction
						# artefacts of escaped literal parentheses - verify against Newick.pm
63	56				396	my $cherry_re = qr/ ( $ $pair_re , $pair_re $ $pair_re ) /x;
64						# 5/ Whitespaces
65	56				128	my $ws_re = qr/ \s+ /msx;
66
67						# Remove spaces and newlines (no spaces allowed in node names)
68	56				238	$string =~ s/$ws_re//g;
69
70						# Prune cherries
						# Repeat the global substitution until a pass leaves the string
						# unchanged, so that cherries exposed by an earlier pass are pruned too.
71	56				99	my $prev_string = '';
72	56				137	while (not $string eq $prev_string) {
73	134				206	$prev_string = $string;
74	134				1512	$string =~ s/ $cherry_re / _prune_cherry($1, $2, $3, $4, $5, $6, $7, \%id_hash) /gex;
	108				283
75						}
76	56				185	__PACKAGE__->_logger->debug("simplified string by removing unneeded nodes");
77	56				431	return $string;
78						}
79
80
81						sub _prune_cherry {
						# Compute the replacement text for one cherry matched in _simplify.
						#
						# Args    : $match         - the full text of the matched cherry
						#           $id1, $dist1   - ID and distance of the first leaf (may be undef)
						#           $id2, $dist2   - ID and distance of the second leaf (may be undef)
						#           $idp, $distp   - ID and distance of the parent (may be undef);
						#                            $idp is accepted but never read in this sub
						#           $id_hash       - hashref whose keys are the IDs to keep
						# Returns : $match unchanged when both leaves are to be kept, otherwise
						#           a single "id:distance" fragment replacing the whole cherry
82	108			108	481	my ($match, $id1, $dist1, $id2, $dist2, $idp, $distp, $id_hash) = @_;
83	108				184	my $repl;
84	108		100		319	my $id1_exists = defined $id1 && exists $id_hash->{$id1};
85	108		100		281	my $id2_exists = defined $id2 && exists $id_hash->{$id2};
86	108	100	100		259	if ( $id1_exists && $id2_exists ) {
87						# Keep both leaves
88	27				44	$repl = $match;
89						} else {
90						# There are from zero to one leaves to keep. Delete one of them.
						# Leaf 1 survives if it is in the keep set; in every other case leaf 2
						# is used, including when neither leaf is in the keep set.
91	81	100			156	my ($id, $dist) = $id1_exists ? ($id1, $dist1) : ($id2, $dist2);
						# Fold the parent's distance into the surviving leaf's distance;
						# a missing value on either side counts as 0.
92	81	100	100		179	if ( defined($dist) \|\| defined($distp) ) {
93	64		100		357	$dist = ':'.(($dist\|\|0) + ($distp\|\|0));
			100
94						}
						# Default undefined parts to the empty string before concatenating
95	81	100	50		254	$id \|\|= '' if not defined $id;
96	81	100	50		153	$dist \|\|= '' if not defined $dist;
97	81				136	$repl = $id.$dist;
98						}
99	108				505	return $repl;
100						}
101
102
103						sub _parse {
						# Read the whole input handle, split it into newick tree descriptions
						# and turn each one into a tree that is inserted into a new forest.
						#
						# Reads the arguments -keep, -ignore_comments, -keep_whitespace,
						# -ignore_quotes and -label from $self->_args.
						# Returns : the forest created by $self->_factory->create_forest
104	84			84	155	my $self = shift;
105	84				303	my $fh = $self->_handle;
106	84				266	my $forest = $self->_factory->create_forest;
107
						# Concatenate all input lines into one string, dropping line endings
108	84				174	my $string;
109	84				589	while (<$fh>) {
110	106				233	chomp;
111	106				407	$string .= $_;
112						}
113
114	84				428	my $ids = $self->_args->{'-keep'};
115	84				241	my $ignore = $self->_args->{'-ignore_comments'};
116	84				193	my $whitespace = $self->_args->{'-keep_whitespace'};
117	84				315	my $quotes = $self->_args->{'-ignore_quotes'};
118
119						# remove comments, split on tree descriptions
						# $counter is only used to number the debug messages
120	84				150	my $counter = 1;
121
122	84				290	for my $newick ( $self->_split($string,$ignore,$whitespace,$quotes) ) {
123	104				371	$self->_logger->debug("going to process newick string " . $counter++);
124						# simplify tree
						# NOTE(review): _simplify is given $string (the complete, unsplit
						# input) rather than the current $newick, so the result of _split is
						# discarded for this tree - looks unintended; confirm
125	104	100			819	if ($ids) {
126	1				9	$newick = _simplify($string, $ids);
127						}
128
129						# parse trees
130	104				311	my $tree = $self->_parse_string($newick);
131
132						# adding labels to untagged nodes
						# nodes for which get_name is false are named n1, n2, ... in the
						# order in which $tree->visit calls the callback
133	104	50			472	if ( $self->_args->{'-label'} ) {
134	0				0	my $i = 1;
135						$tree->visit(
136						sub {
137	0			0	0	my $n = shift;
138	0	0			0	$n->set_name( 'n' . $i++ ) unless $n->get_name;
139						}
140	0				0	);
141						}
142	104				547	$forest->insert($tree);
143						}
144	84				326	return $forest;
145						}
146
147						=begin comment
148
149						Type : Parser
150						Title : _split($string)
151						Usage : my @strings = $newick->_split($string);
152						Function: Creates an array of (decommented) tree descriptions
153						Returns : A list of (decommented) newick tree description strings.
154						Args : $string = concatenated tree descriptions
155
156						=end comment
157
158						=cut
159
160						sub _split {
						# Walk over $string one character at a time, dropping bracketed
						# comments and unquoted spaces, and cut the result into one string per
						# tree description at every ';' that is outside quotes and comments.
						#
						# Args    : $string     - concatenated tree descriptions
						#           $ignore     - true: '[' and ']' are kept as ordinary characters
						#           $whitespace - true: unquoted spaces are kept
						#           $quotes     - true: single quotes do not toggle quoted state
						# Returns : list of tree description strings, each ending in ';'.
						#           Text after the last ';' is never pushed onto the result.
161	84			84	258	my ( $self, $string, $ignore, $whitespace, $quotes ) = @_;
162	84				275	my $log = $self->_logger;
						# state flags: inside a single-quoted name / inside a [...] comment
163	84				199	my ( $QUOTED, $COMMENTED ) = ( 0, 0 );
164	84				147	my $decommented = '';
165	84				275	my @trees;
						# the range is inclusive, so the final index is one past the last
						# character of $string
166	84				397	TOKEN: for my $i ( 0 .. length($string) ) {
167	76604				96542	my $token = substr( $string, $i, 1 );
168
169						# detect apostrophe as ' between two letters
						# $prev / $next fall back to 0 (a non-letter) at the string boundaries
170	76604	100			109455	my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
171	76604	100			110226	my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
172	76604		33		128623	my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
173	76604	50			102534	$log->debug("detected apostrophe") if $apostr;
174
						# opening quote
175	76604	100	100		532717	if ( !$QUOTED && !$COMMENTED && $token eq "'" && ! $quotes && ! $apostr ) {
		100	100
		100	66
		100	66
			100
			100
			100
			100
			100
			66
			66
			100
			66
			66
			33
176	3				4	$QUOTED++;
177						}
						# opening comment bracket: the bracket itself is skipped
178						elsif ( !$QUOTED && !$COMMENTED && $token eq "[" && ! $ignore ) {
179	2				4	$COMMENTED++;
						# the message says "quote level" but the value logged is $COMMENTED
180	2				12	$log->debug("quote level changed to $COMMENTED");
181	2				5	next TOKEN;
182						}
						# closing comment bracket: the bracket itself is skipped
183						elsif ( !$QUOTED && $COMMENTED && $token eq "]" && ! $ignore ) {
184	2				5	$COMMENTED--;
185	2				4	next TOKEN;
186						}
						# closing quote, unless this quote is immediately followed by another
						# one ('')
187						elsif ($QUOTED
188						&& !$COMMENTED
189						&& $token eq "'"
190						&& substr( $string, $i, 2 ) ne "''" && ! $quotes && ! $apostr )
191						{
192	3				5	$QUOTED--;
193						}
						# only the literal space character is dropped here; other whitespace
						# characters are not tested
194	76600	100	100		169671	if ( !$QUOTED && $token eq ' ' && ! $whitespace ) {
			66
195	18				30	next TOKEN;
196						}
197	76582	100			108165	$decommented .= $token unless $COMMENTED;
						# a ';' outside quotes and comments ends the current tree description
198	76582	100	100		234884	if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq ';' ) {
			100
199	104				345	push @trees, $decommented;
200	104				249	$decommented = '';
201						}
202
203						}
204	84				590	$log->debug("removed comments, split on tree descriptions");
205	84				388	$log->debug("found ".scalar(@trees)." tree descriptions");
206	84				302	return @trees;
207						}
208
209						=begin comment
210
211						Type : Parser
212						Title : _parse_string($string)
213						Usage : my $tree = $newick->_parse_string($string);
214						Function: Creates a populated Bio::Phylo::Forest::Tree object from a newick
215						string.
216						Returns : A Bio::Phylo::Forest::Tree object.
217						Args : $string = a newick tree description
218
219						=end comment
220
221						=cut
222
223						sub _parse_string {
						# Tokenize one newick tree description and build a tree from it.
						#
						# Args    : $string - a single newick tree description
						# Returns : the tree created by $self->_factory->create_tree, holding a
						#           root node plus whatever _parse_clade inserts
224	104			104	237	my ( $self, $string ) = @_;
225	104				596	my $fac = $self->_factory;
226	104				254	$self->_logger->debug("going to parse tree string '$string'");
227	104				765	my $tree = $fac->create_tree;
228	104				225	my $remainder = $string;
229	104				202	my $token;
230						my @tokens;
						# Pull tokens off the front of $remainder until _next_token no longer
						# yields both a defined token and a defined remainder
231	104				383	while ( ( $token, $remainder ) = $self->_next_token($remainder) ) {
232	16003	100	66		45601	last if ( !defined $token \|\| !defined $remainder );
233	15899				30008	$self->_logger->debug("fetched token '$token'");
234
235	15899				36378	push @tokens, $token;
236						}
						# Scan backwards for the last ';' token; $i ends at -1 if there is none,
						# in which case the slices below are empty
237	104				202	my $i;
238	104				391	for ( $i = $#tokens ; $i >= 0 ; $i-- ) {
239	104	50			340	last if $tokens[$i] eq ';';
240						}
241	104				990	my $root = $fac->create_node;
242	104				662	$tree->insert($root);
						# everything before the ';' describes the root: its own name/length
						# first, then its descendants
243	104				1726	$self->_parse_node_data( $root, @tokens[ 0 .. ( $i - 1 ) ] );
244	104				1083	$self->_parse_clade( $tree, $root, @tokens[ 0 .. ( $i - 1 ) ] );
245	104				3007	return $tree;
246						}
247
248						sub _parse_clade {
						# Recursively create the child nodes of $root from a token list.
						#
						# Args    : $tree   - tree into which every new node is inserted
						#           $root   - node that becomes parent of the children found here
						#           @tokens - tokens describing $root's clade
						# The tokens between the outermost parentheses are cut at every comma
						# of depth 1; each piece yields one child node, which is then processed
						# by _parse_node_data and by a recursive call to this method.
249	3882			3882	23621	my ( $self, $tree, $root, @tokens ) = @_;
250	3882				9157	my $fac = $self->_factory;
251	3882				7520	$self->_logger->debug("recursively parsing clade '@tokens'");
						# $depth stays undefined until the first '(' is seen;
						# @remainder is assigned below but not read again in this sub
252	3882				6679	my ( @clade, $depth, @remainder );
253	3882				8710	TOKEN: for my $i ( 0 .. $#tokens ) {
254	193342	100	100		410479	if ( $tokens[$i] eq '(' ) {
		100
		100
						# the outermost '(' only starts the depth count and is not collected
255	21185	100			27283	if ( not defined $depth ) {
256	1826				2493	$depth = 1;
257	1826				2986	next TOKEN;
258						}
259						else {
260	19359				21178	$depth++;
261						}
262						}
						# a comma at depth 1 ends one child: build it from the collected tokens
263						elsif ( $tokens[$i] eq ',' && $depth == 1 ) {
264	1952				10103	my $node = $fac->create_node;
265	1952				6081	$root->set_child($node);
266	1952				5822	$tree->insert($node);
267	1952				6128	$self->_parse_node_data( $node, @clade );
268	1952				7757	$self->_parse_clade( $tree, $node, @clade );
269	1952				8135	@clade = ();
270	1952				3364	next TOKEN;
271						}
272						elsif ( $tokens[$i] eq ')' ) {
273	21185				23040	$depth--;
						# the ')' matching the outermost '(' ends the last child and the loop
274	21185	100			28886	if ( $depth == 0 ) {
275	1826				5472	@remainder = @tokens[ ( $i + 1 ) .. $#tokens ];
276	1826				9853	my $node = $fac->create_node;
277	1826				5484	$root->set_child($node);
278	1826				5097	$tree->insert($node);
279	1826				5775	$self->_parse_node_data( $node, @clade );
280	1826				6475	$self->_parse_clade( $tree, $node, @clade );
281	1826				20164	last TOKEN;
282						}
283						}
						# any other token (and nested parentheses) belongs to the current child
284	187738				259783	push @clade, $tokens[$i];
285						}
286						}
287
288						sub _parse_node_data {
						# Set name and/or branch length on $node from the tail of its clade.
						#
						# Args    : $node  - node to annotate
						#           @clade - tokens describing the node's clade
						# The "tail" is everything after the last ')' token, or the complete
						# token list when there is no ')' at all.
289	3667			3667	33365	my ( $self, $node, @clade ) = @_;
290	3667				9070	$self->_logger->debug("parsing name and branch length for node");
291	3667				5311	my @tail;
						# scan from the end so that the last ')' is found first
292	3667				8120	PARSE_TAIL: for ( my $i = $#clade ; $i >= 0 ; $i-- ) {
293	11265	100			24413	if ( $clade[$i] eq ')' ) {
		100
294	1719				4978	@tail = @clade[ ( $i + 1 ) .. $#clade ];
295	1719				3324	last PARSE_TAIL;
296						}
297						elsif ( $i == 0 ) {
298	1948				5381	@tail = @clade;
299						}
300						}
301
						# strip a trailing [...] annotation from the last tail element; this
						# is skipped when the tail consists of a single element
302	3667	50	100		14502	if ( defined($tail[-1]) and $tail[-1] =~ /(\[.+\])$/ and scalar @tail != 1 ) {
			66
303	0				0	my $anno = $1;
304	0				0	$self->_logger->info("discarding branch comment $anno");
305	0				0	$tail[-1] =~ s/\Q$anno\E//;
306						}
307
308						# name only
309	3667	100			10498	if ( scalar @tail == 1 ) {
		100
		100
310	317				917	$node->set_name( $tail[0] );
311						}
						# branch length only - presumably the tail is ( ':', length ); confirm
312						elsif ( scalar @tail == 2 ) {
313	245				638	$node->set_branch_length( $tail[-1] );
314						}
						# name and branch length - presumably ( name, ':', length ); confirm
315						elsif ( scalar @tail == 3 ) {
316	2913				9109	$node->set_name( $tail[0] );
317	2913				7703	$node->set_branch_length( $tail[-1] );
318						}
319						}
320
321						sub _next_token {
						# Split the next token off the front of $string.
						#
						# Args    : $string - the not yet tokenized rest of a tree description
						# Returns : a two-element list ( $token, $remainder ). A delimiter
						#           character (one of ( ) : , ; ) is a token by itself; any other
						#           token is the text in front of the next delimiter, and that
						#           delimiter then stays at the start of $remainder.
						#           There is no explicit return when the loop finishes without
						#           finding a delimiter; the caller in _parse_string stops as soon
						#           as the token or the remainder is undefined.
322	16003			16003	23665	my ( $self, $string ) = @_;
323	16003				25450	$self->_logger->debug("tokenizing string '$string'");
324	16003				32469	my $ignore = $self->_args->{'-ignore_comments'};
325	16003				19540	my $QUOTED = 0;
326	16003				17946	my $COMMENTED = 0;
327	16003				17996	my $token = '';
328	16003				33280	my $TOKEN_DELIMITER = qr/[():,;]/;
329	16003				31536	TOKEN: for my $i ( 0 .. length($string) ) {
						# grow the candidate token by one character per iteration
330	83347				118072	$token .= substr( $string, $i, 1 );
331	83347				144801	$self->_logger->debug("growing token: '$token'");
332
333						# detect apostrophe as ' between two letters
334	83347	100			160749	my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
335	83347	100			138454	my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
336	83347		33		156908	my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
337	83347	50			119085	$self->_logger->debug("detected apostrophe") if $apostr;
338
339						# if -ignore_comments was specified the string can still contain comments
340						# that can contain token delimiters, so we still need to track
341						# whether we are inside a comment
342	83347	100	100		151728	if ( $ignore && $token =~ /\[$/ ) {
343	267				330	$COMMENTED++;
344						}
345	83347	100	100		142115	if ( $ignore && $token =~ /\]$/ ) {
346	267				335	$COMMENTED--;
347	267				489	next TOKEN;
348						}
						# NOTE(review): the pattern is not anchored, so it is satisfied by a
						# delimiter anywhere in $token, including one collected earlier while
						# inside quotes or a comment - confirm this is intended
349	83080	100	100		333320	if ( !$QUOTED && !$COMMENTED && $token =~ $TOKEN_DELIMITER ) {
			100
350	15899				21506	my $length = length($token);
						# the token is just the delimiter itself
351	15899	100			22706	if ( $length == 1 ) {
352	9081				17459	$self->_logger->debug("single char token: '$token'");
353	9081				40306	return $token, substr( $string, ( $i + 1 ) );
354						}
						# otherwise return the text without its last character and push that
						# last character back onto the front of the remainder
355						else {
356	6818				12941	$self->_logger->debug(
357						sprintf( "range token: %s",
358						substr( $token, 0, $length - 1 ) )
359						);
360	6818				42065	return substr( $token, 0, $length - 1 ),
361						substr( $token, $length - 1, 1 )
362						. substr( $string, ( $i + 1 ) );
363						}
364						}
						# opening quote (an apostrophe between two letters does not count)
365	67181	100	100		277182	if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq "'" && ! $apostr ) {
		100	100
			66
			66
			100
			66
			66
366	3				6	$QUOTED++;
367						}
						# closing quote, unless it is immediately followed by another quote ('')
368						elsif ($QUOTED && !$COMMENTED
369						&& substr( $string, $i, 1 ) eq "'"
370						&& substr( $string, $i, 2 ) ne "''" && ! $apostr)
371						{
372	3				7	$QUOTED--;
373						}
374						}
375						}
376
377						# podinherit_insert_token
378
379						=head1 SEE ALSO
380
381						There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo>
382						for any user or developer questions and discussions.
383
384						=over
385
386						=item L<Bio::Phylo::IO>
387
388						The newick parser is called by the L<Bio::Phylo::IO> object.
389						Look there to learn how to parse newick strings.
390
391						=item L<Bio::Phylo::Manual>
392
393						Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>.
394
395						=back
396
397						=head1 CITATION
398
399						If you use Bio::Phylo in published research, please cite it:
400
401						B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
402						and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
403						I<BMC Bioinformatics> B<12>:63.
404						L<http://dx.doi.org/10.1186/1471-2105-12-63>
405
406						=cut
407
408						1;