File Coverage

blib/lib/Bio/Phylo/Parsers/Nexus.pm

Criterion	Covered	Total	%
statement	561	680	82.5
branch	195	282	69.1
condition	114	195	58.4
subroutine	46	54	85.1
pod			n/a
total	916	1211	75.6

line	stmt	bran	cond	sub	time	code
1						package Bio::Phylo::Parsers::Nexus;
2	6			6	33	use strict;
	6				11
	6				147
3	6			6	25	use warnings;
	6				11
	6				135
4	6			6	25	use base 'Bio::Phylo::Parsers::Abstract';
	6				10
	6				1512
5	6			6	35	use Bio::Phylo::Factory;
	6				12
	6				21
6	6			6	27	use Bio::Phylo::IO 'parse';
	6				13
	6				265
7	6			6	32	use Bio::Phylo::Util::CONSTANT qw':objecttypes looks_like_instance';
	6				11
	6				1093
8	6			6	37	use Bio::Phylo::Util::Exceptions 'throw';
	6				11
	6				2644
9
10						# TODO: handle mixed? distances, splits, bipartitions
11						my $TAXA = _TAXA_;
12						my $MATRIX = _MATRIX_;
13
14						# useful regular expressions
15						my $COMMENT = qr\|^\[\|; # crude, only checks first char, use after tokenizing!
16						my $QUOTES_OR_BRACKETS =
17						qr/[\[\]'"]/mox; # catch all for opening/closing square brackets and quotes
18						my $OPENING_QUOTE_OR_BRACKET =
19						qr/^(.?)([\['"].)$/mox; # capturing regex for opening sq. br. & q.
20
21						# this is a dispatch table whose sub references are invoked
22						# during parsing. the keys match the tokens upon which the
23						# respective subs are called. Underscored (private) fields are for parsing
24						# context. The fields of this table comprise the default state of the
25						# parser object.
26						my %defaults = (
27						'_lines' => undef,
28						'_current' => undef,
29						'_previous' => undef,
30						'_begin' => undef,
31						'_ntax' => undef,
32						'_nchar' => undef,
33						'_gap' => undef,
34						'_missing' => undef,
35						'_i' => undef,
36						'_tree' => undef,
37						'_trees' => undef,
38						'_treename' => undef,
39						'_treestart' => undef,
40						'_row' => undef,
41						'_matrixtype' => undef,
42						'_found' => 0,
43						'_linemode' => 0,
44						'_taxlabels' => [],
45						'_tokens' => [],
46						'_context' => [],
47						'_translate' => [],
48						'_symbols' => [],
49						'_charlabels' => [],
50						'_statelabels' => [],
51						'_charstatelabels' => [],
52						'_tmpstatelabels' => [],
53						'_comments' => [],
54						'_treenames' => [],
55						'_matrixrowlabels' => [],
56						'_matrix' => {},
57						'_charset' => {},
58						'_taxset' => {},
59						'begin' => \&_begin,
60						'taxa' => \&_taxa,
61						'title' => \&_title,
62						'dimensions' => \&_dimensions,
63						'ntax' => \&_ntax,
64						'taxlabels' => \&_taxlabels,
65						'blockid' => \&_blockid,
66						'data' => \&_data,
67						'characters' => \&_characters,
68						'codons' => \&_codons,
69						'nchar' => \&_nchar,
70						'format' => \&_format,
71						'datatype' => \&_datatype,
72						'matchchar' => \&_matchchar,
73						'gap' => \&_gap,
74						'missing' => \&_missing,
75						'charlabels' => \&_charlabels,
76						'statelabels' => \&_statelabels,
77						'charstatelabels' => \&_charstatelabels,
78						'symbols' => \&_symbols,
79						'items' => \&_items,
80						'matrix' => \&_matrix,
81						'charset' => \&_charset,
82						'taxset' => \&_taxset,
83						'trees' => \&_trees,
84						'translate' => \&_translate,
85						'tree' => \&_tree,
86						'utree' => \&_tree,
87						'end' => \&_end,
88						'endblock' => \&_end,
89						'#nexus' => \&_nexus,
90						'link' => \&_link,
91						';' => \&_semicolon,
92						'interleave' => \&_interleave,
93						);
94
95						=head1 NAME
96
97						Bio::Phylo::Parsers::Nexus - Parser used by Bio::Phylo::IO, no serviceable parts inside
98
99						=head1 DESCRIPTION
100
101						This module parses nexus files. It is called by the L<Bio::Phylo::IO> module,
102						there is no direct usage. The parser can handle files and strings with multiple
103						tree, taxon, and characters blocks whose links are defined using Mesquite's
104						"TITLE = 'some_name'" and "LINK TAXA = 'some_name'" tokens.
105
106						The parser returns a reference to an array containing one or more taxa, trees
107						and matrices objects. Nexus comments are stripped, private nexus blocks (and the
108						'assumptions' block) are skipped. It currently doesn't handle 'mixed' data.
109
110						=begin comment
111
112						Type : Constructor
113						Title : _new
114						Usage : my $nexus = Bio::Phylo::Parsers::Nexus->_new;
115						Function: Initializes a Bio::Phylo::Parsers::Nexus object.
116						Returns : A Bio::Phylo::Parsers::Nexus object.
117						Args : none.
118
119						=end comment
120
121						=cut
122
123						sub _process_defaults {
124	15			15	27	my $self = shift;
125	15				213	for my $key ( keys %defaults ) {
126	990	100			1487	if ( looks_like_instance( $defaults{$key}, 'ARRAY' ) ) {
		100
127	180				363	$self->{$key} = [];
128						}
129						elsif ( looks_like_instance( $defaults{$key}, 'HASH' ) ) {
130	45				124	$self->{$key} = {};
131						}
132						else {
133	765				1533	$self->{$key} = $defaults{$key};
134						}
135						}
136	15				59	return $self;
137						}
138
139						=begin comment
140
141						Type : Wrapper
142						Title : _from_handle(\*FH)
143						Usage : $nexus->_from_handle(\*FH);
144						Function: Does all the parser magic, from a file handle
145						Returns : ARRAY
146						Args : \*FH = file handle
147
148						=end comment
149
150						=cut
151
152						sub _parse {
153	15			15	31	my $self = shift;
154	15				53	$self->_process_defaults;
155	15				75	$self->_logger->info("going to parse nexus data");
156	15				68	$self->{'_lines'} = $self->_stringify(@_);
157	15				53	$self->{'_tokens'} = $self->_tokenize( $self->{'_lines'} );
158
159						# iterate over tokens, dispatch methods from %{ $self } table
160						# This is the meat of the parsing, from here everything else is called.
161	15				59	$self->_logger->info("tokenized and split data, going to parse blocks");
162	15				28	my $i = 0;
163	15				28	my $private_block;
164	15				40	my $token_queue = [ undef, undef, undef ];
165	6			6	40	no strict 'refs';
	6				12
	6				33597
166	15				26	TOKEN_LINE: for my $token_line ( @{ $self->{'_tokens'} } ) {
	15				51
167	313	100			586	if ( not $self->{'_linemode'} ) {
		50
168	265				345	RAW_TOKEN: for my $raw_token ( @{$token_line} ) {
	265				443
169	1060	100			3808	if ( $raw_token =~ qr/^\[/ ) {
170	10				17	push @{ $self->{'_comments'} }, $raw_token;
	10				30
171	10				28	next RAW_TOKEN;
172						}
173	1050				2014	my $lower_case_token = lc($raw_token);
174	1050				1694	push @$token_queue, $lower_case_token;
175	1050				1261	shift @$token_queue;
176	1050	100	66		3431	if ( exists $self->{$lower_case_token} and not $private_block )
		100	66
177						{
178	415	50			917	if ( ref $self->{$lower_case_token} eq 'CODE' ) {
179	415				608	$self->{'_previous'} = $self->{'_current'};
180	415				555	$self->{'_current'} = $lower_case_token;
181
182						# pull code ref from dispatch table
183	415				550	my $c = $self->{$lower_case_token};
184
185						# invoke as object method
186	415				1528	$self->$c($raw_token);
187	414				853	next RAW_TOKEN;
188						}
189						}
190						elsif ( $self->{'_current'} and not $private_block ) {
191	634				954	my $c = $self->{ $self->{'_current'} };
192	634				1116	$self->$c($raw_token);
193	634				1109	next RAW_TOKEN;
194						}
195
196						# $self->{'_begin'} is switched 'on' by &_begin(), and 'off'
197						# again by any one of the appropriate subsequent tokens, i.e.
198						# taxa, data, characters and trees
199	1	0	33		10	if ( $self->{'_begin'}
			33
200						and not exists $self->{$lower_case_token}
201						and not $private_block )
202						{
203	0				0	$private_block = $raw_token;
204	0				0	next RAW_TOKEN;
205						}
206
207						# jump over private block content
208	1	50	33		4	if ( $private_block
			33
209						and $token_queue->[-2] eq 'end'
210						and $token_queue->[-1] eq ';' )
211						{
212	0				0	$private_block = 0;
213	0				0	$self->_logger->info(
214						"Skipped private $private_block block");
215	0				0	next RAW_TOKEN;
216						}
217						else {
218	1				3	next RAW_TOKEN;
219						}
220						}
221						}
222						elsif ( $self->{'_linemode'} ) {
223	48				83	my $c = $self->{ $self->{'_current'} };
224	48				63	push @{$token_queue}, $token_line;
	48				94
225	48				66	shift @$token_queue;
226	48				104	$self->$c($token_line);
227	47				106	next TOKEN_LINE;
228						}
229						}
230	13				59	return $self->_post_process(@_);
231						}
232
233						# makes array reference of strings, one string per line, from input
234						# file handle or string;
235						sub _stringify {
236	15			15	28	my $self = shift;
237	15				46	$self->_logger->info("going to split nexus data on lines");
238	15				37	my %opts = @_;
239	15				31	my @lines;
240	15				72	my $handle = $self->_handle;
241	15				89	while (<$handle>) {
242	435				601	my $line = $_;
243	435				1448	push @lines, grep { /\S/ } split( /\n\|\r\|\r\n/, $line );
	379				1085
244	435				879	$self->_logger->debug("read line: $line");
245						}
246	15				57	return \@lines;
247						}
248
249						=begin comment
250
251						Type : Method
252						Title : _tokenize()
253						Usage : $nexus->_tokenize($lines);
254						Function: Tokenizes lines in $lines array ref
255						Returns : Two dimensional ARRAY
256						Args : An array ref of lines (e.g. read from an input file);
257						Comments: This method accepts an array ref holding lines that may contain
258						single quotes, double quotes or square brackets. Line breaks and
259						spaces inside these quoted/bracketed fragments are ignored, otherwise
260						it is split, e.g.:
261
262						[
263						[ '#NEXUS' ],
264						[ 'BEGIN TAXA; [taxablock comment]' ],
265						[ 'DIMENSIONS NTAX=3;' ],
266						[ 'TAXLABELS "Taxon \' A" \'Taxon B\' TAXON[comment]C' ],
267						...etc...
268						]
269
270						becomes:
271						[
272						[ '#NEXUS' ],
273						[
274						'BEGIN',
275						'TAXA',
276						';',
277						'[taxablock comment]'
278						],
279						[
280						'DIMENSIONS',
281						'NTAX',
282						'=',
283						'3',
284						';'
285						],
286						[
287						'TAXLABELS',
288						'"Taxon \' A"',
289						'\'Taxon B\'',
290						'TAXON',
291						'[comment]',
292						'C'
293						],
294						...etc...
295						]
296
297
298						=end comment
299
300						=cut
301
302						sub _tokenize {
303	15			15	45	my ( $self, $lines ) = @_;
304	15				37	$self->_logger->info("going to split lines on tokens");
305	15				44	my ( $extract, $INSIDE_QUOTE, $continue ) = ( '', 0, 0 );
306	15				29	my ( @tokens, @split );
307	15				85	my $CLOSING_BRACKET_MIDLINE = qr/^.?(\])(.)$/mox;
308	15				53	my $CONTEXT_QB_AT_START = qr/^([\['"])(.*)$/mox;
309	15				43	my $CONTEXT_CLOSER;
310						my $QuoteContext; # either " ' or [
311	15				0	my $QuoteStartLine;
312	15				28	my $LineCount = 0;
313	15				86	my %CLOSE_CHAR = (
314						'"' => '"',
315						"'" => "'",
316						'[' => ']',
317						);
318	15				67	my %INVERSE_CLOSE_CHAR = (
319						'"' => '"',
320						"'" => "'",
321						']' => '[',
322						')' => '(',
323						);
324
325						# tokenize
326	15				38	LINE: for my $line ( @{$lines} ) {
	15				41
327	379				456	$LineCount++;
328	379				937	TOKEN: while ( $line =~ /\S/ ) {
329
330						# line in file has no quoting/bracketing characters, and
331						# is no extension of a quoted/bracketed fragment starting
332						# on a previous line
333	408	100	100		2362	if ( $line !~ $QUOTES_OR_BRACKETS && !$INSIDE_QUOTE ) {
		100	100
		100	66
		100	66
		50	33
334	332	100			493	if ($continue) {
335	20				28	push @{ $tokens[-1] }, $line;
	20				49
336	20				33	$continue = 0;
337						}
338						else {
339	312				568	push @tokens, [$line];
340						}
341	332				411	my $logline = join( ' ', @{ $tokens[-1] } );
	332				621
342	332				452	chomp($logline);
343	332				660	$self->_logger->debug("Tokenized line $LineCount: $logline");
344	332				618	next LINE;
345						}
346
347						# line in file has opening quoting/bracketing characters, and
348						# is no extension of a quoted/bracketed fragment starting
349						# on a previous line
350						elsif ( $line =~ $OPENING_QUOTE_OR_BRACKET && !$INSIDE_QUOTE ) {
351	20				86	my ( $start, $quoted ) = ( $1, $2 );
352	20				52	push @tokens, [$start];
353	20				36	$line = $quoted;
354	20				31	$extract = $quoted;
355	20				30	$INSIDE_QUOTE++;
356	20				33	$continue = 1;
357	20				41	$QuoteContext = substr( $quoted, 0, 1 );
358	20				52	$self->_logger->debug("Line $LineCount contains $QuoteContext");
359	20				35	$QuoteStartLine = $LineCount;
360	20				251	$CONTEXT_QB_AT_START = qr/^(\Q$QuoteContext\E)(.*)$/;
361	20				54	my $context_closer = $CLOSE_CHAR{$QuoteContext};
362	20				235	$CONTEXT_CLOSER = qr/^(.?)(\Q$context_closer\E)(.)$/;
363	20				90	next TOKEN;
364						}
365
366						# line in file has no quoting/bracketing characters, and
367						# is an extension of a quoted/bracketed fragment starting
368						# on a previous line
369						elsif ( $line !~ $CONTEXT_CLOSER && $INSIDE_QUOTE ) {
370	30				66	$self->_logger->debug(
371						"Line $LineCount extends quote or comment");
372	30				41	$extract .= $line;
373	30				53	next LINE;
374						}
375						elsif ( $line =~ $CONTEXT_QB_AT_START && $INSIDE_QUOTE ) {
376	20				93	my ( $q, $remainder ) = ( $1, $1 . $2 );
377	20	100	100		90	if ( $q eq '"' \|\| $q eq "'" ) {
		50
378	9	50			144	if ( $remainder =~ m/^($q[^$q]?$q)(.)$/ ) {
		0
379	9				37	$self->_logger->debug(
380						"Line $LineCount closes $INVERSE_CLOSE_CHAR{$q} with $q"
381						);
382	9				16	push @{ $tokens[-1] }, ($1);
	9				33
383	9				30	$line = $2;
384	9				17	$INSIDE_QUOTE--;
385	9				36	next TOKEN;
386						}
387						elsif ( $remainder =~ m/^$q[^$q]*$/ ) {
388	0				0	$extract .= $line;
389	0				0	$continue = 1;
390	0				0	next LINE;
391						}
392						}
393						elsif ( $q eq '[' ) {
394	11				35	for my $i ( 1 .. length($line) ) {
395	830	100			1221	$INSIDE_QUOTE++ if substr( $line, $i, 1 ) eq '[';
396	830	100	66		2019	if ( $i and !$INSIDE_QUOTE ) {
397	8				13	push @{ $tokens[-1] }, substr( $line, 0, $i );
	8				32
398	8				23	my $logqc = substr( $line, ( $i - 1 ), 1 );
399	8				28	$self->_logger->debug(
400						"Line $LineCount closes $INVERSE_CLOSE_CHAR{$logqc} with $logqc"
401						);
402	8				20	$line = substr( $line, $i );
403	8				29	next TOKEN;
404						}
405	822	100			1326	$INSIDE_QUOTE-- if substr( $line, $i, 1 ) eq ']';
406						}
407	3				8	$extract = $line;
408	3				5	$continue = 1;
409	3				7	next LINE;
410						}
411						}
412						elsif ( $line =~ $CONTEXT_CLOSER && $INSIDE_QUOTE ) {
413	6				26	my ( $start, $q, $remainder ) = ( $1, $2, $3 );
414	6				22	$self->_logger->debug(
415						"Line $LineCount closes $INVERSE_CLOSE_CHAR{$q} with $q");
416	6	50			19	$start = $extract . $start if $continue;
417	6	50	33		30	if ( $q eq '"' or $q eq "'" ) {
		50
418	0				0	push @{ $tokens[-1] }, $start;
	0				0
419	0				0	$line = $remainder;
420	0				0	next TOKEN;
421						}
422						elsif ( $q eq ']' ) {
423	6				16	for my $i ( 0 .. length($line) ) {
424	414	100			602	$INSIDE_QUOTE++ if substr( $line, $i, 1 ) eq '[';
425	414	100	100		856	if ( $i and !$INSIDE_QUOTE ) {
426	3				5	my $segment = substr( $line, 0, $i );
427	3	50			8	if ($continue) {
428	3				4	push @{ $tokens[-1] }, $extract . $segment;
	3				16
429						}
430						else {
431	0				0	push @{ $tokens[-1] }, $segment;
	0				0
432						}
433	3				7	$line = substr( $line, $i );
434	3				9	next TOKEN;
435						}
436	411	100			643	$INSIDE_QUOTE-- if substr( $line, $i, 1 ) eq ']';
437						}
438	3	50			11	if ($continue) {
439	3				7	$extract .= $line;
440						}
441						else {
442	0				0	$extract = $line;
443						}
444	3				6	$continue = 1;
445	3				6	next LINE;
446						}
447						}
448						}
449						}
450
451						# an exception here means that an opening quote symbol " ' [
452						# ($QuoteContext) was encountered at input file/string line $QuoteStartLine.
453						# This can happen if any of these symbols is used in an illegal
454						# way, e.g. by using double quotes as gap symbols in matrices.
455	15	50			45	if ($INSIDE_QUOTE) {
456	0				0	throw 'BadArgs' =>
457						"Unbalanced $QuoteContext starting at line $QuoteStartLine";
458						}
459
460						# final split: non-quoted/bracketed fragments are split on whitespace,
461						# others are preserved verbatim
462						$self->_logger->info(
463	15				59	"going to split non-quoted/commented fragments on whitespace");
464	15				41	foreach my $line (@tokens) {
465	332				388	my @line;
466	332				511	foreach my $word (@$line) {
467	372	100			1025	if ( $word !~ $QUOTES_OR_BRACKETS ) {
468	352				1603	$word =~ s/(=\|;\|,)/ $1 /g;
469	352				1371	push @line, grep { /\S/ } split /\s+/, $word;
	1470				3280
470						}
471						else {
472	20				54	push @line, $word;
473						}
474						}
475	332				657	push @split, \@line;
476						}
477	15				163	return \@split;
478						}
479
480						# link matrices and forests to taxa
481						sub _post_process {
482	13			13	30	my $self = shift;
483	13				29	my $taxa = [];
484	13				34	foreach my $block ( @{ $self->{'_context'} } ) {
	13				40
485	26	100	33		85	if ( $block->_type == $TAXA ) {
		50
486	13				23	push @{$taxa}, $block;
	13				26
487						}
488						elsif ( $block->_type != $TAXA and $block->can('set_taxa') ) {
489	13	50	33		88	if ( $taxa->[-1]
			33
490						and $taxa->[-1]->can('_type') == $TAXA
491						and not $block->get_taxa )
492						{
493	0				0	$block->set_taxa( $taxa->[-1] ); # XXX exception here?
494						}
495						}
496						}
497	13				33	my $blocks = $self->{'_context'};
498
499						# initialize object, note we have to
500						# force data type references to be empty
501	13				21	@{$taxa} = ();
	13				29
502	13				228	for my $key ( keys %defaults ) {
503	858	100			1322	if ( looks_like_instance( $defaults{$key}, 'ARRAY' ) ) {
		100
504	156				402	$self->{$key} = [];
505						}
506						elsif ( looks_like_instance( $defaults{$key}, 'HASH' ) ) {
507	39				86	$self->{$key} = {};
508						}
509						else {
510	663				1083	$self->{$key} = $defaults{$key};
511						}
512						}
513	13				54	return @{$blocks};
	13				71
514						}
515
516						=begin comment
517
518						The following subs are called by the dispatch table stored in the object when
519						their respective tokens are encountered.
520
521						=end comment
522
523						=cut
524
525						sub _nexus {
526	14			14	35	my $self = shift;
527	14	50			58	if ( uc( $_[0] ) eq '#NEXUS' ) {
528	14				45	$self->_logger->info("found nexus token");
529						}
530						}
531
532						sub _begin {
533	43			43	66	my $self = shift;
534	43				75	$self->{'_begin'} = 1;
535						}
536
537						sub _taxa {
538	18			18	30	my $self = shift;
539	18	100			49	if ( $self->{'_begin'} ) {
540	13				61	my $taxa = $self->_factory->create_taxa;
541	13				29	push @{ $self->{'_context'} }, $taxa;
	13				41
542	13				42	$self->_logger->info("starting taxa block");
543	13				36	$self->{'_begin'} = 0;
544						}
545						else {
546	5				9	$self->{'_current'} = 'link'; # because of 'link taxa = blah' construct
547						}
548						}
549
550						sub _charset {
551	17			17	19	my $self = shift;
552	17				20	my $token = shift;
553
554						# first thing after the CHARSET token is the set name
555	17	100	100		119	if ( $token !~ /CHARSET/i && ! $self->{'_charset'}->{'name'} ) {
		100	100
		100	100
		100
556	2				6	$self->{'_charset'}->{'name'} = $token;
557	2				8	$self->{'_charset'}->{'range'} = [];
558						}
559
560						# then there might be a mesquite-style matrix reference, e.g. (CHARACTERS = matrix_name)
561						elsif ( $token =~ m/^\(/ ) {
562	1				3	$self->{'_charset'}->{'matrix'} = '';
563						}
564						elsif ( defined $self->{'_charset'}->{'matrix'} && ! $self->{'_charset'}->{'matrix'} && $token !~ /(?:\(?CHARACTERS\|=)/i ) {
565	1				4	$token =~ s/\)$//;
566	1				3	$self->{'_charset'}->{'matrix'} = $token;
567						}
568
569						# then come the indices
570						elsif ( $token =~ /(?:\d+\|-)/ ) {
571	8				9	push @{ $self->{'_charset'}->{'range'} }, $token;
	8				16
572						}
573						}
574
575						sub _taxset {
576	19			19	21	my $self = shift;
577	19				24	my $token = shift;
578
579						# first thing after the TAXSET token is the set name
580	19	100	100		116	if ( $token !~ /TAXSET/i && ! $self->{'_taxset'}->{'name'} ) {
		100	100
		100	100
		100
581	2				5	$self->{'_taxset'}->{'name'} = $token;
582	2				6	$self->{'_taxset'}->{'range'} = [];
583						}
584
585						# then there might be a mesquite-style taxa reference, e.g. (TAXA = matrix_name)
586						elsif ( $token =~ m/^\(/ ) {
587	1				3	$self->{'_taxset'}->{'taxa'} = '';
588						}
589						elsif ( defined $self->{'_taxset'}->{'taxa'} && ! $self->{'_taxset'}->{'taxa'} && $token !~ /(?:\(?TAXA\|=)/ ) {
590	1				5	$token =~ s/\)$//;
591	1				4	$self->{'_taxset'}->{'taxa'} = $token;
592						}
593
594						# then come the indices
595						elsif ( $token =~ /(?:\d+\|-)/ ) {
596	10				13	push @{ $self->{'_taxset'}->{'range'} }, $token;
	10				20
597						}
598						}
599
600						sub _interleave {
601	0			0	0	my $self = shift;
602	0				0	my $token = shift;
603	0				0	$self->_logger->info("perhaps we'll need to parse interleaved");
604	0	0	0		0	if ( defined $token and uc($token) eq 'NO' ) {
605	0				0	$self->_logger->info("no, we don't need to parse interleaved");
606						}
607						}
608
609						sub _title {
610	25			25	37	my $self = shift;
611	25				34	my $token = shift;
612	25	100	66		96	if ( defined $token and uc($token) ne 'TITLE' ) {
613	11				19	my $title = $token;
614	11	50			31	if ( not $self->_current->get_name ) {
615	11				23	$self->_current->set_name($title);
616	11				30	$self->_logger->info("block has title '$title'");
617						}
618						}
619						}
620
621						sub _link {
622	2			2	4	my $self = shift;
623	2				2	my $token = shift;
624	2	50	33		17	if ( defined $token and $token !~ m/^(?:LINK\|TAXA\|=)$/i ) {
625	0				0	my $link = $token;
626	0	0			0	if ( not $self->_current->get_taxa ) {
627	0				0	foreach my $block ( @{ $self->{'_context'} } ) {
	0				0
628	0	0	0		0	if ( $block->get_name and $block->get_name eq $link ) {
629	0				0	$self->_current->set_taxa($block);
630	0				0	last;
631						}
632						}
633						$self->_logger->info(
634	0				0	"block links to taxa block with title '$link'");
635						}
636						}
637						}
638
639				22		sub _dimensions {
640
641						#my $self = shift;
642						}
643
644						sub _ntax {
645	39			39	54	my $self = shift;
646	39	100	66		195	if ( defined $_[0] and $_[0] =~ m/^\d+$/ ) {
647	13				33	$self->{'_ntax'} = shift;
648	13				29	my $ntax = $self->{'_ntax'};
649	13				43	$self->_logger->info("number of taxa: $ntax");
650						}
651						}
652
653						sub _taxlabels {
654	72			72	103	my $self = shift;
655	72	100	66		291	if ( defined $_[0] and uc( $_[0] ) ne 'TAXLABELS' ) {
		50	33
656	59				84	my $taxon = shift;
657	59				127	$self->_logger->debug("taxon: $taxon");
658	59				83	push @{ $self->{'_taxlabels'} }, $taxon;
	59				131
659						}
660						elsif ( defined $_[0] and uc( $_[0] ) eq 'TAXLABELS' ) {
661						$self->_current->set_generic(
662	13				53	'nexus_comments' => $self->{'_comments'} );
663	13				38	$self->{'_comments'} = [];
664	13				46	$self->_logger->info("starting taxlabels");
665						}
666						}
667
668						sub _blockid {
669	0			0	0	my $self = shift;
670	0	0	0		0	if ( defined $_[0] and uc( $_[0] ) ne 'BLOCKID' ) {
671	0				0	my $blockid = shift;
672	0				0	$self->_logger->debug("blockid: $blockid");
673	0				0	$self->_current->set_generic( 'blockid' => $blockid );
674						}
675						}
676
677						sub _data {
678	0			0	0	my $self = shift;
679	0	0			0	if ( $self->{'_begin'} ) {
680	0				0	$self->{'_begin'} = 0;
681	0				0	push @{ $self->{'_context'} }, $self->_factory->create_matrix;
	0				0
682	0				0	$self->_logger->info("starting data block");
683						}
684						}
685
686						sub _characters {
687	9			9	18	my $self = shift;
688	9	50			33	if ( $self->{'_begin'} ) {
689	9				18	$self->{'_begin'} = 0;
690	9				16	push @{ $self->{'_context'} }, $self->_factory->create_matrix;
	9				37
691	9				39	$self->_logger->info("starting characters block");
692						}
693						}
694
695						sub _nchar {
696	27			27	42	my $self = shift;
697	27	100	66		137	if ( defined $_[0] and $_[0] =~ m/^\d+$/ ) {
698	9				22	$self->{'_nchar'} = shift;
699	9				20	my $nchar = $self->{'_nchar'};
700	9				80	$self->_logger->info("number of characters: $nchar");
701						}
702						}
703
704				9		sub _format {
705
706						#my $self = shift;
707						}
708
709						sub _datatype {
710	27			27	39	my $self = shift;
711	27	100	66		148	if ( defined $_[0] and $_[0] !~ m/^(?:DATATYPE\|=)/i ) {
712	9				25	my $datatype = shift;
713	9				32	$self->_current->set_type($datatype);
714	9				28	$self->_logger->info("datatype: $datatype");
715						}
716						}
717
718						sub _matchchar {
719	0			0	0	my $self = shift;
720	0	0	0		0	if ( defined $_[0] and $_[0] !~ m/^(?:MATCHCHAR\|=)/i ) {
721	0				0	my $matchchar = shift;
722	0				0	$self->_current->set_matchchar($matchchar);
723	0				0	$self->_logger->info("matchchar: $matchchar");
724						}
725						}
726
727				0		sub _items {
728
729						#my $self = shift;
730						}
731
732						sub _gap {
733	27			27	44	my $self = shift;
734	27	100	66		122	if ( $_[0] !~ m/^(?:GAP\|=)/i and !$self->{'_gap'} ) {
735	9				20	$self->{'_gap'} = shift;
736	9				20	my $gap = $self->{'_gap'};
737	9				26	$self->_current->set_gap($gap);
738	9				28	$self->_logger->info("gap character: $gap");
739	9				19	undef $self->{'_gap'};
740						}
741						}
742
743						sub _missing {
744	27			27	39	my $self = shift;
745	27	100	66		118	if ( $_[0] !~ m/^(?:MISSING\|=)/i and !$self->{'_missing'} ) {
746	9				19	$self->{'_missing'} = shift;
747	9				20	my $missing = $self->{'_missing'};
748	9				26	$self->_current->set_missing($missing);
749	9				29	$self->_logger->info("missing character: $missing");
750	9				24	undef $self->{'_missing'};
751						}
752						}
753
754						sub _symbols {
755	15			15	19	my $self = shift;
756	15	100	66		79	if ( $_[0] !~ m/^(?:SYMBOLS\|=)$/i and $_[0] =~ m/^"?(.+)"?$/ ) {
757	5				15	my $sym = $1;
758	5				21	$sym =~ s/"//g;
759	5				40	my @syms = grep { /\S+/ } split /\s+/, $sym;
	17				47
760	5				13	push @{ $self->{'_symbols'} }, @syms;
	5				16
761	5				19	$self->_logger->debug("recorded character state symbols '@syms'");
762						}
763						}
764
765						sub _charlabels {
766	0			0	0	my $self = shift;
767	0	0	0		0	if ( defined $_[0] and uc $_[0] ne 'CHARLABELS' ) {
768	0				0	push @{ $self->{'_charlabels'} }, shift;
	0				0
769						}
770						}
771
772						sub _charstatelabels {
773	18			18	21	my $self = shift;
774	18				23	my $token = shift;
775	18				36	$self->_logger->debug($token);
776	18	100	66		57	if ( defined $token and uc $token ne 'CHARSTATELABELS' ) {
777	16				21	push @{ $self->{'_charstatelabels'} }, $token;
	16				30
778						}
779						}
780
781						sub _statelabels {
782	0			0	0	my $self = shift;
783	0				0	my $token = shift;
784	0	0	0		0	if ( defined $token and uc $token ne 'STATELABELS' ) {
785	0	0			0	if ( $token eq ',' ) {
786	0				0	my $tmpstatelabels = $self->{'_tmpstatelabels'};
787	0				0	my $index = shift @{$tmpstatelabels};
	0				0
788	0				0	$self->{'_statelabels'}->[ $index - 1 ] = $tmpstatelabels;
789	0				0	$self->{'_tmpstatelabels'} = [];
790						}
791						else {
792	0				0	push @{ $self->{'_tmpstatelabels'} }, $token;
	0				0
793						}
794						}
795						}
796
797						# for data type, character labels, state labels
798						sub _add_matrix_metadata {
799	57			57	77	my $self = shift;
800	57				111	$self->_logger->info("adding matrix metadata");
801	57	100			116	if ( not defined $self->{'_matrixtype'} ) {
802	9				24	$self->{'_matrixtype'} = $self->_current->get_type;
803	9	50			19	if ( @{ $self->{'_charlabels'} } ) {
	9				34
804	0				0	$self->_current->set_charlabels( $self->{'_charlabels'} );
805	0				0	$self->_logger->debug("adding character labels");
806						}
807	9	50			21	if ( @{ $self->{'_statelabels'} } ) {
	9				29
808	0				0	$self->_current->set_statelabels( $self->{'_statelabels'} );
809	0				0	$self->_logger->debug("adding state labels");
810						}
811	9	50			17	if ( my @symbols = @{ $self->{'_symbols'} } ) {
	9				34
812	0				0	$self->_logger->debug("updating state lookup table");
813	0				0	my $to = $self->_current->get_type_object;
814	0				0	my $lookup = $to->get_lookup;
815	0	0			0	if ($lookup) {
816	0				0	for my $sym (@symbols) {
817	0	0			0	if ( not exists $lookup->{$sym} ) {
818	0				0	$lookup->{$sym} = [$sym];
819						}
820						}
821						}
822						}
823						}
824	57				74	return $self;
825						}
826
827						sub _add_tokens_to_row {
828	48			48	77	my ( $self, $tokens ) = @_;
829	48				63	my $rowname;
830	48				58	for my $token ( @{$tokens} ) {
	48				92
831	90				190	$self->_logger->debug("token: $token");
832	90	100			176	last if $token eq ';';
833
834						# mesquite sometimes writes multiline (but not interleaved)
835						# matrix rows (harrumph).
836	81	100	66		474	if ( not defined $rowname and $token !~ $COMMENT ) {
		50	33
837	39				65	my $taxa;
838	39	50			69	if ( $taxa = $self->_current->get_taxa ) {
		50
839	0	0			0	if ( my $taxon = $taxa->get_by_name($token) ) {
840	0				0	$rowname = $token;
841						}
842						else {
843	0				0	$rowname = $self->{'_matrixrowlabels'}->[-1];
844						}
845						}
846						elsif ( $taxa = $self->_find_last_seen_taxa_block ) {
847	39	50			118	if ( my $taxon = $taxa->get_by_name($token) ) {
848	39				64	$rowname = $token;
849						}
850						else {
851	0				0	$rowname = $self->{'_matrixrowlabels'}->[-1];
852						}
853						}
854						else {
855	0				0	$rowname = $token;
856						}
857	39	50			91	if ( not exists $self->{'_matrix'}->{$rowname} ) {
858	39				95	$self->{'_matrix'}->{$rowname} = [];
859	39				53	push @{ $self->{'_matrixrowlabels'} }, $rowname;
	39				92
860						}
861						}
862						elsif ( defined $rowname and $token !~ $COMMENT ) {
863	42				74	my $row = $self->{'_matrix'}->{$rowname};
864	42	100			239	if ( $self->{'_matrixtype'} =~ m/^continuous$/i ) {
865	6				9	push @{$row}, split( /\s+/, $token );
	6				22
866						}
867						else {
868	36				46	push @{$row}, split( //, $token );
	36				194
869						}
870						}
871						}
872						}
873
874						sub _find_last_seen_taxa_block {
875	85			85	126	my $self = shift;
876	85				121	my $name = shift;
877	85				94	for ( my $i = $#{ $self->{'_context'} } ; $i >= 0 ; $i-- ) {
	85				251
878	180	100			468	if ( $self->{'_context'}->[$i]->_type == $TAXA ) {
879	84	100			154	if ( $name ) {
880	2	100			5	if ( $self->{'_context'}->[$i]->get_name eq $name ) {
881	1				3	return $self->{'_context'}->[$i];
882						}
883						}
884						else {
885	82				211	return $self->{'_context'}->[$i];
886						}
887						}
888						}
889	2				7	return;
890						}
891
892						sub _find_last_seen_matrix {
893	4			4	7	my $self = shift;
894	4				8	my $name = shift;
895	4				6	for ( my $i = $#{ $self->{'_context'} } ; $i >= 0 ; $i-- ) {
	4				18
896	5	50			19	if ( $self->{'_context'}->[$i]->_type == $MATRIX ) {
897	5	100			14	if ( $name ) {
898	2	100			6	if ( $self->{'_context'}->[$i]->get_name eq $name ) {
899	1				3	return $self->{'_context'}->[$i];
900						}
901						}
902						else {
903	3				9	return $self->{'_context'}->[$i];
904						}
905						}
906						}
907	0				0	return;
908						}
909
910						sub _set_taxon {
911	74			74	134	my ( $self, $obj, $taxa ) = @_;
912
913						# first case: a taxon by $obj's name already exists
914	74	50			150	if ( my $taxon = $taxa->get_by_name( $obj->get_name ) ) {
915	74				189	$obj->set_taxon($taxon);
916	74				145	return $self;
917						}
918
919						# second case: no taxon by $obj's name exists yet
920						else {
921	0				0	my $taxon = $self->_factory->create_taxon( '-name' => $obj->get_name );
922	0				0	$taxa->insert($taxon);
923	0				0	$obj->set_taxon($taxon);
924	0				0	return $self;
925						}
926						}
927
928						sub _resolve_taxon {
929	132			132	199	my ( $self, $obj ) = @_;
930	132				209	my $container = $self->_current;
931
932						# first case: the object is actually already
933						# linked to a taxon
934	132	100			284	if ( my $taxon = $obj->get_taxon ) {
935	88				163	return $self;
936						}
937
938						# second case: the container is already linked
939						# to a taxa block, but the object isn't
940	44	100			141	if ( my $taxa = $container->get_taxa ) {
941	30				67	$self->_set_taxon( $obj, $taxa );
942						}
943
944						# third case: the container isn't explicitly linked,
945						# but a taxa block has been seen
946	44	100			112	if ( my $taxa = $self->_find_last_seen_taxa_block ) {
947	42				159	$container->set_taxa($taxa);
948	42				104	$self->_set_taxon( $obj, $taxa );
949						}
950
951						# final case: no taxa block exists
952						else {
953	2				11	my $taxa = $container->make_taxa;
954	2				5	pop @{ $self->{'_context'} };
	2				6
955	2				4	push @{ $self->{'_context'} }, $taxa, $container;
	2				5
956	2				11	$self->_set_taxon( $obj, $taxa );
957						}
958						}
959
960						sub _resolve_ambig {
961	39			39	83	my ( $self, $datum, $chars ) = @_;
962	39				112	my %brackets = (
963						'(' => ')',
964						'{' => '}',
965						);
966	39				92	my $to = $datum->get_type_object;
967	39				60	my @resolved;
968	39				55	my $in_set = 0;
969	39				56	my @set;
970						my $close;
971	39				52	for my $c ( @{$chars} ) {
	39				77
972
973	228	50	33		547	if ( not $in_set and not exists $brackets{$c} ) {
		0	0
		0	0
		0	0
974	228	50			498	push @resolved, $c if defined $c;
975						}
976						elsif ( not $in_set and exists $brackets{$c} ) {
977	0				0	$in_set++;
978	0				0	$close = $brackets{$c};
979						}
980						elsif ( $in_set and $c ne $close ) {
981	0				0	push @set, $c;
982						}
983						elsif ( $in_set and $c eq $close ) {
984	0				0	push @resolved, $to->get_symbol_for_states(@set);
985	0				0	@set = ();
986	0				0	$in_set = 0;
987	0				0	$close = undef;
988						}
989						}
990	39				104	return \@resolved;
991						}
992
993						sub _codons {
994	0			0	0	my $self = shift;
995	0				0	$self->_logger->info(shift);
996	0	0			0	if ( $self->{'_begin'} ) {
997	0				0	$self->{'_begin'} = 0;
998						}
999						}
1000
1001						sub _matrix {
1002	57			57	86	my $self = shift;
1003	57				78	my $token = shift;
1004	57				132	$self->_add_matrix_metadata;
1005
1006						# first token: 'MATRIX', i.e. we're just starting to parse
1007						# the actual matrix. Here we need to switch to "linemode",
1008						# so that subsequently tokens will be array references (all
1009						# the tokens on a line). This is so that we can handle
1010						# interleaved matrices, which unfortunately need line breaks
1011						# in them.
1012	57	100	66		125	if ( not looks_like_instance( $token, 'ARRAY' ) and uc($token) eq 'MATRIX' )
		100	66
		50	33
1013						{
1014	9				23	$self->{'_linemode'} = 1;
1015	9				25	$self->_logger->info("starting matrix");
1016	9				16	return;
1017						}
1018
1019						# a row inside the matrix, after adding tokens to row, nothing
1020						# else to do
1021						elsif ( looks_like_instance( $token, 'ARRAY' )
1022	90				303	and not grep { /^;$/ } @{$token} )
	48				93
1023						{
1024	39				107	$self->_add_tokens_to_row($token);
1025	39				105	$self->_logger->info("adding tokens to row");
1026	39				149	return;
1027						}
1028
1029						# the last row of the matrix, after adding tokens to row,
1030						# instantiate & populate datum objects, link against taxa
1031						# objects
1032						elsif ( looks_like_instance( $token, 'ARRAY' )
1033	9				56	and grep { /^;$/ } @{$token} )
	9				179
1034						{
1035	9				32	$self->_add_tokens_to_row($token);
1036
1037						# link to taxa
1038	9				17	for my $row ( @{ $self->{'_matrixrowlabels'} } ) {
	9				115
1039
1040						# create new datum
1041	39				115	my $datum = $self->_factory->create_datum(
1042						'-type_object' => $self->_current->get_type_object,
1043						'-name' => $row,
1044						);
1045						my $char =
1046	39				148	$self->_resolve_ambig( $datum, $self->{'_matrix'}->{$row} );
1047	39				150	$datum->set_char($char);
1048
1049						# insert new datum in matrix
1050	39				93	$self->_current->insert($datum);
1051
1052						# link to taxon
1053	39				138	$self->_resolve_taxon($datum);
1054	39				119	my ( $length, $seq ) = ( $datum->get_length, $datum->get_char );
1055	39				118	$self->_logger->info("parsed $length characters for ${row}: $seq");
1056						}
1057
1058						# Let's avoid these!
1059	9	100	33		29	if ( $self->_current->get_nchar != $self->{'_nchar'} ) {
		50
1060						my ( $obs, $exp ) =
1061	1				5	( $self->_current->get_nchar, $self->{'_nchar'} );
1062	1				6	_bad_format("Observed and expected nchar mismatch: $obs vs. $exp");
1063						}
1064
1065						# ntax is only defined for "data" blocks (which have ntax token),
1066						# not for "characters" blocks (which should match up with taxa block)
1067						elsif ( defined $self->{'_ntax'}
1068						and $self->_current->get_ntax != $self->{'_ntax'} )
1069						{
1070	0				0	my ( $obs, $exp ) = ( $self->_current->get_ntax, $self->{'_ntax'} );
1071	0				0	_bad_format("Observed and expected ntax mismatch: $obs vs. $exp");
1072						}
1073
1074						# XXX matrix clean up here
1075	8				25	$self->{'_ntax'} = undef;
1076	8				19	$self->{'_nchar'} = undef;
1077	8				20	$self->{'_matrixtype'} = undef;
1078	8				48	$self->{'_matrix'} = {};
1079	8				21	$self->{'_matrixrowlabels'} = [];
1080	8				28	$self->{'_linemode'} = 0;
1081						}
1082						}
1083
1084						sub _bad_format {
1085	2			2	13	throw 'BadFormat' => shift;
1086						}
1087	408			408	1337	sub _current { shift->{'_context'}->[-1] }
1088
1089						sub _trees {
1090	5			5	10	my $self = shift;
1091	5	50			16	if ( $self->{'_begin'} ) {
1092	5				10	$self->{'_begin'} = 0;
1093	5				9	$self->{'_trees'} = '';
1094	5				16	$self->{'_treenames'} = [];
1095	5				7	push @{ $self->{'_context'} }, $self->_factory->create_forest;
	5				25
1096	5				26	$self->_logger->info("starting trees block");
1097						}
1098						}
1099
1100						sub _translate {
1101	171			171	203	my $self = shift;
1102	171				221	my $i = $self->{'_i'};
1103	171	100	100		354	if ( $i && $i == 1 )
1104						{ # actually, $i can be 0 according to BayesPhylogenies translation table
1105	5				19	$self->_logger->info("starting translation table");
1106						}
1107	171	100	100		787	if ( !defined($i) && $_[0] =~ m/^\d+$/ ) {
		100	66
			33
			33
1108	57				97	$self->{'_i'} = shift;
1109	57				130	$self->{'_translate'}->[ $self->{'_i'} ] = undef;
1110						}
1111						elsif (defined($i)
1112						&& exists $self->{'_translate'}->[$i]
1113						&& !defined $self->{'_translate'}->[$i]
1114						&& $_[0] ne ';' )
1115						{
1116	57				84	$self->{'_translate'}->[$i] = $_[0];
1117	57				128	$self->_logger->debug("Translation: $i => $_[0]");
1118	57				95	$self->{'_i'} = undef;
1119						}
1120						}
1121
1122						sub _tree {
1123	212			212	245	my $self = shift;
1124	212	100	100		402	if ( not $self->{'_treename'} and $_[0] !~ m/^(U?TREE|\*)$/i ) {
1125	13				23	$self->{'_treename'} = $_[0];
1126						}
1127	212	100	66		375	if ( $_[0] eq '=' and not $self->{'_treestart'} ) {
1128	13				21	$self->{'_treestart'} = 1;
1129						}
1130	212	100	100		503	if ( $_[0] ne '=' and $self->{'_treestart'} ) {
1131	173				236	$self->{'_tree'} .= $_[0];
1132						}
1133
1134						# tr/// returns # of replacements, hence can be used to check
1135						# tree description is balanced
1136	212	100	100		828	if ( $self->{'_treestart'}
			100
1137						and $self->{'_tree'}
1138						and $self->{'_tree'} =~ tr/(/(/ == $self->{'_tree'} =~ tr/)/)/ )
1139						{
1140	13				22	my $translated = $self->{'_tree'};
1141	13				42	my $translate = $self->{'_translate'};
1142	13	100			28	my $start =
1143						exists $translate->[0]
1144						? 0
1145						: 1; # BayesPhylogenies starts translation table w. 0
1146	13				19	for my $i ( $start .. $#{$translate} ) {
	13				34
1147	93				2088	$translated =~ s/(\(|,)$i(,|\)|:)/$1$translate->[$i]$2/;
1148						}
1149						my ( $logtreename, $logtree ) =
1150	13				45	( $self->{'_treename'}, $self->{'_tree'} );
1151	13				43	$self->_logger->info("tree: $logtreename string: $logtree");
1152	13				42	$self->{'_trees'} .= $translated . ';';
1153	13				18	push @{ $self->{'_treenames'} }, $self->{'_treename'};
	13				32
1154
1155						# XXX tree cleanup here
1156	13				21	$self->{'_treestart'} = 0;
1157	13				21	$self->{'_tree'} = undef;
1158	13				23	$self->{'_treename'} = undef;
1159						}
1160						}
1161
1162						sub _end {
1163	33			33	57	my $self = shift;
1164	33				86	$self->{'_translate'} = [];
1165	33	100	100		176	if ( uc $self->{'_previous'} eq ';' and $self->{'_trees'} ) {
1166	5				20	my $forest = $self->_current;
1167						my $trees = parse(
1168						'-format' => 'newick',
1169	5				39	'-string' => $self->{'_trees'},
1170						'-as_project' => 0
1171						);
1172	5				37	for my $tree ( @{ $trees->get_entities } ) {
	5				24
1173	13				34	$forest->insert($tree);
1174						}
1175
1176						# set tree names
1177	5				14	for my $i ( 0 .. $#{ $self->{'_treenames'} } ) {
	5				22
1178	13				43	$forest->get_by_index($i)->set_name( $self->{'_treenames'}->[$i] );
1179						}
1180
1181						# link tips to taxa
1182	5				12	for my $tree ( @{ $forest->get_entities } ) {
	5				18
1183	13				22	for my $tip ( @{ $tree->get_terminals } ) {
	13				49
1184	93				169	$self->_resolve_taxon($tip);
1185						}
1186						}
1187
1188						# XXX trees cleanup here
1189	5				18	$self->{'_trees'} = '';
1190	5				47	$self->{'_treenames'} = [];
1191						}
1192						}
1193
1194						sub _semicolon {
1195	189			189	264	my $self = shift;
1196	189	50			1187	if ( uc $self->{'_previous'} eq 'MATRIX' ) {
		100
		100
		100
		100
		100
		50
		50
1197	0				0	$self->{'_matrixtype'} = undef;
1198	0				0	$self->{'_matrix'} = {};
1199	0				0	$self->{'_charlabels'} = [];
1200	0				0	$self->{'_statelabels'} = [];
1201	0				0	$self->{'_linemode'} = 0;
1202	0	0			0	if ( not $self->_current->get_ntax ) {
1203	0				0	my $taxon = {};
1204	0				0	foreach my $row ( @{ $self->_current->get_entities } ) {
	0				0
1205	0				0	$taxon->{ $row->get_taxon }++;
1206						}
1207	0				0	my $ntax = scalar keys %{$taxon};
	0				0
1208						}
1209						}
1210
1211						# finalize character set
1212						elsif ( uc $self->{'_previous'} eq 'CHARSET' ) {
1213	2				8	my $matrix = $self->_find_last_seen_matrix( $self->{'_charset'}->{'matrix'} );
1214	2				7	my $characters = $matrix->get_characters;
1215	2				6	my $set = $self->_factory->create_set( '-name' => $self->{'_charset'}->{'name'} );
1216	2				12	$characters->add_set($set);
1217	2				4	my $range = $self->{'_charset'}->{'range'};
1218	2				4	my @range;
1219	2	50			7	if ( ref($range) eq 'ARRAY' ) {
1220	2				4	while ( @{ $range } ) {
	6				12
1221	4				6	my $index = shift @{ $range };
	4				7
1222	4	100	66		14	if ( $range->[0] && $range->[0] eq '-' ) {
1223	2				4	shift @{ $range };
	2				24
1224	2				5	my $end = shift @{ $range };
	2				4
1225	2				10	push @range, ( $index - 1 ) .. ( $end - 1 );
1226						}
1227						else {
1228	2				5	push @range, ( $index - 1 );
1229						}
1230						}
1231	2				4	for my $i ( @range ) {
1232	6				18	my $character = $characters->get_by_index($i);
1233	6	50			10	if ( $character ) {
1234	6				17	$characters->add_to_set($character,$set);
1235						}
1236						else {
1237	0				0	throw 'API' => "No character at index $i";
1238						}
1239						}
1240						}
1241	2				8	$self->{'_charset'} = {};
1242						}
1243
1244						# finalize character state labels
1245						elsif ( uc $self->{'_previous'} eq 'CHARSTATELABELS' ) {
1246	2				10	my $matrix = $self->_find_last_seen_matrix;
1247	2				3	my @labels = @{ $self->{'_charstatelabels'} };
	2				7
1248	2	100			16	if ( $matrix->get_type =~ m/continuous/i ) {
1249	1				3	my @charlabels;
1250	1				2	my $charnum = 1;
1251	1				4	while (@labels) {
1252
1253						# expecting an index at the beginning of the statement
1254	2				3	my $index = shift @labels;
1255	2	50			6	$index != $charnum && _bad_format( "Expecting character number $charnum, observed $index in CHARSTATELABELS" );
1256
1257						# then the character label
1258	2				3	push @charlabels, shift @labels;
1259
1260						# then a comma
1261	2	100			5	if ( @labels ) {
1262	1	50			5	$labels[0] eq ',' ? shift @labels : _bad_format( "Expecting , observed $labels[0] in CHARSTATELABELS" );
1263						}
1264	2				5	$charnum++;
1265						}
1266	1				6	$matrix->set_charlabels(\@charlabels);
1267	1				4	$self->{'_charstatelabels'} = [];
1268						}
1269						else {
1270	1				3	my ( @charlabels, @statelabels );
1271	1				2	my $charnum = 1;
1272	1				4	while (@labels) {
1273
1274						# expecting an index at the beginning of the statement
1275	2				3	my $index = shift @labels;
1276	2	50			6	$index != $charnum && _bad_format( "Expecting character number $charnum, observed $index in CHARSTATELABELS" );
1277
1278						# then the character label
1279	2				4	push @charlabels, shift @labels;
1280
1281						# then a forward slash
1282	2				4	my $slash = shift @labels;
1283	2	50			4	$slash ne '/' && _bad_format( "Expecting /, observed $slash in CHARSTATELABELS" );
1284
1285						# then a list of state labels
1286	2				3	my @stateset;
1287	2		100		16	push @stateset, shift @labels while(@labels and $labels[0] ne ',');
1288	2				5	push @statelabels, \@stateset;
1289
1290						# then a comma
1291	2	100			5	if ( @labels ) {
1292	1	50			3	$labels[0] eq ',' ? shift @labels : _bad_format( "Expecting , observed $labels[0] in CHARSTATELABELS" );
1293						}
1294	2				4	$charnum++;
1295						}
1296	1				7	$matrix->set_charlabels(\@charlabels);
1297	1				4	$matrix->set_statelabels(\@statelabels);
1298	1				4	$self->{'_charstatelabels'} = [];
1299						}
1300						}
1301
1302						# finalize taxon set
1303						elsif ( uc $self->{'_previous'} eq 'TAXSET' ) {
1304	2				7	my $taxa = $self->_find_last_seen_taxa_block( $self->{'_taxset'}->{'taxa'} );
1305	2				7	my $set = $self->_factory->create_set( '-name' => $self->{'_taxset'}->{'name'} );
1306	2				13	$taxa->add_set($set);
1307	2				4	my $range = $self->{'_taxset'}->{'range'};
1308	2				4	my @range;
1309	2				4	while ( @{ $range } ) {
	8				17
1310	6				7	my $index = shift @{ $range };
	6				9
1311	6	100	100		18	if ( $range->[0] && $range->[0] eq '-' ) {
1312	2				4	shift @{ $range };
	2				3
1313	2				4	my $end = shift @{ $range };
	2				5
1314	2				7	push @range, ( $index - 1 ) .. ( $end - 1 );
1315						}
1316						else {
1317	4				7	push @range, ( $index - 1 );
1318						}
1319						}
1320	2				5	for my $i ( @range ) {
1321	8				22	my $taxon = $taxa->get_by_index($i);
1322	8	50			12	if ( $taxon ) {
1323	8				20	$taxa->add_to_set($taxon,$set);
1324						}
1325						else {
1326	0				0	_bad_format( "No taxon at index $i" );
1327						}
1328						}
1329	2				10	$self->{'_taxset'} = {};
1330						}
1331
1332						# finalize taxa labels
1333						elsif ( uc $self->{'_previous'} eq 'TAXLABELS' ) {
1334	13				28	foreach my $name ( @{ $self->{'_taxlabels'} } ) {
	13				33
1335	59				154	my $taxon = $self->_factory->create_taxon( '-name' => $name );
1336	59				172	$self->_current->insert($taxon);
1337						}
1338	13	100			42	if ( $self->_current->get_ntax != $self->{'_ntax'} ) {
1339						_bad_format(
1340						sprintf(
1341						'Mismatch between observed and expected ntax: %d vs %d',
1342	1				3	$self->_current->get_ntax, $self->{'_ntax'}
1343						)
1344						);
1345						}
1346
1347						# XXX taxa cleanup here
1348	12				28	$self->{'_ntax'} = undef;
1349	12				32	$self->{'_taxlabels'} = [];
1350						}
1351
1352						# finalize symbols list
1353						elsif ( uc $self->{'_previous'} eq 'SYMBOLS' ) {
1354	5				9	my $logsymbols = join( ' ', @{ $self->{'_symbols'} } );
	5				14
1355	5				18	$self->_logger->info("symbols: $logsymbols");
1356	5				14	$self->{'_symbols'} = [];
1357						}
1358
1359						# finalize character labels
1360						elsif ( uc $self->{'_previous'} eq 'CHARLABELS' ) {
1361	0	0				if ( @{ $self->{'_charlabels'} } ) {
	0
1362	0					my $logcharlabels = join( ' ', @{ $self->{'_charlabels'} } );
	0
1363	0					$self->_logger->info("charlabels: $logcharlabels");
1364						}
1365						}
1366
1367						# finalize state labels
1368						elsif ( uc $self->{'_previous'} eq 'STATELABELS' ) {
1369	0	0				if ( @{ $self->{'_statelabels'} } ) {
	0
1370	0					my $logstatelabels = join( ' ', @{ $self->{'_statelabels'} } );
	0
1371	0					$self->_logger->info("statelabels: $logstatelabels");
1372						}
1373						}
1374						}
1375
1376						# podinherit_insert_token
1377
1378						=head1 SEE ALSO
1379
1380						There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo>
1381						for any user or developer questions and discussions.
1382
1383						=over
1384
1385						=item L<Bio::Phylo::IO>
1386
1387						The nexus parser is called by the L<Bio::Phylo::IO> object. Look there for
1388						examples of file parsing and manipulation.
1389
1390						=item L<Bio::Phylo::Manual>
1391
1392						Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>.
1393
1394						=back
1395
1396						=head1 CITATION
1397
1398						If you use Bio::Phylo in published research, please cite it:
1399
1400						B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
1401						and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
1402						I<BMC Bioinformatics> B<12>:63.
1403						L<http://dx.doi.org/10.1186/1471-2105-12-63>
1404
1405						=cut
1406
1407						1;