File Coverage

blib/lib/DDC/PP/yyqlexer.pm

Criterion	Covered	Total	%
statement	198	322	61.4
branch	186	290	64.1
condition	20	36	55.5
subroutine	22	28	78.5
pod	0	20	0.0
total	426	696	61.2

line	stmt	bran	cond	sub	pod	time	code
1							##-- Mode: CPerl --
2
3							## File: DDC::::yyqlexer.pm
4							## Author: Bryan Jurish
5							## Description:
6							## + lexer for ddc queries (formerly DDC::Query::yylexer)
7							## + last updated for ddc v2.1.15
8							##======================================================================
9
10							package DDC::PP::yyqlexer;
11	20			20		509	use 5.010; ##-- we need at least v5.10.0 for /p regex modifier
	20					74
12	20			20		119	use Encode qw(encode_utf8 decode_utf8);
	20					66
	20					1139
13	20			20		122	use Carp;
	20					61
	20					1193
14	20			20		161	use IO::File;
	20					90
	20					2971
15	20			20		178	use IO::Handle;
	20					39
	20					753
16	20			20		114	use strict;
	20					58
	20					7590
17
18							##======================================================================
19							## Globals etc.
20							our @ISA = qw();
21
22							##----------------------------------------------------------------------
23							## Globals: regexes for Parse::Lex lexer token regexes
24
25							## %DEF
26							## + common shared regex definitions
27							our (%DEF);
28							BEGIN {
29							##-- adapted from ddc-2.1.1/src/ConcordLib/yyQLexer.l ; extra escapes needed for backslashes ('\\' -> '\\\\')
30
31							##-- whitespace
32	20			20		109	$DEF{ws} = '[ \t\n\r\f\x0b]';
33
34							##-- integer boundary characters
35	20					43	$DEF{int_boundary} = '[ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\/\\\\\'\".$@_+-]';
36
37							##-- bareword symbols
38	20					42	$DEF{symbol_cfirst} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\".$@]';
39	20					66	$DEF{symbol_crest} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\"]';
40	20					36	$DEF{symbol_cescape} = '(?:\\\\.)';
41	20					140	$DEF{symbol_text} = "(?:$DEF{symbol_cescape}\|$DEF{symbol_cfirst})(?:$DEF{symbol_cescape}\|$DEF{symbol_crest})*";
42
43							##-- subcorpus symbols (ddc >= v2.2.0; also allow '!' here)
44	20					73	$DEF{corpus_cfirst} = '[^ \t\n\r\f\v\0&\|?^%,:;#=~(){}<>\[\]\\\'\"$@]';
45	20					55	$DEF{corpus_crest} = '[^ \t\n\r\f\v\0&\|?^%,:;#=~(){}<>\[\]\\\'\"]';
46	20					83	$DEF{corpus_text} = "(?:$DEF{symbol_cescape}\|$DEF{corpus_cfirst})(?:$DEF{symbol_cescape}\|$DEF{corpus_crest})*";
47
48							##-- bareword index names (underscore and digits ok, but no '.', '-', or '+')
49	20					42	$DEF{index_char} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\/\'\".$@+-]';
50	20					73	$DEF{index_name} = "(?:$DEF{index_char}\|$DEF{symbol_cescape})+";
51
52							##-- single-quoted symbols
53	20					51	$DEF{sq_text} = "(?:[^\']\|$DEF{symbol_cescape})*";
54
55							##-- bareword dates (>= 4 digits of year, otherwise breaks {count(...) #by[$w=1-1]})
56	20					44	$DEF{date_bare} = "[+-]?[0-9]{4,}(-[0-9]{1,2}){1,2}";
57
58							##-- regexes
59	20					47	$DEF{regex_text} = "(?:(?:\\\\.)\|[^\\\\/])*";
60	20					49	$DEF{regex_modifier} = '[dgimsx]';
61
62							##-- comments
63	20					79	$DEF{comment_text} = "(?:\\\\.\|[^\\]])*";
64
65							##-- compile patterns
66	20					116	foreach (keys %DEF) {
67							#print STDERR __PACKAGE__, ": compiling regex macro: $_ ~ /$DEF{$_}/\n";
68	320					11879	$DEF{$_} = qr/$DEF{$_}/;
69							}
70							}
71
72							##======================================================================
73							## $lex = $CLASS_OR_OBJ->new(%args)
74							## + abstract constructor
75							## + %$lex, %args:
76							## {
77							## src => $name, ##-- source name
78							## fh => $srcfh, ##-- source filehandle
79							## bufr => \$buf, ##-- source buffer (string reference)
80							## bufp => $pos, ##-- current pos() in source buffer
81							## buf => $buf, ##-- local buffer (for filehandle input)
82							## state => $q, ##-- symbolic state name (default: 'INITIAL')
83							## stack => \@stack, ##-- state stack
84							##
85							## ##-- utf-8 or byte mode?
86							## utf8 => $bool, ##-- whether to use utf8 or byte-mode (default: true (non-compatible but handy))
87							##
88							## ##-- runtime data
89							## yytext => $text, ##-- current text
90							## yytype => $type, ##-- current token type
91							## yylineno => $line, ##-- current source line (file input only)
92							## }
93							sub new {
94	15			15	0	44	my $that = shift;
95	15		33			244	my $lex = bless({
96							src =>undef,
97							fh =>undef,
98							bufr =>undef,
99							bufp =>undef,
100							buf =>undef,
101							utf8 =>1,
102							state => 'INITIAL',
103							stack => [],
104
105							yytext=>undef,
106							yytype=>undef,
107							yylineno=>undef,
108
109							##-- lexer comment retention hacks
110							comments=>[],
111
112							##-- user args
113							@_
114							},
115							ref($that)\|\|$that
116							);
117	15					129	return $lex;
118							}
119
120							## $lex = $lex->clear()
121							## + clear lexer buffer, source, etc
122							sub clear {
123	422			422	0	615	my $lex = shift;
124	422					1393	delete @$lex{qw(src fh bufr bufp buf yytext yytype yylineno)};
125	422					762	$lex->{state} = 'INITIAL';
126	422					603	@{$lex->{stack}} = qw();
	422					827
127	422					643	@{$lex->{comments}} = qw();
	422					661
128	422					660	$lex->{yyDtrsPos} = 0;
129	422					601	$lex->{yyDtrsLen} = 0;
130	422					710	delete $lex->{_cmtbuf};
131	422					694	return $lex;
132							}
133	20			20		96156	BEGIN { reset = close = \&clear; }
134
135							##======================================================================
136							## I/O
137
138							## $lex = $lex->from($which,$src, %opts)
139							## + $which is one of qw(fh file string)
140							## + $src is the actual source (default: 'string')
141							sub from {
142	211			211	0	496	my ($lex,$which,$src,%opts) = @_;
143	211	50				514	return $lex->fromFh($src,%opts) if ($which eq 'fh');
144	211	50				431	return $lex->fromFile($src,%opts) if ($which eq 'file');
145	211					585	return $lex->fromString($src,%opts);
146							}
147
148							## $lex = $lex->fromFile($filename_or_handle,%opts)
149							sub fromFile {
150	0			0	0	0	my ($lex,$file,%opts) = @_;
151	0	0				0	return $lex->fromFh($file,%opts) if (ref($file));
152	0	0				0	my $fh = IO::File->new("<$file")
153							or confess("cannot open '$file' for read: $!");
154	0	0				0	binmode($fh,':encoding(utf8)') if ($lex->{utf8});
155	0					0	return $lex->fromFh($fh,src=>"file \`$file'",%opts);
156							}
157
158							our $FH_SLURP=0; ##-- DEBUG: slurp whole files instead of line-wise input
159
160							## $lex = $lex->fromFh($fh,%opts)
161							## + uses native $fh encoding
162							sub fromFh {
163	0			0	0	0	my ($lex,$fh,%opts) = @_;
164	0	0				0	if ($FH_SLURP) {
165							##-- always use string mode
166	0					0	local $/=undef;
167	0					0	my $buf = $fh->getline;
168	0					0	$fh->close();
169	0					0	return $lex->fromString(\$buf,src=>"filehandle \`$fh'",%opts);
170							}
171							##-- line-wise buffering
172	0					0	$lex->clear();
173	0					0	@$lex{keys %opts} = values(%opts);
174	0					0	$lex->{fh} = $fh;
175	0					0	$lex->{buf} = undef;
176	0					0	$lex->{bufr} = \$lex->{buf};
177	0					0	$lex->{bufp} = 0;
178	0	0				0	$lex->{src} = "filehandle \`$fh'" if (!defined($lex->{src}));
179	0					0	return $lex;
180							}
181
182							## $lex = $lex->fromString($str,%opts)
183							## $lex = $lex->fromString(\$str,%opts)
184							sub fromString {
185	211			211	0	419	my ($lex,$str,%opts) = @_;
186	211					487	$lex->clear();
187	211	50				523	if (ref($str)) {
188	211					397	$lex->{bufr} = $str;
189	211	50				965	$lex->{src} = "buffer \`$str'" if (!defined($lex->{src}));
190							} else {
191	0					0	$lex->{bufr} = \$str;
192	0	0				0	$lex->{src} = "string \`$str'" if (!defined($lex->{src}));
193							}
194	211					396	$lex->{bufp} = 0;
195	211					379	$lex->{yylineno} = 0;
196
197							##-- utf8 checks
198	211	50	33			567	if ($lex->{utf8} && !utf8::is_utf8(${$lex->{bufr}})) {
	211	0	0			866
199							##-- lexer:utf8, string:bytes --> assume string is utf8-encoded
200	211	50				336	${$lex->{bufr}} = decode_utf8(${$lex->{bufr}}) if (!utf8::is_utf8(${$lex->{bufr}}));
	211					5157
	211					708
	211					581
201							}
202	0					0	elsif (!$lex->{utf8} && utf8::is_utf8(${$lex->{bufr}})) {
203							##-- lexer:bytes, string:utf8 --> encode as utf8 octets
204	0					0	${$lex->{bufr}} = encode_utf8(${$lex->{bufr}});
	0					0
	0					0
205							}
206
207	211					804	return $lex;
208							}
209
210							##======================================================================
211							## Utilities
212
213							## $bool = $lex->eof()
214							## + true iff at end-of-file
215							sub eof {
216	1151	50		1151	0	2877	return $FH_SLURP ? $_[0]->eob() : !$_[0]->getmore();
217							}
218
219							## $bool = $lex->eob()
220							## + true at end-of-buffer
221							sub eob {
222	1151		66	1151	0	2746	return (!$_[0]{bufr} \|\| !${$_[0]{bufr}} \|\| ($_[0]{bufp}\|\|0) >= length(${$_[0]{bufr}}));
223							}
224
225							## $bufr_or_undef = $lex->getmore()
226							## + returns true iff there is still data in the buffer
227							sub getmore {
228	1151	100		1151	0	1976	return $_[0]{bufr} if (!$_[0]->eob());
229	209	50				655	if (defined($_[0]{fh})) {
230	0					0	$_[0]{bufp} = 0;
231	0					0	$_[0]{buf} = $_[0]{fh}->getline;
232	0					0	$_[0]{bufr} = \$_[0]{buf};
233	0					0	$_[0]{yylineno} = $_[0]{fh}->input_line_number;
234	0	0				0	return defined($_[0]{buf}) ? $_[0]{bufr} : undef;
235							}
236	209					576	return undef;
237							}
238
239							##======================================================================
240							## Runtime lexer accessors
241
242							## $yytext = $lex->yytext
243							## + always defined; otherwise using $lex->{yytext} is faster
244	2	50		2	0	42	sub yytext { return defined($_[0]{yytext}) ? $_[0]{yytext} : ''; }
245
246							## $yytype = $lex->yytype
247							## + always defined; otherwise using $lex->{yytype} is faster
248	2	50		2	0	22	sub yytype { return defined($_[0]{yytype}) ? $_[0]{yytype} : '__EOF__'; }
249
250							## $line = $lex->yylineno()
251							## + returns current line number
252							sub yylineno {
253	2			2	0	23	return $_[0]{yylineno};
254							}
255
256							## $pos = $lex->yycolumn()
257							## + return column at which current token starts (if any)
258							sub yycolumn {
259	0	0	0	0	0	0	return ($_[0]{bufp}\|\|0) - (defined($_[0]{yytext}) ? length($_[0]{yytext}) : 0);
260							}
261
262							## $pos = $lex->yypos()
263							## + return byte position in current line (or input string)
264							sub yypos {
265	102		50	102	0	356	return ($_[0]{bufp}\|\|0);
266							}
267
268							## $string = $lex->yyerror(@msg)
269							## + create a helpful error message
270							sub yyerror {
271	0			0	0	0	my $lex = shift;
272	0					0	confess(ref($lex).": error in ".$lex->yywhere().join('',@_));
273							}
274
275							## $string = $lex->yywhere()
276							## + location string used by yyerror()
277							sub yywhere {
278	0			0	0	0	my $lex = shift;
279							return ("$lex->{src} at "
280							.(defined($lex->{fh}) ? ("line $lex->{yylineno}, ") : '')
281							."column ".$lex->yycolumn
282	0	0				0	.", near ".(defined($lex->{yytext}) ? "\`$lex->{yytext}\'" : '__EOF__')
		0
283							);
284							}
285
286							## $q = $lex->yypushq($new_state)
287							sub yypushq {
288	19			19	0	49	my ($lex,$qnew) = @_;
289	19					56	push(@{$lex->{stack}},$lex->{state});
	19					59
290	19					45	return $lex->{state} = $qnew;
291							}
292
293							## $q = $lex->yypopq()
294							sub yypopq {
295	29			29	0	55	my $lex = shift;
296	29		100			49	return $lex->{state} = pop(@{$lex->{stack}}) \|\| 'INITIAL';
297							}
298
299							##======================================================================
300							## Runtime lexer routines
301
302							## ($typ,$text) = $lex->yylex()
303							## + get next token from input stream
304							sub yylex {
305	1125			1125	0	1736	my $lex = shift;
306	1125					1804	my ($bufr,$type,$text,$match,@part);
307							#use re 'eval'; ##-- dangerous!
308							LEXBUF:
309	1125					2404	while (!$lex->eof()) {
310	942					1679	$bufr = $lex->{bufr};
311	942					2334	pos($$bufr) = $lex->{bufp};
312
313							LEXSKIP:
314							##------------------------------------
315							## LEXSKIP: main lexer loop
316	942					1618	while (1) {
317	1345					2312	$type = $text = $match = undef;
318	1345					2114	@part = qw();
319
320							##------------------------
321	1345	100	100			3844	if ($lex->{state} eq 'INITIAL' \|\| $lex->{state} eq 'Q_MATCHID') {
		100
		100
		100
		100
		50
		50
322
323							##-- end-of-file (should be first pattern)
324	1213	100	100			58883	if ($$bufr =~ m/\G\z/) { $type = '__EOF__'; }
	26	100	100			42
		100	66
		100	33
		100	33
		50
		100
		100
		100
		100
		50
		50
		100
		100
		100
		100
		100
		100
		50
		100
		100
		50
		50
		50
		50
		50
		100
		50
		50
		50
		50
		50
		50
		50
		50
		50
		100
		50
		50
		100
		50
		50
		100
		50
		50
		100
		100
		50
		50
		50
		50
		100
		100
		100
		50
		50
		50
		50
		100
		50
		50
		100
		100
		100
		100
		50
		100
		50
		50
		100
		50
		100
		50
		50
		50
		100
		100
		100
		100
		100
		50
		50
		50
		0
325
326							##-- comments
327	22					77	elsif ($$bufr =~ m/\G\#:[^\n]*/sp) { $type='__SKIP__'; push(@{$lex->{comments}},${^MATCH}."\n"); }
	22					38
	22					97
328	19					39	elsif ($$bufr =~ m/\G\#\[/p) { $type='__SKIP__'; $lex->{_cmtbuf}=${^MATCH}; $lex->yypushq('Q_COMMENT'); }
	19					59
	19					64
329
330							##-- operators
331	21					39	elsif ($$bufr =~ m/\G\&\&/p) { $type = 'OP_BOOL_AND'; }
332	2					7	elsif ($$bufr =~ m/\G\\|\\|/p) { $type = 'OP_BOOL_OR'; }
333	0					0	elsif ($$bufr =~ m/\Gnear/pi) { $type = 'NEAR'; }
334	2					6	elsif ($$bufr =~ m/\G(?:\!=\|\&\!=\|\&=\s\!\|\!with\|with(?:out\|\s\!))/pi) { $type = 'WITHOUT'; }
335	2					6	elsif ($$bufr =~ m/\G(?:\\|=\|withor\|orwith\|wor)/pi) { $type = 'WITHOR'; }
336	2					5	elsif ($$bufr =~ m/\G(?:\&=\|with)/pi) { $type = 'WITH'; }
337	28					66	elsif ($$bufr =~ m/\Gcount/pi) { $type = 'COUNT'; }
338	0					0	elsif ($$bufr =~ m/\Gkeys/pi) { $type = 'KEYS'; }
339
340							##-- count-by keywords
341	0					0	elsif ($$bufr =~ m/\G(?:file\|doc)_?id/pi) { $type = 'KW_FILEID'; }
342	2					6	elsif ($$bufr =~ m/\G(?:file\|doc)_?(?:name)?/pi) { $type = 'KW_FILENAME'; }
343	2					5	elsif ($$bufr =~ m/\Gdate/pi) { $type = 'KW_DATE'; $lex->{state}='Q_DATE' }
	2					6
344
345							##-- query operators
346	17					40	elsif ($$bufr =~ m/\G\#(?:comment\|cmt)/pi) { $type = 'KW_COMMENT'; }
347	2					14	elsif ($$bufr =~ m/\G\#(?:(?:co?n?te?xt?\|n))/pi) { $type = 'CNTXT'; }
348	2					6	elsif ($$bufr =~ m/\G\#(?:with)?in/pi) { $type = 'WITHIN'; }
349	3					7	elsif ($$bufr =~ m/\G\#(?:sep(?:arate)?\|nojoin)(?:_hits)?/pi) { $type = 'SEPARATE_HITS'; }
350	0					0	elsif ($$bufr =~ m/\G\#(?:nosep(?:arate)?\|join)(?:_hits)?/pi) { $type = 'NOSEPARATE_HITS'; }
351	4					9	elsif ($$bufr =~ m/\G\#(?:is_\|has_)?date/pi) { $type = 'IS_DATE'; }
352	2					5	elsif ($$bufr =~ m/\G\#has(?:_field)?/pi) { $type = 'HAS_FIELD'; }
353	0					0	elsif ($$bufr =~ m/\G\#file(?:_?)names/pi) { $type = 'FILENAMES_ONLY'; }
354	0					0	elsif ($$bufr =~ m/\G\#debug_rank/pi) { $type = 'DEBUG_RANK'; }
355	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_rank/pi) { $type = 'GREATER_BY_RANK'; }
356	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_rank/pi) { $type = 'LESS_BY_RANK'; }
357	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_date/pi) { $type = 'GREATER_BY_DATE'; }
358	10					26	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_date/pi) { $type = 'LESS_BY_DATE'; }
359	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_size/pi) { $type = 'GREATER_BY_SIZE'; }
360	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_size/pi) { $type = 'LESS_BY_SIZE'; }
361	0					0	elsif ($$bufr =~ m/\G\#(?:is_\|has_)?size/pi) { $type = 'IS_SIZE'; }
362	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?left/pi) { $type = 'LESS_BY_LEFT'; }
363	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)left/pi) { $type = 'GREATER_BY_LEFT'; }
364	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?right/pi) { $type = 'LESS_BY_RIGHT'; }
365	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)right/pi) { $type = 'GREATER_BY_RIGHT'; }
366	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?mid(?:dle)?/pi) { $type = 'LESS_BY_MIDDLE'; }
367	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)mid(?:dle)?/pi) { $type = 'GREATER_BY_MIDDLE'; }
368	2					7	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)(?:_by)?_key)/pi) { $type = 'LESS_BY_KEY'; }
369	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)(?:_by)?_key)/pi) { $type = 'GREATER_BY_KEY'; }
370	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)(?:_by)?_(?:count\|val(?:ue)?))/pi) { $type = 'LESS_BY_COUNT'; }
371	2					7	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)(?:_by)?_(?:count\|val(?:ue)?))/pi) { $type = 'GREATER_BY_COUNT'; }
372	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?/pi) { $type = 'LESS_BY'; }
373	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?/pi) { $type = 'GREATER_BY'; }
374	4					14	elsif ($$bufr =~ m/\G\#prune(?:_less\|_asc)?(?:_by)?/pi) { $type = 'PRUNE_ASC'; }
375	0					0	elsif ($$bufr =~ m/\G\#prune(?:_greater\|_de?sc)(?:_by)?/pi) { $type = 'PRUNE_DESC'; }
376	0					0	elsif ($$bufr =~ m/\G\#rand(?:om)?/pi) { $type = 'RANDOM'; }
377	28					69	elsif ($$bufr =~ m/\G\#by/pi) { $type = 'BY'; }
378	2					6	elsif ($$bufr =~ m/\G\#samp(?:le)?/pi) { $type = 'SAMPLE'; }
379	0					0	elsif ($$bufr =~ m/\G\#clim(?:it)?/pi) { $type = 'CLIMIT'; }
380
381							##-- regexes
382	0					0	elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='NEG_REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0					0
	0					0
383	0					0	elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\//po) { $type='NEG_REGEX'; $text=$1; }
	0					0
384	0					0	elsif ($$bufr =~ m/\G\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0					0
	0					0
385	11					30	elsif ($$bufr =~ m/\G\/($DEF{regex_text})\//po) { $type='REGEX'; $text=$1; }
	11					31
386	2					7	elsif ($$bufr =~ m/\Gs\/($DEF{regex_text})\//po) { $type='REGEX_SEARCH'; $text=$1; $lex->{state}='Q_REGREP'; }
	2					7
	2					5
387
388							##-- punctutation & special characters
389	2					6	elsif ($$bufr =~ m/\G#=/p) { $type = 'HASH_EQUAL'; } ##-- hash+equal: exact distance
390	0					0	elsif ($$bufr =~ m/\G#
391	0					0	elsif ($$bufr =~ m/\G#>/p) { $type = 'HASH_GREATER'; } ##-- hash+greater: min distance
392	0					0	elsif ($$bufr =~ m/\G\$\./p) { $type = 'DOLLAR_DOT'; } ##-- positional anchor pseudo-index
393	0					0	elsif ($$bufr =~ m/\G\:\{/p) { $type = 'COLON_LBRACE'; } ##-- theusaurus-query operator
394	4					11	elsif ($$bufr =~ m/\G\@\{/p) { $type = 'AT_LBRACE'; } ##-- literal-set operator
395	0					0	elsif ($$bufr =~ m/\G\*\{/p) { $type = 'STAR_LBRACE'; } ##-- prefix-set opener
396	0					0	elsif ($$bufr =~ m/\G\}\*/p) { $type = 'RBRACE_STAR'; } ##-- suffix-set closer
397	78					234	elsif ($$bufr =~ m/\G[!.,;=@%^#~\/]/p) { $type = ${^MATCH}; } ##-- single-char punctuation operators
398	172					477	elsif ($$bufr =~ m/\G[\[\]{}()<>]/p) { $type = ${^MATCH}; } ##-- parentheses
399	4					14	elsif ($$bufr =~ m/\G\"/p) { $type = ${^MATCH}; } ##-- double-quotes
400
401							##-- subcorpus path-lists
402	34					158	elsif ($$bufr =~ m/\G\:/p) { $type = ${^MATCH}; $lex->{yyDtrsPos}=$lex->yypos(); $lex->{state}='Q_CORPORA'; }
	34					96
	34					73
403
404							##-- truncated symbols
405	0					0	elsif ($$bufr =~ m/\G\\'($DEF{sq_text})\'\/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated quoted string (infix symbol)
	0					0
406	2					8	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated quoted string (prefix symbol)
	2					9
407	0					0	elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated quoted string (suffix symbol)
	0					0
408
409	0					0	elsif ($$bufr =~ m/\G\($DEF{symbol_text})\/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated bareword (infix symbol)
	0					0
410	2					9	elsif ($$bufr =~ m/\G($DEF{symbol_text})\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated bareword (prefix symbol)
	2					9
411	0					0	elsif ($$bufr =~ m/\G\*($DEF{symbol_text})/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated bareword (suffix symbol)
	0					0
412
413							##-- index names (special handling to allow count(foo) #by[$w-1]
414	24					71	elsif ($$bufr =~ m/\G\$($DEF{index_name})/po) { $type='INDEX'; $text=$1; }
	24					66
415	0					0	elsif ($$bufr =~ m/\G\$'($DEF{index_name})'/po) { $type='INDEX'; $text=$1; }
	0					0
416	0					0	elsif ($$bufr =~ /m\G\$/p) { $type='$'; }
417
418							##-- numeric tokens (handled below with symbols for perl lexer w/o longest-match)
419
420							##-- single-term wildcard
421	0					0	elsif ($$bufr =~ m/\G\/p) { $type = ''; }
422
423							##-- term expander pipelines
424	4					14	elsif ($$bufr =~ m/\G\\|/p) { $lex->{state}='Q_XPIPE'; $type='__SKIP__'; }
	4					5
425
426							##-- symbols: quoted
427							elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) {
428	43					152	$text = $1;
429	43	100				222	if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; }
	10	100				22
430	6					16	elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; }
431	27					66	else { $type='SYMBOL'; }
432							}
433							##-- symbols: barewords
434							elsif ($lex->{state} eq 'INITIAL' && $$bufr =~ m/\G$DEF{symbol_text}/po) {
435	285					877	$text = ${^MATCH};
436	285					452	$match = ${^MATCH};
437	285	100				1423	if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; }
	40	100				79
438	6					17	elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; }
439	239					479	else { $type='SYMBOL'; }
440							}
441							##-- barewords: integers and dates (Q_MATCHID)
442	14					50	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+(?=$DEF{int_boundary})/po) { $type='INTEGER'; }
443	26					71	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+\z/p) { $type='INTEGER'; }
444
445	0					0	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}(?=$DEF{int_boundary})/po) { $type='DATE'; }
446	0					0	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}\z/po) { $type='DATE'; }
447
448							##-- misc
449	300					763	elsif ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; }
450							#elsif ($$bufr =~ m/\G./p) { $type = 'SYMBOL'; }
451	0					0	elsif ($$bufr =~ m/\G./p) { $type = '__ERROR__'; }
452
453	1213	100				3462	$match = ${^MATCH} if (!defined($match));
454							}
455							##------------------------
456							elsif ($lex->{state} eq 'Q_CORPORA') {
457	86	100				1194	if ($$bufr =~ m/\G$DEF{corpus_text}/p) { $type='SYMBOL'; }
	28	100				74
		100
		100
458	22					51	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $type='SYMBOL'; $text=$1; }
	22					49
459	18					53	elsif ($$bufr =~ m/\G\,/p) { $type=${^MATCH}; }
460	8					22	elsif ($$bufr =~ m/\G\s+/p) { $type='__SKIP__'; }
461	10					23	else { $type='__SKIP__'; $lex->yypopq(); }
	10					32
462
463	86					174	$match = ${^MATCH};
464	86	100				254	$lex->{yyDtrsLen} = $lex->yypos()-$lex->{yyDtrsPos} if ($type ne '__SKIP__');
465							}
466							##------------------------
467							elsif ($lex->{state} eq 'Q_COMMENT') {
468	38	100				462	if ($$bufr =~ m/\G\]/p) {
		50
469	19					36	$type='__SKIP__';
470	19					41	$lex->{_cmtbuf} .= ${^MATCH};
471	19					29	push(@{$lex->{comments}}, $lex->{_cmtbuf});
	19					60
472	19					56	delete $lex->{_cmtbuf};
473	19					60	$lex->yypopq();
474							}
475	19					50	elsif ($$bufr =~ m/\G$DEF{comment_text}/sp) { $type='__SKIP__'; $lex->{_cmtbuf} .= ${^MATCH}; }
	19					54
476	0					0	else { $type='__ERROR__'; }
477
478	38					96	$match = ${^MATCH};
479							}
480							##------------------------
481							elsif ($lex->{state} eq 'Q_DATE') {
482	2	50				19	if ($$bufr =~ m/^\s+/) { $type = '__SKIP__'; }
	0	50				0
483	0					0	elsif ($$bufr =~ m/^\//) { $type = '/'; $lex->{state}='INITIAL'; }
	0					0
484	2					6	else { $lex->{state}='INITIAL'; $type='__SKIP__'; }
	2					5
485
486	2					5	$match = ${^MATCH};
487							}
488							##------------------------
489							elsif ($lex->{state} eq 'Q_REGREP') {
490	2	50				158	if ($$bufr =~ m/\G($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0	50				0
	0					0
	0					0
491	2					7	elsif ($$bufr =~ m/\G($DEF{regex_text})\//po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='INITIAL'; }
	2					8
	2					7
492
493	2					7	$match = ${^MATCH};
494							}
495							##------------------------
496							elsif ($lex->{state} eq 'Q_REGOPT') {
497	0	0				0	if ($$bufr =~ m/\G$DEF{regex_modifier}+/po) { $type='REGOPT'; }
	0					0
498	0					0	else { $type='__SKIP__'; }
499	0					0	$lex->{state} = 'INITIAL';
500
501	0					0	$match = ${^MATCH};
502							}
503							##------------------------
504							elsif ($lex->{state} eq 'Q_XPIPE') {
505	4	50				248	if ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } ##-- whitespace: skip
	0	50				0
		50
		50
506	0					0	elsif ($$bufr =~ m/\G\-/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
	0					0
507	0					0	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=$1; }
	0					0
	0					0
508	4					14	elsif ($$bufr =~ m/\G$DEF{symbol_text}/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
	4					7
509							#elsif ($$bufr =~ m/\G\z/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
510	0					0	else { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=''; }
	0					0
	0					0
511
512	4					12	$match = ${^MATCH};
513							}
514							##------------------------
515							## END perl-ification of flex sources
516
517							##-- guts
518	1345	100				2662	$text = $match if (!defined($text));
519	1345	100				3572	$lex->{bufp} += length($match) if (defined($match));
520
521	1345					3111	pos($$bufr) = $lex->{bufp};
522	1345	50				3352	return if (!defined($type));
523
524	1345	100				2719	next LEXSKIP if ($type eq '__SKIP__');
525	942	100				1704	next LEXBUF if ($type eq '__EOF__');
526							#elsif ($type eq '__ERROR__') {
527							# return $lex->yyerror();
528							#}
529	916					5159	return @$lex{qw(yytype yytext)} = ($type,$text);
530							}
531							}
532	209					861	return @$lex{qw(yytype yytext)} = ('__EOF__',undef);
533							}
534
535							##======================================================================
536							## Testing: dummy lexing
537
538							## undef = $lex->dummylex(@from_whatever)
539							sub dummylex {
540	0			0	0		my $lex = shift;
541	0						$lex->reset();
542	0						$lex->from(@_);
543	0						my ($type,$text);
544							TOKEN:
545	0						while(1) {
546	0						($type,$text) = $lex->yylex();
547							print("-" x 64, "\n",
548							">> Line: ", $lex->yylineno, ", Pos: ", $lex->yypos, "\n",
549	0	0					">> State: ", (defined($lex->{state}) ? $lex->{state} : '(undef)'), "\n",
		0
		0
550							">> Type: ", (defined($type) ? $type : '(undef)'), "\n",
551							">> Text: ", (defined($text) ? $text : '(undef)'), "\n",
552							);
553	0	0					if (!defined($type)) {
554	0						warn(":: undef type!");
555	0						return;
556							}
557	0	0					if ($type eq '__ERROR__') {
558	0						print(":: ERROR DETECTED\n");
559	0						$lex->yyerror();
560							}
561	0	0					last if ($type eq '__EOF__');
562							}
563							}
564
565
566							1; ##-- be happy
567
568							__END__