File Coverage

blib/lib/DDC/PP/yyqlexer.pm

Criterion	Covered	Total	%
statement	191	316	60.4
branch	180	284	63.3
condition	19	36	52.7
subroutine	21	28	75.0
pod	0	20	0.0
total	411	684	60.0

line	stmt	bran	cond	sub	pod	time	code
1							##-- Mode: CPerl --
2
3							## File: DDC::::yyqlexer.pm
4							## Author: Bryan Jurish
5							## Description:
6							## + lexer for ddc queries (formerly DDC::Query::yylexer)
7							## + last updated for ddc v2.1.15
8							##======================================================================
9
10							package DDC::PP::yyqlexer;
11	20			20		467	use 5.010; ##-- we need at least v5.10.0 for /p regex modifier
	20					69
12	20			20		144	use Encode qw(encode_utf8 decode_utf8);
	20					37
	20					1077
13	20			20		120	use Carp;
	20					44
	20					1014
14	20			20		131	use IO::File;
	20					37
	20					2616
15	20			20		141	use IO::Handle;
	20					45
	20					693
16	20			20		112	use strict;
	20					41
	20					6889
17
18							##======================================================================
19							## Globals etc.
20							our @ISA = qw();
21
22							##----------------------------------------------------------------------
23							## Globals: regexes for Parse::Lex lexer token regexes
24
25							## %DEF
26							## + common shared regex definitions
27							our (%DEF);
28							BEGIN {
29							##-- adapted from ddc-2.1.1/src/ConcordLib/yyQLexer.l ; extra escapes needed for backslashes ('\\' -> '\\\\')
30
31							##-- whitespace
32	20			20		123	$DEF{ws} = '[ \t\n\r\f\x0b]';
33
34							##-- integer boundary characters
35	20					60	$DEF{int_boundary} = '[ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\/\\\\\'\".$@_+-]';
36
37							##-- bareword symbols
38	20					34	$DEF{symbol_cfirst} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\".$@]';
39	20					50	$DEF{symbol_crest} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\"]';
40	20					44	$DEF{symbol_cescape} = '(?:\\\\.)';
41	20					102	$DEF{symbol_text} = "(?:$DEF{symbol_cescape}\|$DEF{symbol_cfirst})(?:$DEF{symbol_cescape}\|$DEF{symbol_crest})*";
42
43							##-- subcorpus symbols (ddc >= v2.2.0; also allow '!' here)
44	20					51	$DEF{corpus_cfirst} = '[^ \t\n\r\f\v\0&\|?^%,:;#=~(){}<>\[\]\\\'\"$@]';
45	20					55	$DEF{corpus_crest} = '[^ \t\n\r\f\v\0&\|?^%,:;#=~(){}<>\[\]\\\'\"]';
46	20					83	$DEF{corpus_text} = "(?:$DEF{symbol_cescape}\|$DEF{corpus_cfirst})(?:$DEF{symbol_cescape}\|$DEF{corpus_crest})*";
47
48							##-- bareword index names (underscore and digits ok, but no '.', '-', or '+')
49	20					37	$DEF{index_char} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\/\'\".$@+-]';
50	20					62	$DEF{index_name} = "(?:$DEF{index_char}\|$DEF{symbol_cescape})+";
51
52							##-- single-quoted symbols
53	20					44	$DEF{sq_text} = "(?:[^\']\|$DEF{symbol_cescape})*";
54
55							##-- bareword dates (>= 4 digits of year, otherwise breaks {count(...) #by[$w=1-1]})
56	20					42	$DEF{date_bare} = "[+-]?[0-9]{4,}(-[0-9]{1,2}){1,2}";
57
58							##-- regexes
59	20					43	$DEF{regex_text} = "(?:(?:\\\\.)\|[^\\\\/])*";
60	20					35	$DEF{regex_modifier} = '[dgimsx]';
61
62							##-- comments
63	20					54	$DEF{comment_text} = "(?:\\\\.\|[^\\]])*";
64
65							##-- compile patterns
66	20					149	foreach (keys %DEF) {
67							#print STDERR __PACKAGE__, ": compiling regex macro: $_ ~ /$DEF{$_}/\n";
68	320					10734	$DEF{$_} = qr/$DEF{$_}/;
69							}
70							}
71
72							##======================================================================
73							## $lex = $CLASS_OR_OBJ->new(%args)
74							## + abstract constructor
75							## + %$lex, %args:
76							## {
77							## src => $name, ##-- source name
78							## fh => $srcfh, ##-- source filehandle
79							## bufr => \$buf, ##-- source buffer (string reference)
80							## bufp => $pos, ##-- current pos() in source buffer
81							## buf => $buf, ##-- local buffer (for filehandle input)
82							## state => $q, ##-- symbolic state name (default: 'INITIAL')
83							## stack => \@stack, ##-- state stack
84							##
85							## ##-- utf-8 or byte mode?
86							## utf8 => $bool, ##-- whether to use utf8 or byte-mode (default: true (non-compatible but handy))
87							##
88							## ##-- runtime data
89							## yytext => $text, ##-- current text
90							## yytype => $type, ##-- current token type
91							## yylineno => $line, ##-- current source line (file input only)
92							## }
93							sub new {
94	15			15	0	39	my $that = shift;
95	15		33			224	my $lex = bless({
96							src =>undef,
97							fh =>undef,
98							bufr =>undef,
99							bufp =>undef,
100							buf =>undef,
101							utf8 =>1,
102							state => 'INITIAL',
103							stack => [],
104
105							yytext=>undef,
106							yytype=>undef,
107							yylineno=>undef,
108
109							##-- lexer comment retention hacks
110							comments=>[],
111
112							##-- user args
113							@_
114							},
115							ref($that)\|\|$that
116							);
117	15					171	return $lex;
118							}
119
120							## $lex = $lex->clear()
121							## + clear lexer buffer, source, etc
122							sub clear {
123	398			398	0	534	my $lex = shift;
124	398					1221	delete @$lex{qw(src fh bufr bufp buf yytext yytype yylineno)};
125	398					644	$lex->{state} = 'INITIAL';
126	398					524	@{$lex->{stack}} = qw();
	398					674
127	398					560	@{$lex->{comments}} = qw();
	398					584
128	398					543	delete $lex->{_cmtbuf};
129	398					653	return $lex;
130							}
131	20			20		86428	BEGIN { reset = close = \&clear; }
132
133							##======================================================================
134							## I/O
135
136							## $lex = $lex->from($which,$src, %opts)
137							## + $which is one of qw(fh file string)
138							## + $src is the actual source (default: 'string')
139							sub from {
140	199			199	0	407	my ($lex,$which,$src,%opts) = @_;
141	199	50				471	return $lex->fromFh($src,%opts) if ($which eq 'fh');
142	199	50				352	return $lex->fromFile($src,%opts) if ($which eq 'file');
143	199					507	return $lex->fromString($src,%opts);
144							}
145
146							## $lex = $lex->fromFile($filename_or_handle,%opts)
147							sub fromFile {
148	0			0	0	0	my ($lex,$file,%opts) = @_;
149	0	0				0	return $lex->fromFh($file,%opts) if (ref($file));
150	0	0				0	my $fh = IO::File->new("<$file")
151							or confess("cannot open '$file' for read: $!");
152	0	0				0	binmode($fh,':encoding(utf8)') if ($lex->{utf8});
153	0					0	return $lex->fromFh($fh,src=>"file \`$file'",%opts);
154							}
155
156							our $FH_SLURP=0; ##-- DEBUG: slurp whole files instead of line-wise input
157
158							## $lex = $lex->fromFh($fh,%opts)
159							## + uses native $fh encoding
160							sub fromFh {
161	0			0	0	0	my ($lex,$fh,%opts) = @_;
162	0	0				0	if ($FH_SLURP) {
163							##-- always use string mode
164	0					0	local $/=undef;
165	0					0	my $buf = $fh->getline;
166	0					0	$fh->close();
167	0					0	return $lex->fromString(\$buf,src=>"filehandle \`$fh'",%opts);
168							}
169							##-- line-wise buffering
170	0					0	$lex->clear();
171	0					0	@$lex{keys %opts} = values(%opts);
172	0					0	$lex->{fh} = $fh;
173	0					0	$lex->{buf} = undef;
174	0					0	$lex->{bufr} = \$lex->{buf};
175	0					0	$lex->{bufp} = 0;
176	0	0				0	$lex->{src} = "filehandle \`$fh'" if (!defined($lex->{src}));
177	0					0	return $lex;
178							}
179
180							## $lex = $lex->fromString($str,%opts)
181							## $lex = $lex->fromString(\$str,%opts)
182							sub fromString {
183	199			199	0	342	my ($lex,$str,%opts) = @_;
184	199					424	$lex->clear();
185	199	50				463	if (ref($str)) {
186	199					339	$lex->{bufr} = $str;
187	199	50				797	$lex->{src} = "buffer \`$str'" if (!defined($lex->{src}));
188							} else {
189	0					0	$lex->{bufr} = \$str;
190	0	0				0	$lex->{src} = "string \`$str'" if (!defined($lex->{src}));
191							}
192	199					390	$lex->{bufp} = 0;
193	199					339	$lex->{yylineno} = 0;
194
195							##-- utf8 checks
196	199	50	33			473	if ($lex->{utf8} && !utf8::is_utf8(${$lex->{bufr}})) {
	199	0	0			757
197							##-- lexer:utf8, string:bytes --> assume string is utf8-encoded
198	199	50				307	${$lex->{bufr}} = decode_utf8(${$lex->{bufr}}) if (!utf8::is_utf8(${$lex->{bufr}}));
	199					5239
	199					641
	199					467
199							}
200	0					0	elsif (!$lex->{utf8} && utf8::is_utf8(${$lex->{bufr}})) {
201							##-- lexer:bytes, string:utf8 --> encode as utf8 octets
202	0					0	${$lex->{bufr}} = encode_utf8(${$lex->{bufr}});
	0					0
	0					0
203							}
204
205	199					694	return $lex;
206							}
207
208							##======================================================================
209							## Utilities
210
211							## $bool = $lex->eof()
212							## + true iff at end-of-file
213							sub eof {
214	1075	50		1075	0	2412	return $FH_SLURP ? $_[0]->eob() : !$_[0]->getmore();
215							}
216
217							## $bool = $lex->eob()
218							## + true at end-of-buffer
219							sub eob {
220	1075		66	1075	0	2372	return (!$_[0]{bufr} \|\| !${$_[0]{bufr}} \|\| ($_[0]{bufp}\|\|0) >= length(${$_[0]{bufr}}));
221							}
222
223							## $bufr_or_undef = $lex->getmore()
224							## + returns true iff there is still data in the buffer
225							sub getmore {
226	1075	100		1075	0	1706	return $_[0]{bufr} if (!$_[0]->eob());
227	197	50				553	if (defined($_[0]{fh})) {
228	0					0	$_[0]{bufp} = 0;
229	0					0	$_[0]{buf} = $_[0]{fh}->getline;
230	0					0	$_[0]{bufr} = \$_[0]{buf};
231	0					0	$_[0]{yylineno} = $_[0]{fh}->input_line_number;
232	0	0				0	return defined($_[0]{buf}) ? $_[0]{bufr} : undef;
233							}
234	197					547	return undef;
235							}
236
237							##======================================================================
238							## Runtime lexer accessors
239
240							## $yytext = $lex->yytext
241							## + always defined; otherwise using $lex->{yytext} is faster
242	2	50		2	0	38	sub yytext { return defined($_[0]{yytext}) ? $_[0]{yytext} : ''; }
243
244							## $yytype = $lex->yytype
245							## + always defined; otherwise using $lex->{yytype} is faster
246	2	50		2	0	22	sub yytype { return defined($_[0]{yytype}) ? $_[0]{yytype} : '__EOF__'; }
247
248							## $line = $lex->yylineno()
249							## + returns current line number
250							sub yylineno {
251	2			2	0	12	return $_[0]{yylineno};
252							}
253
254							## $pos = $lex->yycolumn()
255							## + return column at which current token starts (if any)
256							sub yycolumn {
257	0	0	0	0	0	0	return ($_[0]{bufp}\|\|0) - (defined($_[0]{yytext}) ? length($_[0]{yytext}) : 0);
258							}
259
260							## $pos = $lex->yypos()
261							## + return byte position in current line (or input string)
262							sub yypos {
263	0		0	0	0	0	return ($_[0]{bufp}\|\|0);
264							}
265
266							## $string = $lex->yyerror(@msg)
267							## + create a helpful error message
268							sub yyerror {
269	0			0	0	0	my $lex = shift;
270	0					0	confess(ref($lex).": error in ".$lex->yywhere().join('',@_));
271							}
272
273							## $string = $lex->yywhere()
274							## + location string used by yyerror()
275							sub yywhere {
276	0			0	0	0	my $lex = shift;
277							return ("$lex->{src} at "
278							.(defined($lex->{fh}) ? ("line $lex->{yylineno}, ") : '')
279							."column ".$lex->yycolumn
280	0	0				0	.", near ".(defined($lex->{yytext}) ? "\`$lex->{yytext}\'" : '__EOF__')
		0
281							);
282							}
283
284							## $q = $lex->yypushq($new_state)
285							sub yypushq {
286	19			19	0	56	my ($lex,$qnew) = @_;
287	19					31	push(@{$lex->{stack}},$lex->{state});
	19					54
288	19					45	return $lex->{state} = $qnew;
289							}
290
291							## $q = $lex->yypopq()
292							sub yypopq {
293	21			21	0	33	my $lex = shift;
294	21		100			38	return $lex->{state} = pop(@{$lex->{stack}}) \|\| 'INITIAL';
295							}
296
297							##======================================================================
298							## Runtime lexer routines
299
300							## ($typ,$text) = $lex->yylex()
301							## + get next token from input stream
302							sub yylex {
303	1057			1057	0	1654	my $lex = shift;
304	1057					1532	my ($bufr,$type,$text,$match,@part);
305							#use re 'eval'; ##-- dangerous!
306							LEXBUF:
307	1057					2036	while (!$lex->eof()) {
308	878					1500	$bufr = $lex->{bufr};
309	878					1997	pos($$bufr) = $lex->{bufp};
310
311							LEXSKIP:
312							##------------------------------------
313							## LEXSKIP: main lexer loop
314	878					1390	while (1) {
315	1241					1903	$type = $text = $match = undef;
316	1241					1714	@part = qw();
317
318							##------------------------
319	1241	100	100			3260	if ($lex->{state} eq 'INITIAL' \|\| $lex->{state} eq 'Q_MATCHID') {
		100
		100
		100
		100
		50
		50
320
321							##-- end-of-file (should be first pattern)
322	1137	100	100			48956	if ($$bufr =~ m/\G\z/) { $type = '__EOF__'; }
	18	100	100			33
		100	66
		100	33
		100	33
		50
		100
		100
		100
		100
		50
		50
		100
		100
		100
		100
		100
		100
		50
		100
		100
		50
		50
		50
		50
		50
		100
		50
		50
		50
		50
		50
		50
		50
		50
		50
		100
		50
		50
		100
		50
		50
		50
		100
		100
		50
		50
		50
		50
		100
		100
		100
		50
		50
		50
		50
		100
		50
		50
		100
		100
		100
		100
		50
		100
		50
		50
		100
		50
		100
		50
		50
		50
		100
		100
		100
		100
		100
		50
		50
		50
		0
323
324							##-- comments
325	14					25	elsif ($$bufr =~ m/\G\#:[^\n]*/sp) { $type='__SKIP__'; push(@{$lex->{comments}},${^MATCH}."\n"); }
	14					24
	14					60
326	19					34	elsif ($$bufr =~ m/\G\#\[/p) { $type='__SKIP__'; $lex->{_cmtbuf}=${^MATCH}; $lex->yypushq('Q_COMMENT'); }
	19					56
	19					69
327
328							##-- operators
329	21					40	elsif ($$bufr =~ m/\G\&\&/p) { $type = 'OP_BOOL_AND'; }
330	2					6	elsif ($$bufr =~ m/\G\\|\\|/p) { $type = 'OP_BOOL_OR'; }
331	0					0	elsif ($$bufr =~ m/\Gnear/pi) { $type = 'NEAR'; }
332	2					7	elsif ($$bufr =~ m/\G(?:\!=\|\&\!=\|\&=\s\!\|\!with\|with(?:out\|\s\!))/pi) { $type = 'WITHOUT'; }
333	2					6	elsif ($$bufr =~ m/\G(?:\\|=\|withor\|orwith\|wor)/pi) { $type = 'WITHOR'; }
334	2					6	elsif ($$bufr =~ m/\G(?:\&=\|with)/pi) { $type = 'WITH'; }
335	28					56	elsif ($$bufr =~ m/\Gcount/pi) { $type = 'COUNT'; }
336	0					0	elsif ($$bufr =~ m/\Gkeys/pi) { $type = 'KEYS'; }
337
338							##-- count-by keywords
339	0					0	elsif ($$bufr =~ m/\G(?:file\|doc)_?id/pi) { $type = 'KW_FILEID'; }
340	2					5	elsif ($$bufr =~ m/\G(?:file\|doc)_?(?:name)?/pi) { $type = 'KW_FILENAME'; }
341	2					5	elsif ($$bufr =~ m/\Gdate/pi) { $type = 'KW_DATE'; $lex->{state}='Q_DATE' }
	2					4
342
343							##-- query operators
344	17					33	elsif ($$bufr =~ m/\G\#(?:comment\|cmt)/pi) { $type = 'KW_COMMENT'; }
345	2					6	elsif ($$bufr =~ m/\G\#(?:(?:co?n?te?xt?\|n))/pi) { $type = 'CNTXT'; }
346	2					4	elsif ($$bufr =~ m/\G\#(?:with)?in/pi) { $type = 'WITHIN'; }
347	3					7	elsif ($$bufr =~ m/\G\#(?:sep(?:arate)?\|nojoin)(?:_hits)?/pi) { $type = 'SEPARATE_HITS'; }
348	0					0	elsif ($$bufr =~ m/\G\#(?:nosep(?:arate)?\|join)(?:_hits)?/pi) { $type = 'NOSEPARATE_HITS'; }
349	4					10	elsif ($$bufr =~ m/\G\#(?:is_\|has_)?date/pi) { $type = 'IS_DATE'; }
350	2					5	elsif ($$bufr =~ m/\G\#has(?:_field)?/pi) { $type = 'HAS_FIELD'; }
351	0					0	elsif ($$bufr =~ m/\G\#file(?:_?)names/pi) { $type = 'FILENAMES_ONLY'; }
352	0					0	elsif ($$bufr =~ m/\G\#debug_rank/pi) { $type = 'DEBUG_RANK'; }
353	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_rank/pi) { $type = 'GREATER_BY_RANK'; }
354	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_rank/pi) { $type = 'LESS_BY_RANK'; }
355	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_date/pi) { $type = 'GREATER_BY_DATE'; }
356	10					24	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_date/pi) { $type = 'LESS_BY_DATE'; }
357	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_size/pi) { $type = 'GREATER_BY_SIZE'; }
358	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_size/pi) { $type = 'LESS_BY_SIZE'; }
359	0					0	elsif ($$bufr =~ m/\G\#(?:is_\|has_)?size/pi) { $type = 'IS_SIZE'; }
360	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?left/pi) { $type = 'LESS_BY_LEFT'; }
361	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)left/pi) { $type = 'GREATER_BY_LEFT'; }
362	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?right/pi) { $type = 'LESS_BY_RIGHT'; }
363	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)right/pi) { $type = 'GREATER_BY_RIGHT'; }
364	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?mid(?:dle)?/pi) { $type = 'LESS_BY_MIDDLE'; }
365	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)mid(?:dle)?/pi) { $type = 'GREATER_BY_MIDDLE'; }
366	2					10	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)(?:_by)?_key)/pi) { $type = 'LESS_BY_KEY'; }
367	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)(?:_by)?_key)/pi) { $type = 'GREATER_BY_KEY'; }
368	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)(?:_by)?_(?:count\|val(?:ue)?))/pi) { $type = 'LESS_BY_COUNT'; }
369	2					17	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)(?:_by)?_(?:count\|val(?:ue)?))/pi) { $type = 'GREATER_BY_COUNT'; }
370	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?/pi) { $type = 'LESS_BY'; }
371	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?/pi) { $type = 'GREATER_BY'; }
372	0					0	elsif ($$bufr =~ m/\G\#rand(?:om)?/pi) { $type = 'RANDOM'; }
373	28					64	elsif ($$bufr =~ m/\G\#by/pi) { $type = 'BY'; }
374	2					6	elsif ($$bufr =~ m/\G\#samp(?:le)?/pi) { $type = 'SAMPLE'; }
375	0					0	elsif ($$bufr =~ m/\G\#clim(?:it)?/pi) { $type = 'CLIMIT'; }
376
377							##-- regexes
378	0					0	elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='NEG_REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0					0
	0					0
379	0					0	elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\//po) { $type='NEG_REGEX'; $text=$1; }
	0					0
380	0					0	elsif ($$bufr =~ m/\G\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0					0
	0					0
381	11					28	elsif ($$bufr =~ m/\G\/($DEF{regex_text})\//po) { $type='REGEX'; $text=$1; }
	11					27
382	2					6	elsif ($$bufr =~ m/\Gs\/($DEF{regex_text})\//po) { $type='REGEX_SEARCH'; $text=$1; $lex->{state}='Q_REGREP'; }
	2					5
	2					4
383
384							##-- punctutation & special characters
385	2					65	elsif ($$bufr =~ m/\G#=/p) { $type = 'HASH_EQUAL'; } ##-- hash+equal: exact distance
386	0					0	elsif ($$bufr =~ m/\G#
387	0					0	elsif ($$bufr =~ m/\G#>/p) { $type = 'HASH_GREATER'; } ##-- hash+greater: min distance
388	0					0	elsif ($$bufr =~ m/\G\$\./p) { $type = 'DOLLAR_DOT'; } ##-- positional anchor pseudo-index
389	0					0	elsif ($$bufr =~ m/\G\:\{/p) { $type = 'COLON_LBRACE'; } ##-- theusaurus-query operator
390	4					10	elsif ($$bufr =~ m/\G\@\{/p) { $type = 'AT_LBRACE'; } ##-- literal-set operator
391	0					0	elsif ($$bufr =~ m/\G\*\{/p) { $type = 'STAR_LBRACE'; } ##-- prefix-set opener
392	0					0	elsif ($$bufr =~ m/\G\}\*/p) { $type = 'RBRACE_STAR'; } ##-- suffix-set closer
393	74					189	elsif ($$bufr =~ m/\G[!.,;=@%^#~\/]/p) { $type = ${^MATCH}; } ##-- single-char punctuation operators
394	164					411	elsif ($$bufr =~ m/\G[\[\]{}()<>]/p) { $type = ${^MATCH}; } ##-- parentheses
395	4					14	elsif ($$bufr =~ m/\G\"/p) { $type = ${^MATCH}; } ##-- double-quotes
396
397							##-- subcorpus path-lists
398	26					63	elsif ($$bufr =~ m/\G\:/p) { $type = ${^MATCH}; $lex->{state}='Q_CORPORA'; }
	26					47
399
400							##-- truncated symbols
401	0					0	elsif ($$bufr =~ m/\G\\'($DEF{sq_text})\'\/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated quoted string (infix symbol)
	0					0
402	2					7	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated quoted string (prefix symbol)
	2					6
403	0					0	elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated quoted string (suffix symbol)
	0					0
404
405	0					0	elsif ($$bufr =~ m/\G\($DEF{symbol_text})\/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated bareword (infix symbol)
	0					0
406	2					8	elsif ($$bufr =~ m/\G($DEF{symbol_text})\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated bareword (prefix symbol)
	2					8
407	0					0	elsif ($$bufr =~ m/\G\*($DEF{symbol_text})/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated bareword (suffix symbol)
	0					0
408
409							##-- index names (special handling to allow count(foo) #by[$w-1]
410	24					56	elsif ($$bufr =~ m/\G\$($DEF{index_name})/po) { $type='INDEX'; $text=$1; }
	24					53
411	0					0	elsif ($$bufr =~ m/\G\$'($DEF{index_name})'/po) { $type='INDEX'; $text=$1; }
	0					0
412	0					0	elsif ($$bufr =~ /m\G\$/p) { $type='$'; }
413
414							##-- numeric tokens (handled below with symbols for perl lexer w/o longest-match)
415
416							##-- single-term wildcard
417	0					0	elsif ($$bufr =~ m/\G\/p) { $type = ''; }
418
419							##-- term expander pipelines
420	4					11	elsif ($$bufr =~ m/\G\\|/p) { $lex->{state}='Q_XPIPE'; $type='__SKIP__'; }
	4					5
421
422							##-- symbols: quoted
423							elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) {
424	39					117	$text = $1;
425	39	100				190	if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; }
	10	100				25
426	6					15	elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; }
427	23					44	else { $type='SYMBOL'; }
428							}
429							##-- symbols: barewords
430							elsif ($lex->{state} eq 'INITIAL' && $$bufr =~ m/\G$DEF{symbol_text}/po) {
431	269					759	$text = ${^MATCH};
432	269					402	$match = ${^MATCH};
433	269	100				1188	if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; }
	36	100				77
434	6					13	elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; }
435	227					395	else { $type='SYMBOL'; }
436							}
437							##-- barewords: integers and dates (Q_MATCHID)
438	14					41	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+(?=$DEF{int_boundary})/po) { $type='INTEGER'; }
439	26					122	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+\z/p) { $type='INTEGER'; }
440
441	0					0	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}(?=$DEF{int_boundary})/po) { $type='DATE'; }
442	0					0	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}\z/po) { $type='DATE'; }
443
444							##-- misc
445	284					622	elsif ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; }
446							#elsif ($$bufr =~ m/\G./p) { $type = 'SYMBOL'; }
447	0					0	elsif ($$bufr =~ m/\G./p) { $type = '__ERROR__'; }
448
449	1137	100				2776	$match = ${^MATCH} if (!defined($match));
450							}
451							##------------------------
452							elsif ($lex->{state} eq 'Q_CORPORA') {
453	58	100				867	if ($$bufr =~ m/\G$DEF{corpus_text}/p) { $type='SYMBOL'; }
	22	100				53
		100
		50
454	18					37	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $type='SYMBOL'; $text=$1; }
	18					41
455	16					43	elsif ($$bufr =~ m/\G\,/p) { $type=${^MATCH}; }
456	0					0	elsif ($$bufr =~ m/\G\s+/p) { $type='__SKIP__'; }
457	2					7	else { $type='__SKIP__'; $lex->yypopq(); }
	2					11
458
459	58					110	$match = ${^MATCH};
460							}
461							##------------------------
462							elsif ($lex->{state} eq 'Q_COMMENT') {
463	38	100				422	if ($$bufr =~ m/\G\]/p) {
		50
464	19					33	$type='__SKIP__';
465	19					42	$lex->{_cmtbuf} .= ${^MATCH};
466	19					29	push(@{$lex->{comments}}, $lex->{_cmtbuf});
	19					48
467	19					39	delete $lex->{_cmtbuf};
468	19					65	$lex->yypopq();
469							}
470	19					47	elsif ($$bufr =~ m/\G$DEF{comment_text}/sp) { $type='__SKIP__'; $lex->{_cmtbuf} .= ${^MATCH}; }
	19					51
471	0					0	else { $type='__ERROR__'; }
472
473	38					84	$match = ${^MATCH};
474							}
475							##------------------------
476							elsif ($lex->{state} eq 'Q_DATE') {
477	2	50				18	if ($$bufr =~ m/^\s+/) { $type = '__SKIP__'; }
	0	50				0
478	0					0	elsif ($$bufr =~ m/^\//) { $type = '/'; $lex->{state}='INITIAL'; }
	0					0
479	2					5	else { $lex->{state}='INITIAL'; $type='__SKIP__'; }
	2					5
480
481	2					4	$match = ${^MATCH};
482							}
483							##------------------------
484							elsif ($lex->{state} eq 'Q_REGREP') {
485	2	50				135	if ($$bufr =~ m/\G($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0	50				0
	0					0
	0					0
486	2					6	elsif ($$bufr =~ m/\G($DEF{regex_text})\//po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='INITIAL'; }
	2					5
	2					6
487
488	2					6	$match = ${^MATCH};
489							}
490							##------------------------
491							elsif ($lex->{state} eq 'Q_REGOPT') {
492	0	0				0	if ($$bufr =~ m/\G$DEF{regex_modifier}+/po) { $type='REGOPT'; }
	0					0
493	0					0	else { $type='__SKIP__'; }
494	0					0	$lex->{state} = 'INITIAL';
495
496	0					0	$match = ${^MATCH};
497							}
498							##------------------------
499							elsif ($lex->{state} eq 'Q_XPIPE') {
500	4	50				193	if ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } ##-- whitespace: skip
	0	50				0
		50
		50
501	0					0	elsif ($$bufr =~ m/\G\-/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
	0					0
502	0					0	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=$1; }
	0					0
	0					0
503	4					12	elsif ($$bufr =~ m/\G$DEF{symbol_text}/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
	4					7
504							#elsif ($$bufr =~ m/\G\z/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
505	0					0	else { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=''; }
	0					0
	0					0
506
507	4					11	$match = ${^MATCH};
508							}
509							##------------------------
510							## END perl-ification of flex sources
511
512							##-- guts
513	1241	100				2232	$text = $match if (!defined($text));
514	1241	100				2973	$lex->{bufp} += length($match) if (defined($match));
515
516	1241					2512	pos($$bufr) = $lex->{bufp};
517	1241	50				2726	return if (!defined($type));
518
519	1241	100				2562	next LEXSKIP if ($type eq '__SKIP__');
520	878	100				1490	next LEXBUF if ($type eq '__EOF__');
521							#elsif ($type eq '__ERROR__') {
522							# return $lex->yyerror();
523							#}
524	860					4334	return @$lex{qw(yytype yytext)} = ($type,$text);
525							}
526							}
527	197					772	return @$lex{qw(yytype yytext)} = ('__EOF__',undef);
528							}
529
530							##======================================================================
531							## Testing: dummy lexing
532
533							## undef = $lex->dummylex(@from_whatever)
534							sub dummylex {
535	0			0	0		my $lex = shift;
536	0						$lex->reset();
537	0						$lex->from(@_);
538	0						my ($type,$text);
539							TOKEN:
540	0						while(1) {
541	0						($type,$text) = $lex->yylex();
542							print("-" x 64, "\n",
543							">> Line: ", $lex->yylineno, ", Pos: ", $lex->yypos, "\n",
544	0	0					">> State: ", (defined($lex->{state}) ? $lex->{state} : '(undef)'), "\n",
		0
		0
545							">> Type: ", (defined($type) ? $type : '(undef)'), "\n",
546							">> Text: ", (defined($text) ? $text : '(undef)'), "\n",
547							);
548	0	0					if (!defined($type)) {
549	0						warn(":: undef type!");
550	0						return;
551							}
552	0	0					if ($type eq '__ERROR__') {
553	0						print(":: ERROR DETECTED\n");
554	0						$lex->yyerror();
555							}
556	0	0					last if ($type eq '__EOF__');
557							}
558							}
559
560
561							1; ##-- be happy
562
563							__END__