File Coverage

blib/lib/Regexp/Lexer.pm

Criterion	Covered	Total	%
statement	76	77	98.7
branch	25	28	89.2
condition	14	16	87.5
subroutine	8	8	100.0
pod	1	1	100.0
total	124	130	95.3

line	stmt	bran	cond	sub	pod	time	code
1							package Regexp::Lexer;
2	8			8		7274	use 5.010001;
	8					23
	8					259
3	8			8		122	use strict;
	8					10
	8					196
4	8			8		35	use warnings;
	8					10
	8					177
5	8			8		29	use B;
	8					8
	8					364
6	8			8		34	use Carp qw/croak/;
	8					10
	8					352
7	8			8		3236	use Regexp::Lexer::TokenType;
	8					14
	8					235
8	8			8		3254	use parent qw(Exporter);
	8					2274
	8					34
9
10							our @EXPORT_OK = qw(tokenize);
11
12							our $VERSION = "0.05";
13
14							my %escapedSpecialChar = (
15							t => Regexp::Lexer::TokenType::EscapedTab,
16							n => Regexp::Lexer::TokenType::EscapedNewline,
17							r => Regexp::Lexer::TokenType::EscapedReturn,
18							f => Regexp::Lexer::TokenType::EscapedFormFeed,
19							F => Regexp::Lexer::TokenType::EscapedFoldcase,
20							a => Regexp::Lexer::TokenType::EscapedAlarm,
21							e => Regexp::Lexer::TokenType::EscapedEscape,
22							c => Regexp::Lexer::TokenType::EscapedControlChar,
23							x => Regexp::Lexer::TokenType::EscapedCharHex,
24							o => Regexp::Lexer::TokenType::EscapedCharOct,
25							0 => Regexp::Lexer::TokenType::EscapedCharOct,
26							l => Regexp::Lexer::TokenType::EscapedLowerNext,
27							u => Regexp::Lexer::TokenType::EscapedUpperNext,
28							L => Regexp::Lexer::TokenType::EscapedLowerUntil,
29							U => Regexp::Lexer::TokenType::EscapedUpperUntil,
30							Q => Regexp::Lexer::TokenType::EscapedQuoteMetaUntil,
31							E => Regexp::Lexer::TokenType::EscapedEnd,
32							w => Regexp::Lexer::TokenType::EscapedWordChar,
33							W => Regexp::Lexer::TokenType::EscapedNotWordChar,
34							s => Regexp::Lexer::TokenType::EscapedWhiteSpaceChar,
35							S => Regexp::Lexer::TokenType::EscapedNotWhiteSpaceChar,
36							d => Regexp::Lexer::TokenType::EscapedDigitChar,
37							D => Regexp::Lexer::TokenType::EscapedNotDigitChar,
38							p => Regexp::Lexer::TokenType::EscapedProp,
39							P => Regexp::Lexer::TokenType::EscapedNotProp,
40							X => Regexp::Lexer::TokenType::EscapedUnicodeExtendedChar,
41							C => Regexp::Lexer::TokenType::EscapedCChar,
42							1 => Regexp::Lexer::TokenType::EscapedBackRef,
43							2 => Regexp::Lexer::TokenType::EscapedBackRef,
44							3 => Regexp::Lexer::TokenType::EscapedBackRef,
45							4 => Regexp::Lexer::TokenType::EscapedBackRef,
46							5 => Regexp::Lexer::TokenType::EscapedBackRef,
47							6 => Regexp::Lexer::TokenType::EscapedBackRef,
48							7 => Regexp::Lexer::TokenType::EscapedBackRef,
49							8 => Regexp::Lexer::TokenType::EscapedBackRef,
50							9 => Regexp::Lexer::TokenType::EscapedBackRef,
51							g => Regexp::Lexer::TokenType::EscapedBackRef,
52							k => Regexp::Lexer::TokenType::EscapedBackRef,
53							K => Regexp::Lexer::TokenType::EscapedKeepStuff,
54							v => Regexp::Lexer::TokenType::EscapedVerticalWhitespace,
55							V => Regexp::Lexer::TokenType::EscapedNotVerticalWhitespace,
56							h => Regexp::Lexer::TokenType::EscapedHorizontalWhitespace,
57							H => Regexp::Lexer::TokenType::EscapedNotHorizontalWhitespace,
58							R => Regexp::Lexer::TokenType::EscapedLinebreak,
59							b => Regexp::Lexer::TokenType::EscapedWordBoundary,
60							B => Regexp::Lexer::TokenType::EscapedNotWordBoundary,
61							A => Regexp::Lexer::TokenType::EscapedBeginningOfString,
62							Z => Regexp::Lexer::TokenType::EscapedEndOfStringBeforeNewline,
63							z => Regexp::Lexer::TokenType::EscapedEndOfString,
64							G => Regexp::Lexer::TokenType::EscapedPos,
65							);
66
67							my %specialChar = (
68							'.' => Regexp::Lexer::TokenType::MatchAny,
69							'\|' => Regexp::Lexer::TokenType::Alternation,
70							'(' => Regexp::Lexer::TokenType::LeftParenthesis,
71							')' => Regexp::Lexer::TokenType::RightParenthesis,
72							'[' => Regexp::Lexer::TokenType::LeftBracket,
73							']' => Regexp::Lexer::TokenType::RightBracket,
74							'{' => Regexp::Lexer::TokenType::LeftBrace,
75							'}' => Regexp::Lexer::TokenType::RightBrace,
76							'<' => Regexp::Lexer::TokenType::LeftAngle,
77							'>' => Regexp::Lexer::TokenType::RightAngle,
78							'*' => Regexp::Lexer::TokenType::Asterisk,
79							'+' => Regexp::Lexer::TokenType::Plus,
80							'?' => Regexp::Lexer::TokenType::Question,
81							',' => Regexp::Lexer::TokenType::Comma,
82							'-' => Regexp::Lexer::TokenType::Minus,
83							'$' => Regexp::Lexer::TokenType::ScalarSigil,
84							'@' => Regexp::Lexer::TokenType::ArraySigil,
85							':' => Regexp::Lexer::TokenType::Colon,
86							'#' => Regexp::Lexer::TokenType::Sharp,
87							'^' => Regexp::Lexer::TokenType::Cap,
88							'=' => Regexp::Lexer::TokenType::Equal,
89							'!' => Regexp::Lexer::TokenType::Exclamation,
90							q<'> => Regexp::Lexer::TokenType::SingleQuote,
91							q<"> => Regexp::Lexer::TokenType::DoubleQuote,
92							);
93
94							sub tokenize {
95	8			8	1	23387	my ($re) = @_;
96
97	8	100				36	if (ref $re ne 'Regexp') {
98	1					196	croak "Not regexp quoted argument is given";
99							}
100
101							# B::cstring() is used to escape backslashes
102	7					108	my $re_cluster_string = B::cstring($re);
103
104							# to remove double-quotes and parenthesis on leading and trailing
105	7					26	my $re_str = substr(substr($re_cluster_string, 2), 0, -2);
106
107	7					21	$re_str =~ s/\\"/"/g; # for double quote which is converted by B::cstring
108
109							# extract modifiers
110	7					46	$re_str =~ s/\A[?]([^:]*)://;
111	7					13	my @modifiers;
112	7					41	for my $modifier (split //, $1) {
113	11					25	push @modifiers, $modifier;
114							}
115
116	7					40	my @chars = split //, $re_str;
117
118	7					11	my @tokens;
119	7					12	my $index = 0;
120
121	7					10	my $end_of_line_exists = 0;
122	7	100	100			56	if (defined $chars[-1] && $chars[-1] eq '$') {
123	1					1	pop @chars;
124	1					2	$end_of_line_exists = 1;
125							}
126
127	7	100	100			41	if (defined $chars[0] && $chars[0] eq '^') {
128	1					4	push @tokens, {
129							char => shift @chars,
130							index => ++$index,
131							type => Regexp::Lexer::TokenType::BeginningOfLine,
132							};
133							}
134
135	7					9	my $backslashes = 0;
136	7					25	my $next_c;
137	7					29	for (my $i = 0; defined(my $c = $chars[$i]); $i++) {
138	111	100				146	if ($c eq '\\') {
139	59	100				96	if ($backslashes <= 1) {
140	47					38	$backslashes++;
141	47					72	next;
142							}
143
144							# now status -> '\\\\\\'
145	12	50				24	if ($backslashes == 2) {
146	12					19	$next_c = $chars[++$i];
147	12	50	33			46	if (!defined $next_c \|\| $next_c ne '\\') {
148	0					0	croak "Invalid syntax regexp is given"; # fail safe
149							}
150
151	12					42	push @tokens, {
152							char => '\\\\',
153							index => ++$index,
154							type => Regexp::Lexer::TokenType::EscapedCharacter,
155							};
156
157	12					15	$backslashes = 0;
158	12					23	next;
159							}
160							}
161
162							# To support NOT META newline character which is in regexp
163	52	100				107	if ($backslashes == 1) {
164	3					4	my $type = Regexp::Lexer::TokenType::Unknown;
165	3	100				12	if ($c eq 'n') {
		50
166	2					5	$type = Regexp::Lexer::TokenType::Newline;
167							}
168							elsif ($c eq 'r') { # XXX maybe unreachable
169	1					1	$type = Regexp::Lexer::TokenType::Return;
170							}
171
172	3					11	push @tokens, {
173							char => '\\' . $c,
174							index => ++$index,
175							type => $type,
176							};
177
178	3					6	$backslashes = 0;
179	3					10	next;
180							}
181
182	49	100				65	if ($backslashes == 2) {
183	10					15	my $type = $escapedSpecialChar{$c};
184
185							# Determine meaning of \N
186	10	100				28	if ($c eq 'N') {
187	4					4	$type = Regexp::Lexer::TokenType::EscapedCharUnicode;
188
189	4					5	$next_c = $chars[$i+1];
190	4	100	100			14	if (!defined $next_c \|\| $next_c ne '{') {
191	2					2	$type = Regexp::Lexer::TokenType::EscapedNotNewline;
192							}
193							}
194
195	10		100			46	push @tokens, {
196							char => '\\' . $c,
197							index => ++$index,
198							type => $type \|\| Regexp::Lexer::TokenType::EscapedCharacter,
199							};
200
201	10					10	$backslashes = 0;
202	10					19	next;
203							}
204
205	39		100			144	push @tokens, {
206							char => $c,
207							index => ++$index,
208							type => $specialChar{$c} \|\| Regexp::Lexer::TokenType::Character,
209							};
210
211	39					72	$backslashes = 0; # for fail safe
212							}
213
214	7	100				33	if ($end_of_line_exists) {
215	1					3	push @tokens, {
216							char => '$',
217							index => ++$index,
218							type => Regexp::Lexer::TokenType::EndOfLine,
219							};
220							}
221
222							return {
223	7					104	tokens => \@tokens,
224							modifiers => \@modifiers,
225							};
226							}
227
228							1;
229							__END__