File Coverage

blib/lib/PPI/Token/Whitespace.pm

Criterion	Covered	Total	%
statement	94	96	97.9
branch	79	86	91.8
condition	77	90	85.5
subroutine	7	8	87.5
pod	2	2	100.0
total	259	282	91.8

line	stmt	bran	cond	sub	pod	time	code
1							package PPI::Token::Whitespace;
2
3							=pod
4
5							=head1 NAME
6
7							PPI::Token::Whitespace - Tokens representing ordinary white space
8
9							=head1 INHERITANCE
10
11							PPI::Token::Whitespace
12							isa PPI::Token
13							isa PPI::Element
14
15							=head1 DESCRIPTION
16
17							As a full "round-trip" parser, PPI records every last byte in a
18							file and ensure that it is included in the L object.
19
20							This even includes whitespace. In fact, Perl documents are seen
21							as "floating in a sea of whitespace", and thus any document will
22							contain vast quantities of C objects.
23
24							For the most part, you shouldn't notice them. Or at least, you
25							shouldn't B to notice them.
26
27							This means doing things like consistently using the "S for significant"
28							series of L and L methods to do things.
29
30							If you want the nth child element, you should be using C rather
31							than C, and likewise C, C, and
32							so on and so forth.
33
34							=head1 METHODS
35
36							Again, for the most part you should really B need to do anything
37							very significant with whitespace.
38
39							But there are a couple of convenience methods provided, beyond those
40							provided by the parent L and L classes.
41
42							=cut
43
44	65			65		72403	use strict;
	65					140
	65					1937
45	65			65		763	use Clone ();
	65					2648
	65					790
46	65			65		751	use PPI::Token ();
	65					177
	65					118238
47
48							our $VERSION = '1.277';
49
50							our @ISA = "PPI::Token";
51
52							=pod
53
54							=head2 null
55
56							Because L sees documents as sitting on a sort of substrate made of
57							whitespace, there are a couple of corner cases that get particularly
58							nasty if they don't find whitespace in certain places.
59
60							Imagine walking down the beach to go into the ocean, and then quite
61							unexpectedly falling off the side of the planet. Well it's somewhat
62							equivalent to that, including the whole screaming death bit.
63
64							The C method is a convenience provided to get some internals
65							out of some of these corner cases.
66
67							Specifically it create a whitespace token that represents nothing,
68							or at least the null string C<''>. It's a handy way to have some
69							"whitespace" right where you need it, without having to have any
70							actual characters.
71
72							=cut
73
74							my $null;
75
76							sub null {
77	0		0	0	1	0	$null \|\|= $_[0]->new('');
78	0					0	Clone::clone($null);
79							}
80
81							### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82							sub significant() { '' }
83
84							=pod
85
86							=head2 tidy
87
88							C is a convenience method for removing unneeded whitespace.
89
90							Specifically, it removes any whitespace from the end of a line.
91
92							Note that this B include POD, where you may well need
93							to keep certain types of whitespace. The entire POD chunk lives
94							in its own L object.
95
96							=cut
97
98							sub tidy {
99	2			2	1	19	$_[0]->{content} =~ s/^\s+?(?>\n)//;
100	2					9	1;
101							}
102
103
104
105
106
107							#####################################################################
108							# Parsing Methods
109
110							# Build the class and commit maps
111							my %COMMITMAP = (
112							map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113							map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114							ord '#' => 'PPI::Token::Comment',
115							ord 'v' => 'PPI::Token::Number::Version',
116							);
117							my %CLASSMAP = (
118							map( { ord $_ => 'Number' } 0 .. 9 ),
119							map( { ord $_ => 'Operator' } qw" = ? \| + > . ! ~ ^ " ),
120							map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121							ord ',' => 'PPI::Token::Operator',
122							ord "'" => 'Quote::Single',
123							ord '"' => 'Quote::Double',
124							ord '`' => 'QuoteLike::Backtick',
125							ord '\\' => 'Cast',
126							ord '_' => 'Word',
127							9 => 'Whitespace', # A horizontal tab
128							10 => 'Whitespace', # A newline
129							12 => 'Whitespace', # A form feed
130							13 => 'Whitespace', # A carriage return
131							32 => 'Whitespace', # A normal space
132							);
133
134							# Words (functions and keywords) after which a following / is
135							# almost certainly going to be a regex
136							my %MATCHWORD = map { $_ => 1 } qw{
137							return
138							split
139							if
140							unless
141							grep
142							map
143							};
144
145							sub __TOKENIZER__on_line_start {
146	51324			51324		77147	my $t = $_[1];
147	51324					89123	my $line = $t->{line};
148
149							# Can we classify the entire line in one go
150	51324	100				263169	if ( $line =~ /^\s*$/ ) {
		100
		100
		100
151							# A whitespace line
152	6679					19591	$t->_new_token( 'Whitespace', $line );
153	6679					13718	return 0;
154
155							} elsif ( $line =~ /^\s*#/ ) {
156							# A comment line
157	4323					12764	$t->_new_token( 'Comment', $line );
158	4323					10693	$t->_finalize_token;
159	4323					8792	return 0;
160
161							} elsif ( $line =~ /^=(\w+)/ ) {
162							# A Pod tag... change to pod mode
163	747					2664	$t->_new_token( 'Pod', $line );
164	747	50				2747	if ( $1 eq 'cut' ) {
165							# This is an error, but one we'll ignore
166							# Don't go into Pod mode, since =cut normally
167							# signals the end of Pod mode
168							} else {
169	747					1625	$t->{class} = 'PPI::Token::Pod';
170							}
171	747					1636	return 0;
172
173							} elsif ( $line =~ /^use v6\-alpha\;/ ) {
174							# Indicates a Perl 6 block. Make the initial
175							# implementation just suck in the entire rest of the
176							# file.
177	2					5	my @perl6;
178	2					2	while ( 1 ) {
179	693					1110	my $line6 = $t->_get_line;
180	693	100				1067	last unless defined $line6;
181	691					980	push @perl6, $line6;
182							}
183	2					5	push @{ $t->{perl6} }, join '', @perl6;
	2					121
184
185							# We only sucked in the block, we don't actually do
186							# anything to the "use v6..." line. So return as if
187							# we didn't find anything at all.
188	2					42	return 1;
189							}
190
191	39573					89516	1;
192							}
193
194							sub __TOKENIZER__on_char {
195	380726			380726		525132	my $t = $_[1];
196	380726					672729	my $c = substr $t->{line}, $t->{line_cursor}, 1;
197	380726					539206	my $char = ord $c;
198
199							# Do we definitely know what something is?
200	380726	100				1004421	return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201
202							# Handle the simple option first
203	262862	100				900046	return $CLASSMAP{$char} if $CLASSMAP{$char};
204
205	19098	100				61072	if ( $char == 40 ) { # $char eq '('
		100
		100
		100
		100
		50
206							# Finalise any whitespace token...
207	8194	100				27151	$t->_finalize_token if $t->{token};
208
209							# Is this the beginning of a sub prototype?
210							# We are a sub prototype IF
211							# 1. The previous significant token is a bareword.
212							# 2. The one before that is the word 'sub'.
213							# 3. The one before that is a 'structure'
214
215							# Get the three previous significant tokens
216	8194					18705	my @tokens = $t->_previous_significant_tokens(3);
217
218							# A normal subroutine declaration
219	8194					13917	my $p1 = $tokens[1];
220	8194					12921	my $p2 = $tokens[2];
221	8194	50	100			79049	if (
			100
			100
			100
			33
			100
222							$tokens[0]
223							and
224							$tokens[0]->isa('PPI::Token::Word')
225							and
226							$p1
227							and
228							$p1->isa('PPI::Token::Word')
229							and
230							$p1->content eq 'sub'
231							and (
232							not $p2
233							or
234							$p2->isa('PPI::Token::Structure')
235							or (
236							$p2->isa('PPI::Token::Whitespace')
237							and
238							$p2->content eq ''
239							)
240							or (
241							# Lexical subroutine
242							$p2->isa('PPI::Token::Word')
243							and
244							$p2->content =~ /^(?:my\|our\|state)$/
245							)
246							)
247							) {
248							# This is a sub prototype
249	317					1428	return 'Prototype';
250							}
251
252							# A prototyped anonymous subroutine
253	7877					13069	my $p0 = $tokens[0];
254	7877	100	100			41990	if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
			100
			66
			66
255							# Maybe it's invoking a method named 'sub'
256							and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
257							) {
258	42					214	return 'Prototype';
259							}
260
261							# This is a normal open bracket
262	7835					32545	return 'Structure';
263
264							} elsif ( $char == 60 ) { # $char eq '<'
265							# Finalise any whitespace token...
266	2064	100				6795	$t->_finalize_token if $t->{token};
267
268							# This is either "less than" or "readline quote-like"
269							# Do some context stuff to guess which.
270	2064					5654	my $prev = $t->_last_significant_token;
271
272							# The most common group of less-thans are used like
273							# $foo < $bar
274							# 1 < $bar
275							# $#foo < $bar
276	2064	100	100			13837	return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
277	1734	50	66			8266	return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
278	1734	100	100			7979	return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
279	1705	50	66			7766	return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
280
281							# If it is <<... it's a here-doc instead
282	1705					4426	my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
283	1705	100				7703	return 'Operator' if $next_char =~ /<[^>]/;
284
285	908	100				2411	return 'Operator' if not $prev;
286
287							# The most common group of readlines are used like
288							# while ( <...> )
289							# while <>;
290	829					2169	my $prec = $prev->content;
291	829	100	100			14276	return 'QuoteLike::Readline'
			100
			100
			100
			100
			100
			100
			100
			100
292							if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
293							or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
294							or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
295							or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
296							or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
297
298	712	100	100			2837	if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
299							# Could go either way... do a regex check
300							# $foo->{bar} < 2;
301							# grep { .. } ;
302	12					45	pos $t->{line} = $t->{line_cursor};
303	12	100				69	if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
304							# Almost definitely readline
305	3					14	return 'QuoteLike::Readline';
306							}
307							}
308
309							# Otherwise, we guess operator, which has been the default up
310							# until this more comprehensive section was created.
311	709					2903	return 'Operator';
312
313							} elsif ( $char == 47 ) { # $char eq '/'
314							# Finalise any whitespace token...
315	1027	100				4577	$t->_finalize_token if $t->{token};
316
317							# This is either a "divided by" or a "start regex"
318							# Do some context stuff to guess ( ack ) which.
319							# Hopefully the guess will be good enough.
320	1027					3287	my $prev = $t->_last_significant_token;
321
322							# Or as the very first thing in a file
323	1027	100				4030	return 'Regexp::Match' if not $prev;
324
325	938					2495	my $prec = $prev->content;
326
327							# Most times following an operator, we are a regex.
328							# This includes cases such as:
329							# , - As an argument in a list
330							# .. - The second condition in a flip flop
331							# =~ - A bound regex
332							# !~ - Ditto
333	938	100				4875	return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
334
335							# After a symbol
336	492	100				2003	return 'Operator' if $prev->isa('PPI::Token::Symbol');
337	429	100	66			1475	if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
338	11					50	return 'Operator';
339							}
340
341							# After another number
342	418	100				1850	return 'Operator' if $prev->isa('PPI::Token::Number');
343
344							# After going into scope/brackets
345	373	100	100			2190	if (
			100
346							$prev->isa('PPI::Token::Structure')
347							and (
348							$prec eq '('
349							or
350							$prec eq '{'
351							or
352							$prec eq ';'
353							)
354							) {
355	53					201	return 'Regexp::Match';
356							}
357
358							# Functions and keywords
359	320	100	66			1394	if (
360							$MATCHWORD{$prec}
361							and
362							$prev->isa('PPI::Token::Word')
363							) {
364	69					289	return 'Regexp::Match';
365							}
366
367							# What about the char after the slash? There's some things
368							# that would be highly illogical to see if it's an operator.
369	251					759	my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
370	251	50	33			1035	if ( defined $next_char and length $next_char ) {
371	251	100				1071	if ( $next_char =~ /(?:\^\|\[\|\\)/ ) {
372	17					105	return 'Regexp::Match';
373							}
374							}
375
376							# Otherwise... erm... assume operator?
377							# Add more tests here as potential cases come to light
378	234					1025	return 'Operator';
379
380							} elsif ( $char == 120 ) { # $char eq 'x'
381							# Could be a word, the x= operator, the x operator
382							# followed by whitespace, or the x operator without any
383							# space between itself and its operand, e.g.: '$a x3',
384							# which is the same as '$a x 3'. _current_x_is_operator
385							# assumes we have a complete 'x' token, but we don't
386							# yet. We may need to split this x character apart from
387							# what follows it.
388	815	100				2299	if ( $t->_current_x_is_operator ) {
389	207					819	pos $t->{line} = $t->{line_cursor} + 1;
390	207	100				1560	return 'Operator' if $t->{line} =~ m/\G(?:
391							\d # x op with no whitespace e.g. 'x3'
392							\|
393							(?!( # negative lookahead
394							=> # not on left of fat comma
395							\|
396							\w # not a word like "xyzzy"
397							\|
398							\s # not x op plus whitespace
399							))
400							)/gcx;
401							}
402
403							# Otherwise, commit like a normal bareword, including x
404							# operator followed by whitespace.
405	696					2460	return PPI::Token::Word->__TOKENIZER__commit($t);
406
407							} elsif ( $char == 45 ) { # $char eq '-'
408							# Look for an obvious operator operand context
409	6992					17005	my $context = $t->_opcontext;
410	6992	100				15249	if ( $context eq 'operator' ) {
411	5121					16300	return 'Operator';
412							} else {
413							# More logic needed
414	1871					6719	return 'Unknown';
415							}
416
417							} elsif ( $char >= 128 ) { # Outside ASCII
418	6	100				52	return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
419	1	50				5	return 'Whitespace' if $c =~ /\s/;
420							}
421
422
423							# All the whitespaces are covered, so what to do
424							### For now, die
425	1					11	PPI::Exception->throw("Encountered unexpected character '$char'");
426							}
427
428							sub __TOKENIZER__on_line_end {
429	39201	100		39201		171012	$_[1]->_finalize_token if $_[1]->{token};
430							}
431
432							1;
433
434							=pod
435
436							=head1 SUPPORT
437
438							See the L in the main module.
439
440							=head1 AUTHOR
441
442							Adam Kennedy Eadamk@cpan.orgE
443
444							=head1 COPYRIGHT
445
446							Copyright 2001 - 2011 Adam Kennedy.
447
448							This program is free software; you can redistribute
449							it and/or modify it under the same terms as Perl itself.
450
451							The full text of the license can be found in the
452							LICENSE file included with this module.
453
454							=cut