File Coverage

blib/lib/PPI/Token/Whitespace.pm

Criterion	Covered	Total	%
statement	94	96	97.9
branch	79	86	91.8
condition	77	90	85.5
subroutine	7	8	87.5
pod	2	2	100.0
total	259	282	91.8

line	stmt	bran	cond	sub	pod	time	code
1							package PPI::Token::Whitespace;
2
3							=pod
4
5							=head1 NAME
6
7							PPI::Token::Whitespace - Tokens representing ordinary white space
8
9							=head1 INHERITANCE
10
11							PPI::Token::Whitespace
12							isa PPI::Token
13							isa PPI::Element
14
15							=head1 DESCRIPTION
16
17							As a full "round-trip" parser, PPI records every last byte in a
18							file and ensure that it is included in the L object.
19
20							This even includes whitespace. In fact, Perl documents are seen
21							as "floating in a sea of whitespace", and thus any document will
22							contain vast quantities of C objects.
23
24							For the most part, you shouldn't notice them. Or at least, you
25							shouldn't B to notice them.
26
27							This means doing things like consistently using the "S for significant"
28							series of L and L methods to do things.
29
30							If you want the nth child element, you should be using C rather
31							than C, and likewise C, C, and
32							so on and so forth.
33
34							=head1 METHODS
35
36							Again, for the most part you should really B need to do anything
37							very significant with whitespace.
38
39							But there are a couple of convenience methods provided, beyond those
40							provided by the parent L and L classes.
41
42							=cut
43
44	65			65		66907	use strict;
	65					114
	65					1470
45	65			65		582	use Clone ();
	65					2162
	65					746
46	65			65		604	use PPI::Token ();
	65					132
	65					91934
47
48							our $VERSION = '1.276';
49
50							our @ISA = "PPI::Token";
51
52							=pod
53
54							=head2 null
55
56							Because L sees documents as sitting on a sort of substrate made of
57							whitespace, there are a couple of corner cases that get particularly
58							nasty if they don't find whitespace in certain places.
59
60							Imagine walking down the beach to go into the ocean, and then quite
61							unexpectedly falling off the side of the planet. Well it's somewhat
62							equivalent to that, including the whole screaming death bit.
63
64							The C method is a convenience provided to get some internals
65							out of some of these corner cases.
66
67							Specifically it create a whitespace token that represents nothing,
68							or at least the null string C<''>. It's a handy way to have some
69							"whitespace" right where you need it, without having to have any
70							actual characters.
71
72							=cut
73
74							my $null;
75
76							sub null {
77	0		0	0	1	0	$null \|\|= $_[0]->new('');
78	0					0	Clone::clone($null);
79							}
80
81							### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82							sub significant() { '' }
83
84							=pod
85
86							=head2 tidy
87
88							C is a convenience method for removing unneeded whitespace.
89
90							Specifically, it removes any whitespace from the end of a line.
91
92							Note that this B include POD, where you may well need
93							to keep certain types of whitespace. The entire POD chunk lives
94							in its own L object.
95
96							=cut
97
98							sub tidy {
99	2			2	1	11	$_[0]->{content} =~ s/^\s+?(?>\n)//;
100	2					6	1;
101							}
102
103
104
105
106
107							#####################################################################
108							# Parsing Methods
109
110							# Build the class and commit maps
111							my %COMMITMAP = (
112							map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113							map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114							ord '#' => 'PPI::Token::Comment',
115							ord 'v' => 'PPI::Token::Number::Version',
116							);
117							my %CLASSMAP = (
118							map( { ord $_ => 'Number' } 0 .. 9 ),
119							map( { ord $_ => 'Operator' } qw" = ? \| + > . ! ~ ^ " ),
120							map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121							ord ',' => 'PPI::Token::Operator',
122							ord "'" => 'Quote::Single',
123							ord '"' => 'Quote::Double',
124							ord '`' => 'QuoteLike::Backtick',
125							ord '\\' => 'Cast',
126							ord '_' => 'Word',
127							9 => 'Whitespace', # A horizontal tab
128							10 => 'Whitespace', # A newline
129							12 => 'Whitespace', # A form feed
130							13 => 'Whitespace', # A carriage return
131							32 => 'Whitespace', # A normal space
132							);
133
134							# Words (functions and keywords) after which a following / is
135							# almost certainly going to be a regex
136							my %MATCHWORD = map { $_ => 1 } qw{
137							return
138							split
139							if
140							unless
141							grep
142							map
143							};
144
145							sub __TOKENIZER__on_line_start {
146	51153			51153		62420	my $t = $_[1];
147	51153					71359	my $line = $t->{line};
148
149							# Can we classify the entire line in one go
150	51153	100				218698	if ( $line =~ /^\s*$/ ) {
		100
		100
		100
151							# A whitespace line
152	6697					15692	$t->_new_token( 'Whitespace', $line );
153	6697					10669	return 0;
154
155							} elsif ( $line =~ /^\s*#/ ) {
156							# A comment line
157	4323					10787	$t->_new_token( 'Comment', $line );
158	4323					8874	$t->_finalize_token;
159	4323					7334	return 0;
160
161							} elsif ( $line =~ /^=(\w+)/ ) {
162							# A Pod tag... change to pod mode
163	752					2135	$t->_new_token( 'Pod', $line );
164	752	50				2045	if ( $1 eq 'cut' ) {
165							# This is an error, but one we'll ignore
166							# Don't go into Pod mode, since =cut normally
167							# signals the end of Pod mode
168							} else {
169	752					1259	$t->{class} = 'PPI::Token::Pod';
170							}
171	752					1260	return 0;
172
173							} elsif ( $line =~ /^use v6\-alpha\;/ ) {
174							# Indicates a Perl 6 block. Make the initial
175							# implementation just suck in the entire rest of the
176							# file.
177	2					3	my @perl6;
178	2					3	while ( 1 ) {
179	693					923	my $line6 = $t->_get_line;
180	693	100				886	last unless defined $line6;
181	691					802	push @perl6, $line6;
182							}
183	2					2	push @{ $t->{perl6} }, join '', @perl6;
	2					102
184
185							# We only sucked in the block, we don't actually do
186							# anything to the "use v6..." line. So return as if
187							# we didn't find anything at all.
188	2					33	return 1;
189							}
190
191	39379					68938	1;
192							}
193
194							sub __TOKENIZER__on_char {
195	379931			379931		430746	my $t = $_[1];
196	379931					544108	my $c = substr $t->{line}, $t->{line_cursor}, 1;
197	379931					442927	my $char = ord $c;
198
199							# Do we definitely know what something is?
200	379931	100				808910	return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201
202							# Handle the simple option first
203	261770	100				735146	return $CLASSMAP{$char} if $CLASSMAP{$char};
204
205	19488	100				47669	if ( $char == 40 ) { # $char eq '('
		100
		100
		100
		100
		50
206							# Finalise any whitespace token...
207	8502	100				21423	$t->_finalize_token if $t->{token};
208
209							# Is this the beginning of a sub prototype?
210							# We are a sub prototype IF
211							# 1. The previous significant token is a bareword.
212							# 2. The one before that is the word 'sub'.
213							# 3. The one before that is a 'structure'
214
215							# Get the three previous significant tokens
216	8502					15693	my @tokens = $t->_previous_significant_tokens(3);
217
218							# A normal subroutine declaration
219	8502					11534	my $p1 = $tokens[1];
220	8502					10371	my $p2 = $tokens[2];
221	8502	50	100			67455	if (
			100
			100
			100
			33
			100
222							$tokens[0]
223							and
224							$tokens[0]->isa('PPI::Token::Word')
225							and
226							$p1
227							and
228							$p1->isa('PPI::Token::Word')
229							and
230							$p1->content eq 'sub'
231							and (
232							not $p2
233							or
234							$p2->isa('PPI::Token::Structure')
235							or (
236							$p2->isa('PPI::Token::Whitespace')
237							and
238							$p2->content eq ''
239							)
240							or (
241							# Lexical subroutine
242							$p2->isa('PPI::Token::Word')
243							and
244							$p2->content =~ /^(?:my\|our\|state)$/
245							)
246							)
247							) {
248							# This is a sub prototype
249	377					1318	return 'Prototype';
250							}
251
252							# A prototyped anonymous subroutine
253	8125					11558	my $p0 = $tokens[0];
254	8125	100	100			34536	if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
			100
			66
			66
255							# Maybe it's invoking a method named 'sub'
256							and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
257							) {
258	62					241	return 'Prototype';
259							}
260
261							# This is a normal open bracket
262	8063					28143	return 'Structure';
263
264							} elsif ( $char == 60 ) { # $char eq '<'
265							# Finalise any whitespace token...
266	2048	100				5615	$t->_finalize_token if $t->{token};
267
268							# This is either "less than" or "readline quote-like"
269							# Do some context stuff to guess which.
270	2048					4321	my $prev = $t->_last_significant_token;
271
272							# The most common group of less-thans are used like
273							# $foo < $bar
274							# 1 < $bar
275							# $#foo < $bar
276	2048	100	100			10700	return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
277	1716	50	66			6765	return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
278	1716	100	100			6481	return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
279	1693	50	66			6353	return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
280
281							# If it is <<... it's a here-doc instead
282	1693					3350	my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
283	1693	100				6095	return 'Operator' if $next_char =~ /<[^>]/;
284
285	899	100				2401	return 'Operator' if not $prev;
286
287							# The most common group of readlines are used like
288							# while ( <...> )
289							# while <>;
290	812					1857	my $prec = $prev->content;
291	812	100	100			10876	return 'QuoteLike::Readline'
			100
			100
			100
			100
			100
			100
			100
			100
292							if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
293							or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
294							or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
295							or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
296							or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
297
298	709	100	100			2221	if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
299							# Could go either way... do a regex check
300							# $foo->{bar} < 2;
301							# grep { .. } ;
302	17					60	pos $t->{line} = $t->{line_cursor};
303	17	100				84	if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
304							# Almost definitely readline
305	3					11	return 'QuoteLike::Readline';
306							}
307							}
308
309							# Otherwise, we guess operator, which has been the default up
310							# until this more comprehensive section was created.
311	706					2039	return 'Operator';
312
313							} elsif ( $char == 47 ) { # $char eq '/'
314							# Finalise any whitespace token...
315	1071	100				3637	$t->_finalize_token if $t->{token};
316
317							# This is either a "divided by" or a "start regex"
318							# Do some context stuff to guess ( ack ) which.
319							# Hopefully the guess will be good enough.
320	1071					2451	my $prev = $t->_last_significant_token;
321
322							# Or as the very first thing in a file
323	1071	100				2900	return 'Regexp::Match' if not $prev;
324
325	972					2079	my $prec = $prev->content;
326
327							# Most times following an operator, we are a regex.
328							# This includes cases such as:
329							# , - As an argument in a list
330							# .. - The second condition in a flip flop
331							# =~ - A bound regex
332							# !~ - Ditto
333	972	100				4099	return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
334
335							# After a symbol
336	516	100				1588	return 'Operator' if $prev->isa('PPI::Token::Symbol');
337	447	100	66			1036	if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
338	6					17	return 'Operator';
339							}
340
341							# After another number
342	441	100				1289	return 'Operator' if $prev->isa('PPI::Token::Number');
343
344							# After going into scope/brackets
345	408	100	100			1638	if (
			100
346							$prev->isa('PPI::Token::Structure')
347							and (
348							$prec eq '('
349							or
350							$prec eq '{'
351							or
352							$prec eq ';'
353							)
354							) {
355	55					183	return 'Regexp::Match';
356							}
357
358							# Functions and keywords
359	353	100	66			996	if (
360							$MATCHWORD{$prec}
361							and
362							$prev->isa('PPI::Token::Word')
363							) {
364	69					235	return 'Regexp::Match';
365							}
366
367							# What about the char after the slash? There's some things
368							# that would be highly illogical to see if it's an operator.
369	284					586	my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
370	284	50	33			935	if ( defined $next_char and length $next_char ) {
371	284	100				938	if ( $next_char =~ /(?:\^\|\[\|\\)/ ) {
372	15					60	return 'Regexp::Match';
373							}
374							}
375
376							# Otherwise... erm... assume operator?
377							# Add more tests here as potential cases come to light
378	269					811	return 'Operator';
379
380							} elsif ( $char == 120 ) { # $char eq 'x'
381							# Could be a word, the x= operator, the x operator
382							# followed by whitespace, or the x operator without any
383							# space between itself and its operand, e.g.: '$a x3',
384							# which is the same as '$a x 3'. _current_x_is_operator
385							# assumes we have a complete 'x' token, but we don't
386							# yet. We may need to split this x character apart from
387							# what follows it.
388	859	100				1921	if ( $t->_current_x_is_operator ) {
389	209					595	pos $t->{line} = $t->{line_cursor} + 1;
390	209	100				1161	return 'Operator' if $t->{line} =~ m/\G(?:
391							\d # x op with no whitespace e.g. 'x3'
392							\|
393							(?!( # negative lookahead
394							=> # not on left of fat comma
395							\|
396							\w # not a word like "xyzzy"
397							\|
398							\s # not x op plus whitespace
399							))
400							)/gcx;
401							}
402
403							# Otherwise, commit like a normal bareword, including x
404							# operator followed by whitespace.
405	739					2112	return PPI::Token::Word->__TOKENIZER__commit($t);
406
407							} elsif ( $char == 45 ) { # $char eq '-'
408							# Look for an obvious operator operand context
409	7002					14302	my $context = $t->_opcontext;
410	7002	100				13286	if ( $context eq 'operator' ) {
411	5034					13945	return 'Operator';
412							} else {
413							# More logic needed
414	1968					5640	return 'Unknown';
415							}
416
417							} elsif ( $char >= 128 ) { # Outside ASCII
418	6	100				37	return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
419	1	50				18	return 'Whitespace' if $c =~ /\s/;
420							}
421
422
423							# All the whitespaces are covered, so what to do
424							### For now, die
425	1					8	PPI::Exception->throw("Encountered unexpected character '$char'");
426							}
427
428							sub __TOKENIZER__on_line_end {
429	39012	100		39012		133643	$_[1]->_finalize_token if $_[1]->{token};
430							}
431
432							1;
433
434							=pod
435
436							=head1 SUPPORT
437
438							See the L in the main module.
439
440							=head1 AUTHOR
441
442							Adam Kennedy Eadamk@cpan.orgE
443
444							=head1 COPYRIGHT
445
446							Copyright 2001 - 2011 Adam Kennedy.
447
448							This program is free software; you can redistribute
449							it and/or modify it under the same terms as Perl itself.
450
451							The full text of the license can be found in the
452							LICENSE file included with this module.
453
454							=cut