File Coverage

blib/lib/PPI/Token/Whitespace.pm

Criterion	Covered	Total	%
statement	94	96	97.9
branch	79	86	91.8
condition	77	90	85.5
subroutine	7	8	87.5
pod	2	2	100.0
total	259	282	91.8

line	stmt	bran	cond	sub	pod	time	code
1							package PPI::Token::Whitespace;
2
3							=pod
4
5							=head1 NAME
6
7							PPI::Token::Whitespace - Tokens representing ordinary white space
8
9							=head1 INHERITANCE
10
11							PPI::Token::Whitespace
12							isa PPI::Token
13							isa PPI::Element
14
15							=head1 DESCRIPTION
16
17							As a full "round-trip" parser, PPI records every last byte in a
18							file and ensure that it is included in the L object.
19
20							This even includes whitespace. In fact, Perl documents are seen
21							as "floating in a sea of whitespace", and thus any document will
22							contain vast quantities of C objects.
23
24							For the most part, you shouldn't notice them. Or at least, you
25							shouldn't B to notice them.
26
27							This means doing things like consistently using the "S for significant"
28							series of L and L methods to do things.
29
30							If you want the nth child element, you should be using C rather
31							than C, and likewise C, C, and
32							so on and so forth.
33
34							=head1 METHODS
35
36							Again, for the most part you should really B need to do anything
37							very significant with whitespace.
38
39							But there are a couple of convenience methods provided, beyond those
40							provided by the parent L and L classes.
41
42							=cut
43
44	64			64		56469	use strict;
	64					117
	64					1398
45	64			64		590	use Clone ();
	64					2148
	64					688
46	64			64		565	use PPI::Token ();
	64					137
	64					89715
47
48							our $VERSION = '1.275';
49
50							our @ISA = "PPI::Token";
51
52							=pod
53
54							=head2 null
55
56							Because L sees documents as sitting on a sort of substrate made of
57							whitespace, there are a couple of corner cases that get particularly
58							nasty if they don't find whitespace in certain places.
59
60							Imagine walking down the beach to go into the ocean, and then quite
61							unexpectedly falling off the side of the planet. Well it's somewhat
62							equivalent to that, including the whole screaming death bit.
63
64							The C method is a convenience provided to get some internals
65							out of some of these corner cases.
66
67							Specifically it create a whitespace token that represents nothing,
68							or at least the null string C<''>. It's a handy way to have some
69							"whitespace" right where you need it, without having to have any
70							actual characters.
71
72							=cut
73
74							my $null;
75
76							sub null {
77	0		0	0	1	0	$null \|\|= $_[0]->new('');
78	0					0	Clone::clone($null);
79							}
80
81							### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82							sub significant() { '' }
83
84							=pod
85
86							=head2 tidy
87
88							C is a convenience method for removing unneeded whitespace.
89
90							Specifically, it removes any whitespace from the end of a line.
91
92							Note that this B include POD, where you may well need
93							to keep certain types of whitespace. The entire POD chunk lives
94							in its own L object.
95
96							=cut
97
98							sub tidy {
99	2			2	1	10	$_[0]->{content} =~ s/^\s+?(?>\n)//;
100	2					8	1;
101							}
102
103
104
105
106
107							#####################################################################
108							# Parsing Methods
109
110							# Build the class and commit maps
111							my %COMMITMAP = (
112							map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113							map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114							ord '#' => 'PPI::Token::Comment',
115							ord 'v' => 'PPI::Token::Number::Version',
116							);
117							my %CLASSMAP = (
118							map( { ord $_ => 'Number' } 0 .. 9 ),
119							map( { ord $_ => 'Operator' } qw" = ? \| + > . ! ~ ^ " ),
120							map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121							ord ',' => 'PPI::Token::Operator',
122							ord "'" => 'Quote::Single',
123							ord '"' => 'Quote::Double',
124							ord '`' => 'QuoteLike::Backtick',
125							ord '\\' => 'Cast',
126							ord '_' => 'Word',
127							9 => 'Whitespace', # A horizontal tab
128							10 => 'Whitespace', # A newline
129							12 => 'Whitespace', # A form feed
130							13 => 'Whitespace', # A carriage return
131							32 => 'Whitespace', # A normal space
132							);
133
134							# Words (functions and keywords) after which a following / is
135							# almost certainly going to be a regex
136							my %MATCHWORD = map { $_ => 1 } qw{
137							return
138							split
139							if
140							unless
141							grep
142							map
143							};
144
145							sub __TOKENIZER__on_line_start {
146	51018			51018		67417	my $t = $_[1];
147	51018					68455	my $line = $t->{line};
148
149							# Can we classify the entire line in one go
150	51018	100				216499	if ( $line =~ /^\s*$/ ) {
		100
		100
		100
151							# A whitespace line
152	6652					15922	$t->_new_token( 'Whitespace', $line );
153	6652					11006	return 0;
154
155							} elsif ( $line =~ /^\s*#/ ) {
156							# A comment line
157	4317					10933	$t->_new_token( 'Comment', $line );
158	4317					8988	$t->_finalize_token;
159	4317					6982	return 0;
160
161							} elsif ( $line =~ /^=(\w+)/ ) {
162							# A Pod tag... change to pod mode
163	742					2265	$t->_new_token( 'Pod', $line );
164	742	50				2145	if ( $1 eq 'cut' ) {
165							# This is an error, but one we'll ignore
166							# Don't go into Pod mode, since =cut normally
167							# signals the end of Pod mode
168							} else {
169	742					1302	$t->{class} = 'PPI::Token::Pod';
170							}
171	742					1289	return 0;
172
173							} elsif ( $line =~ /^use v6\-alpha\;/ ) {
174							# Indicates a Perl 6 block. Make the initial
175							# implementation just suck in the entire rest of the
176							# file.
177	2					3	my @perl6;
178	2					2	while ( 1 ) {
179	693					934	my $line6 = $t->_get_line;
180	693	100				911	last unless defined $line6;
181	691					819	push @perl6, $line6;
182							}
183	2					3	push @{ $t->{perl6} }, join '', @perl6;
	2					87
184
185							# We only sucked in the block, we don't actually do
186							# anything to the "use v6..." line. So return as if
187							# we didn't find anything at all.
188	2					43	return 1;
189							}
190
191	39305					70103	1;
192							}
193
194							sub __TOKENIZER__on_char {
195	378528			378528		427637	my $t = $_[1];
196	378528					540888	my $c = substr $t->{line}, $t->{line_cursor}, 1;
197	378528					426522	my $char = ord $c;
198
199							# Do we definitely know what something is?
200	378528	100				794188	return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201
202							# Handle the simple option first
203	260853	100				727443	return $CLASSMAP{$char} if $CLASSMAP{$char};
204
205	19305	100				45934	if ( $char == 40 ) { # $char eq '('
		100
		100
		100
		100
		50
206							# Finalise any whitespace token...
207	8416	100				19859	$t->_finalize_token if $t->{token};
208
209							# Is this the beginning of a sub prototype?
210							# We are a sub prototype IF
211							# 1. The previous significant token is a bareword.
212							# 2. The one before that is the word 'sub'.
213							# 3. The one before that is a 'structure'
214
215							# Get the three previous significant tokens
216	8416					15761	my @tokens = $t->_previous_significant_tokens(3);
217
218							# A normal subroutine declaration
219	8416					12109	my $p1 = $tokens[1];
220	8416					10154	my $p2 = $tokens[2];
221	8416	50	100			66006	if (
			100
			100
			100
			33
			100
222							$tokens[0]
223							and
224							$tokens[0]->isa('PPI::Token::Word')
225							and
226							$p1
227							and
228							$p1->isa('PPI::Token::Word')
229							and
230							$p1->content eq 'sub'
231							and (
232							not $p2
233							or
234							$p2->isa('PPI::Token::Structure')
235							or (
236							$p2->isa('PPI::Token::Whitespace')
237							and
238							$p2->content eq ''
239							)
240							or (
241							# Lexical subroutine
242							$p2->isa('PPI::Token::Word')
243							and
244							$p2->content =~ /^(?:my\|our\|state)$/
245							)
246							)
247							) {
248							# This is a sub prototype
249	377					1314	return 'Prototype';
250							}
251
252							# A prototyped anonymous subroutine
253	8039					10752	my $p0 = $tokens[0];
254	8039	100	100			33944	if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
			100
			66
			66
255							# Maybe it's invoking a method named 'sub'
256							and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
257							) {
258	62					218	return 'Prototype';
259							}
260
261							# This is a normal open bracket
262	7977					26711	return 'Structure';
263
264							} elsif ( $char == 60 ) { # $char eq '<'
265							# Finalise any whitespace token...
266	2061	100				5310	$t->_finalize_token if $t->{token};
267
268							# This is either "less than" or "readline quote-like"
269							# Do some context stuff to guess which.
270	2061					4221	my $prev = $t->_last_significant_token;
271
272							# The most common group of less-thans are used like
273							# $foo < $bar
274							# 1 < $bar
275							# $#foo < $bar
276	2061	100	100			10310	return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
277	1729	50	66			6557	return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
278	1729	100	100			6760	return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
279	1698	50	66			6021	return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
280
281							# If it is <<... it's a here-doc instead
282	1698					3471	my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
283	1698	100				6239	return 'Operator' if $next_char =~ /<[^>]/;
284
285	899	100				1901	return 'Operator' if not $prev;
286
287							# The most common group of readlines are used like
288							# while ( <...> )
289							# while <>;
290	814					1624	my $prec = $prev->content;
291	814	100	100			9865	return 'QuoteLike::Readline'
			100
			100
			100
			100
			100
			100
			100
			100
292							if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
293							or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
294							or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
295							or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
296							or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
297
298	706	100	100			2150	if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
299							# Could go either way... do a regex check
300							# $foo->{bar} < 2;
301							# grep { .. } ;
302	18					57	pos $t->{line} = $t->{line_cursor};
303	18	100				71	if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
304							# Almost definitely readline
305	3					13	return 'QuoteLike::Readline';
306							}
307							}
308
309							# Otherwise, we guess operator, which has been the default up
310							# until this more comprehensive section was created.
311	703					2099	return 'Operator';
312
313							} elsif ( $char == 47 ) { # $char eq '/'
314							# Finalise any whitespace token...
315	1054	100				3350	$t->_finalize_token if $t->{token};
316
317							# This is either a "divided by" or a "start regex"
318							# Do some context stuff to guess ( ack ) which.
319							# Hopefully the guess will be good enough.
320	1054					2349	my $prev = $t->_last_significant_token;
321
322							# Or as the very first thing in a file
323	1054	100				3254	return 'Regexp::Match' if not $prev;
324
325	954					2147	my $prec = $prev->content;
326
327							# Most times following an operator, we are a regex.
328							# This includes cases such as:
329							# , - As an argument in a list
330							# .. - The second condition in a flip flop
331							# =~ - A bound regex
332							# !~ - Ditto
333	954	100				4085	return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
334
335							# After a symbol
336	511	100				1675	return 'Operator' if $prev->isa('PPI::Token::Symbol');
337	442	100	66			1049	if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
338	9					31	return 'Operator';
339							}
340
341							# After another number
342	433	100				1247	return 'Operator' if $prev->isa('PPI::Token::Number');
343
344							# After going into scope/brackets
345	395	100	100			1480	if (
			100
346							$prev->isa('PPI::Token::Structure')
347							and (
348							$prec eq '('
349							or
350							$prec eq '{'
351							or
352							$prec eq ';'
353							)
354							) {
355	49					167	return 'Regexp::Match';
356							}
357
358							# Functions and keywords
359	346	100	66			1012	if (
360							$MATCHWORD{$prec}
361							and
362							$prev->isa('PPI::Token::Word')
363							) {
364	69					239	return 'Regexp::Match';
365							}
366
367							# What about the char after the slash? There's some things
368							# that would be highly illogical to see if it's an operator.
369	277					496	my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
370	277	50	33			795	if ( defined $next_char and length $next_char ) {
371	277	100				966	if ( $next_char =~ /(?:\^\|\[\|\\)/ ) {
372	10					36	return 'Regexp::Match';
373							}
374							}
375
376							# Otherwise... erm... assume operator?
377							# Add more tests here as potential cases come to light
378	267					791	return 'Operator';
379
380							} elsif ( $char == 120 ) { # $char eq 'x'
381							# Could be a word, the x= operator, the x operator
382							# followed by whitespace, or the x operator without any
383							# space between itself and its operand, e.g.: '$a x3',
384							# which is the same as '$a x 3'. _current_x_is_operator
385							# assumes we have a complete 'x' token, but we don't
386							# yet. We may need to split this x character apart from
387							# what follows it.
388	854	100				1827	if ( $t->_current_x_is_operator ) {
389	208					559	pos $t->{line} = $t->{line_cursor} + 1;
390	208	100				1053	return 'Operator' if $t->{line} =~ m/\G(?:
391							\d # x op with no whitespace e.g. 'x3'
392							\|
393							(?!( # negative lookahead
394							=> # not on left of fat comma
395							\|
396							\w # not a word like "xyzzy"
397							\|
398							\s # not x op plus whitespace
399							))
400							)/gcx;
401							}
402
403							# Otherwise, commit like a normal bareword, including x
404							# operator followed by whitespace.
405	722					1910	return PPI::Token::Word->__TOKENIZER__commit($t);
406
407							} elsif ( $char == 45 ) { # $char eq '-'
408							# Look for an obvious operator operand context
409	6914					13473	my $context = $t->_opcontext;
410	6914	100				12432	if ( $context eq 'operator' ) {
411	4981					12836	return 'Operator';
412							} else {
413							# More logic needed
414	1933					5151	return 'Unknown';
415							}
416
417							} elsif ( $char >= 128 ) { # Outside ASCII
418	6	100				38	return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
419	1	50				13	return 'Whitespace' if $c =~ /\s/;
420							}
421
422
423							# All the whitespaces are covered, so what to do
424							### For now, die
425	1					8	PPI::Exception->throw("Encountered unexpected character '$char'");
426							}
427
428							sub __TOKENIZER__on_line_end {
429	38924	100		38924		128763	$_[1]->_finalize_token if $_[1]->{token};
430							}
431
432							1;
433
434							=pod
435
436							=head1 SUPPORT
437
438							See the L in the main module.
439
440							=head1 AUTHOR
441
442							Adam Kennedy Eadamk@cpan.orgE
443
444							=head1 COPYRIGHT
445
446							Copyright 2001 - 2011 Adam Kennedy.
447
448							This program is free software; you can redistribute
449							it and/or modify it under the same terms as Perl itself.
450
451							The full text of the license can be found in the
452							LICENSE file included with this module.
453
454							=cut