File Coverage

blib/lib/PPI/Token/Whitespace.pm
Criterion Covered Total %
statement 94 96 97.9
branch 79 86 91.8
condition 77 90 85.5
subroutine 7 8 87.5
pod 2 2 100.0
total 259 282 91.8


line stmt bran cond sub pod time code
1             package PPI::Token::Whitespace;
2              
3             =pod
4              
5             =head1 NAME
6              
7             PPI::Token::Whitespace - Tokens representing ordinary white space
8              
9             =head1 INHERITANCE
10              
11             PPI::Token::Whitespace
12             isa PPI::Token
13             isa PPI::Element
14              
15             =head1 DESCRIPTION
16              
17             As a full "round-trip" parser, PPI records every last byte in a
18             file and ensure that it is included in the L object.
19              
20             This even includes whitespace. In fact, Perl documents are seen
21             as "floating in a sea of whitespace", and thus any document will
22             contain vast quantities of C objects.
23              
24             For the most part, you shouldn't notice them. Or at least, you
25             shouldn't B to notice them.
26              
27             This means doing things like consistently using the "S for significant"
28             series of L and L methods to do things.
29              
30             If you want the nth child element, you should be using C rather
31             than C, and likewise C, C, and
32             so on and so forth.
33              
34             =head1 METHODS
35              
36             Again, for the most part you should really B need to do anything
37             very significant with whitespace.
38              
39             But there are a couple of convenience methods provided, beyond those
40             provided by the parent L and L classes.
41              
42             =cut
43              
44 65     65   72403 use strict;
  65         140  
  65         1937  
45 65     65   763 use Clone ();
  65         2648  
  65         790  
46 65     65   751 use PPI::Token ();
  65         177  
  65         118238  
47              
48             our $VERSION = '1.277';
49              
50             our @ISA = "PPI::Token";
51              
52             =pod
53              
54             =head2 null
55              
56             Because L sees documents as sitting on a sort of substrate made of
57             whitespace, there are a couple of corner cases that get particularly
58             nasty if they don't find whitespace in certain places.
59              
60             Imagine walking down the beach to go into the ocean, and then quite
61             unexpectedly falling off the side of the planet. Well it's somewhat
62             equivalent to that, including the whole screaming death bit.
63              
64             The C method is a convenience provided to get some internals
65             out of some of these corner cases.
66              
67             Specifically it create a whitespace token that represents nothing,
68             or at least the null string C<''>. It's a handy way to have some
69             "whitespace" right where you need it, without having to have any
70             actual characters.
71              
72             =cut
73              
74             my $null;
75              
76             sub null {
77 0   0 0 1 0 $null ||= $_[0]->new('');
78 0         0 Clone::clone($null);
79             }
80              
81             ### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82             sub significant() { '' }
83              
84             =pod
85              
86             =head2 tidy
87              
88             C is a convenience method for removing unneeded whitespace.
89              
90             Specifically, it removes any whitespace from the end of a line.
91              
92             Note that this B include POD, where you may well need
93             to keep certain types of whitespace. The entire POD chunk lives
94             in its own L object.
95              
96             =cut
97              
98             sub tidy {
99 2     2 1 19 $_[0]->{content} =~ s/^\s+?(?>\n)//;
100 2         9 1;
101             }
102              
103              
104              
105              
106              
107             #####################################################################
108             # Parsing Methods
109              
110             # Build the class and commit maps
111             my %COMMITMAP = (
112             map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113             map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114             ord '#' => 'PPI::Token::Comment',
115             ord 'v' => 'PPI::Token::Number::Version',
116             );
117             my %CLASSMAP = (
118             map( { ord $_ => 'Number' } 0 .. 9 ),
119             map( { ord $_ => 'Operator' } qw" = ? | + > . ! ~ ^ " ),
120             map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121             ord ',' => 'PPI::Token::Operator',
122             ord "'" => 'Quote::Single',
123             ord '"' => 'Quote::Double',
124             ord '`' => 'QuoteLike::Backtick',
125             ord '\\' => 'Cast',
126             ord '_' => 'Word',
127             9 => 'Whitespace', # A horizontal tab
128             10 => 'Whitespace', # A newline
129             12 => 'Whitespace', # A form feed
130             13 => 'Whitespace', # A carriage return
131             32 => 'Whitespace', # A normal space
132             );
133              
134             # Words (functions and keywords) after which a following / is
135             # almost certainly going to be a regex
136             my %MATCHWORD = map { $_ => 1 } qw{
137             return
138             split
139             if
140             unless
141             grep
142             map
143             };
144              
145             sub __TOKENIZER__on_line_start {
146 51324     51324   77147 my $t = $_[1];
147 51324         89123 my $line = $t->{line};
148              
149             # Can we classify the entire line in one go
150 51324 100       263169 if ( $line =~ /^\s*$/ ) {
    100          
    100          
    100          
151             # A whitespace line
152 6679         19591 $t->_new_token( 'Whitespace', $line );
153 6679         13718 return 0;
154              
155             } elsif ( $line =~ /^\s*#/ ) {
156             # A comment line
157 4323         12764 $t->_new_token( 'Comment', $line );
158 4323         10693 $t->_finalize_token;
159 4323         8792 return 0;
160              
161             } elsif ( $line =~ /^=(\w+)/ ) {
162             # A Pod tag... change to pod mode
163 747         2664 $t->_new_token( 'Pod', $line );
164 747 50       2747 if ( $1 eq 'cut' ) {
165             # This is an error, but one we'll ignore
166             # Don't go into Pod mode, since =cut normally
167             # signals the end of Pod mode
168             } else {
169 747         1625 $t->{class} = 'PPI::Token::Pod';
170             }
171 747         1636 return 0;
172              
173             } elsif ( $line =~ /^use v6\-alpha\;/ ) {
174             # Indicates a Perl 6 block. Make the initial
175             # implementation just suck in the entire rest of the
176             # file.
177 2         5 my @perl6;
178 2         2 while ( 1 ) {
179 693         1110 my $line6 = $t->_get_line;
180 693 100       1067 last unless defined $line6;
181 691         980 push @perl6, $line6;
182             }
183 2         5 push @{ $t->{perl6} }, join '', @perl6;
  2         121  
184              
185             # We only sucked in the block, we don't actually do
186             # anything to the "use v6..." line. So return as if
187             # we didn't find anything at all.
188 2         42 return 1;
189             }
190              
191 39573         89516 1;
192             }
193              
194             sub __TOKENIZER__on_char {
195 380726     380726   525132 my $t = $_[1];
196 380726         672729 my $c = substr $t->{line}, $t->{line_cursor}, 1;
197 380726         539206 my $char = ord $c;
198              
199             # Do we definitely know what something is?
200 380726 100       1004421 return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201              
202             # Handle the simple option first
203 262862 100       900046 return $CLASSMAP{$char} if $CLASSMAP{$char};
204              
205 19098 100       61072 if ( $char == 40 ) { # $char eq '('
    100          
    100          
    100          
    100          
    50          
206             # Finalise any whitespace token...
207 8194 100       27151 $t->_finalize_token if $t->{token};
208              
209             # Is this the beginning of a sub prototype?
210             # We are a sub prototype IF
211             # 1. The previous significant token is a bareword.
212             # 2. The one before that is the word 'sub'.
213             # 3. The one before that is a 'structure'
214              
215             # Get the three previous significant tokens
216 8194         18705 my @tokens = $t->_previous_significant_tokens(3);
217              
218             # A normal subroutine declaration
219 8194         13917 my $p1 = $tokens[1];
220 8194         12921 my $p2 = $tokens[2];
221 8194 50 100     79049 if (
      100        
      100        
      100        
      33        
      100        
222             $tokens[0]
223             and
224             $tokens[0]->isa('PPI::Token::Word')
225             and
226             $p1
227             and
228             $p1->isa('PPI::Token::Word')
229             and
230             $p1->content eq 'sub'
231             and (
232             not $p2
233             or
234             $p2->isa('PPI::Token::Structure')
235             or (
236             $p2->isa('PPI::Token::Whitespace')
237             and
238             $p2->content eq ''
239             )
240             or (
241             # Lexical subroutine
242             $p2->isa('PPI::Token::Word')
243             and
244             $p2->content =~ /^(?:my|our|state)$/
245             )
246             )
247             ) {
248             # This is a sub prototype
249 317         1428 return 'Prototype';
250             }
251              
252             # A prototyped anonymous subroutine
253 7877         13069 my $p0 = $tokens[0];
254 7877 100 100     41990 if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
      100        
      66        
      66        
255             # Maybe it's invoking a method named 'sub'
256             and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
257             ) {
258 42         214 return 'Prototype';
259             }
260              
261             # This is a normal open bracket
262 7835         32545 return 'Structure';
263              
264             } elsif ( $char == 60 ) { # $char eq '<'
265             # Finalise any whitespace token...
266 2064 100       6795 $t->_finalize_token if $t->{token};
267              
268             # This is either "less than" or "readline quote-like"
269             # Do some context stuff to guess which.
270 2064         5654 my $prev = $t->_last_significant_token;
271              
272             # The most common group of less-thans are used like
273             # $foo < $bar
274             # 1 < $bar
275             # $#foo < $bar
276 2064 100 100     13837 return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
277 1734 50 66     8266 return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
278 1734 100 100     7979 return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
279 1705 50 66     7766 return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
280              
281             # If it is <<... it's a here-doc instead
282 1705         4426 my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
283 1705 100       7703 return 'Operator' if $next_char =~ /<[^>]/;
284              
285 908 100       2411 return 'Operator' if not $prev;
286              
287             # The most common group of readlines are used like
288             # while ( <...> )
289             # while <>;
290 829         2169 my $prec = $prev->content;
291 829 100 100     14276 return 'QuoteLike::Readline'
      100        
      100        
      100        
      100        
      100        
      100        
      100        
      100        
292             if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
293             or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
294             or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
295             or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
296             or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
297              
298 712 100 100     2837 if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
299             # Could go either way... do a regex check
300             # $foo->{bar} < 2;
301             # grep { .. } ;
302 12         45 pos $t->{line} = $t->{line_cursor};
303 12 100       69 if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
304             # Almost definitely readline
305 3         14 return 'QuoteLike::Readline';
306             }
307             }
308              
309             # Otherwise, we guess operator, which has been the default up
310             # until this more comprehensive section was created.
311 709         2903 return 'Operator';
312              
313             } elsif ( $char == 47 ) { # $char eq '/'
314             # Finalise any whitespace token...
315 1027 100       4577 $t->_finalize_token if $t->{token};
316              
317             # This is either a "divided by" or a "start regex"
318             # Do some context stuff to guess ( ack ) which.
319             # Hopefully the guess will be good enough.
320 1027         3287 my $prev = $t->_last_significant_token;
321              
322             # Or as the very first thing in a file
323 1027 100       4030 return 'Regexp::Match' if not $prev;
324              
325 938         2495 my $prec = $prev->content;
326              
327             # Most times following an operator, we are a regex.
328             # This includes cases such as:
329             # , - As an argument in a list
330             # .. - The second condition in a flip flop
331             # =~ - A bound regex
332             # !~ - Ditto
333 938 100       4875 return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
334              
335             # After a symbol
336 492 100       2003 return 'Operator' if $prev->isa('PPI::Token::Symbol');
337 429 100 66     1475 if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
338 11         50 return 'Operator';
339             }
340              
341             # After another number
342 418 100       1850 return 'Operator' if $prev->isa('PPI::Token::Number');
343              
344             # After going into scope/brackets
345 373 100 100     2190 if (
      100        
346             $prev->isa('PPI::Token::Structure')
347             and (
348             $prec eq '('
349             or
350             $prec eq '{'
351             or
352             $prec eq ';'
353             )
354             ) {
355 53         201 return 'Regexp::Match';
356             }
357              
358             # Functions and keywords
359 320 100 66     1394 if (
360             $MATCHWORD{$prec}
361             and
362             $prev->isa('PPI::Token::Word')
363             ) {
364 69         289 return 'Regexp::Match';
365             }
366              
367             # What about the char after the slash? There's some things
368             # that would be highly illogical to see if it's an operator.
369 251         759 my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
370 251 50 33     1035 if ( defined $next_char and length $next_char ) {
371 251 100       1071 if ( $next_char =~ /(?:\^|\[|\\)/ ) {
372 17         105 return 'Regexp::Match';
373             }
374             }
375              
376             # Otherwise... erm... assume operator?
377             # Add more tests here as potential cases come to light
378 234         1025 return 'Operator';
379              
380             } elsif ( $char == 120 ) { # $char eq 'x'
381             # Could be a word, the x= operator, the x operator
382             # followed by whitespace, or the x operator without any
383             # space between itself and its operand, e.g.: '$a x3',
384             # which is the same as '$a x 3'. _current_x_is_operator
385             # assumes we have a complete 'x' token, but we don't
386             # yet. We may need to split this x character apart from
387             # what follows it.
388 815 100       2299 if ( $t->_current_x_is_operator ) {
389 207         819 pos $t->{line} = $t->{line_cursor} + 1;
390 207 100       1560 return 'Operator' if $t->{line} =~ m/\G(?:
391             \d # x op with no whitespace e.g. 'x3'
392             |
393             (?!( # negative lookahead
394             => # not on left of fat comma
395             |
396             \w # not a word like "xyzzy"
397             |
398             \s # not x op plus whitespace
399             ))
400             )/gcx;
401             }
402              
403             # Otherwise, commit like a normal bareword, including x
404             # operator followed by whitespace.
405 696         2460 return PPI::Token::Word->__TOKENIZER__commit($t);
406              
407             } elsif ( $char == 45 ) { # $char eq '-'
408             # Look for an obvious operator operand context
409 6992         17005 my $context = $t->_opcontext;
410 6992 100       15249 if ( $context eq 'operator' ) {
411 5121         16300 return 'Operator';
412             } else {
413             # More logic needed
414 1871         6719 return 'Unknown';
415             }
416              
417             } elsif ( $char >= 128 ) { # Outside ASCII
418 6 100       52 return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
419 1 50       5 return 'Whitespace' if $c =~ /\s/;
420             }
421              
422              
423             # All the whitespaces are covered, so what to do
424             ### For now, die
425 1         11 PPI::Exception->throw("Encountered unexpected character '$char'");
426             }
427              
428             sub __TOKENIZER__on_line_end {
429 39201 100   39201   171012 $_[1]->_finalize_token if $_[1]->{token};
430             }
431              
432             1;
433              
434             =pod
435              
436             =head1 SUPPORT
437              
438             See the L in the main module.
439              
440             =head1 AUTHOR
441              
442             Adam Kennedy Eadamk@cpan.orgE
443              
444             =head1 COPYRIGHT
445              
446             Copyright 2001 - 2011 Adam Kennedy.
447              
448             This program is free software; you can redistribute
449             it and/or modify it under the same terms as Perl itself.
450              
451             The full text of the license can be found in the
452             LICENSE file included with this module.
453              
454             =cut