File Coverage

blib/lib/PPI/Token/Whitespace.pm
Criterion Covered Total %
statement 94 96 97.9
branch 79 86 91.8
condition 77 90 85.5
subroutine 7 8 87.5
pod 2 2 100.0
total 259 282 91.8


line stmt bran cond sub pod time code
1             package PPI::Token::Whitespace;
2              
3             =pod
4              
5             =head1 NAME
6              
7             PPI::Token::Whitespace - Tokens representing ordinary white space
8              
9             =head1 INHERITANCE
10              
11             PPI::Token::Whitespace
12             isa PPI::Token
13             isa PPI::Element
14              
15             =head1 DESCRIPTION
16              
17             As a full "round-trip" parser, PPI records every last byte in a
18             file and ensure that it is included in the L object.
19              
20             This even includes whitespace. In fact, Perl documents are seen
21             as "floating in a sea of whitespace", and thus any document will
22             contain vast quantities of C objects.
23              
24             For the most part, you shouldn't notice them. Or at least, you
25             shouldn't B to notice them.
26              
27             This means doing things like consistently using the "S for significant"
28             series of L and L methods to do things.
29              
30             If you want the nth child element, you should be using C rather
31             than C, and likewise C, C, and
32             so on and so forth.
33              
34             =head1 METHODS
35              
36             Again, for the most part you should really B need to do anything
37             very significant with whitespace.
38              
39             But there are a couple of convenience methods provided, beyond those
40             provided by the parent L and L classes.
41              
42             =cut
43              
44 64     64   56469 use strict;
  64         117  
  64         1398  
45 64     64   590 use Clone ();
  64         2148  
  64         688  
46 64     64   565 use PPI::Token ();
  64         137  
  64         89715  
47              
48             our $VERSION = '1.275';
49              
50             our @ISA = "PPI::Token";
51              
52             =pod
53              
54             =head2 null
55              
56             Because L sees documents as sitting on a sort of substrate made of
57             whitespace, there are a couple of corner cases that get particularly
58             nasty if they don't find whitespace in certain places.
59              
60             Imagine walking down the beach to go into the ocean, and then quite
61             unexpectedly falling off the side of the planet. Well it's somewhat
62             equivalent to that, including the whole screaming death bit.
63              
64             The C method is a convenience provided to get some internals
65             out of some of these corner cases.
66              
67             Specifically it create a whitespace token that represents nothing,
68             or at least the null string C<''>. It's a handy way to have some
69             "whitespace" right where you need it, without having to have any
70             actual characters.
71              
72             =cut
73              
74             my $null;
75              
76             sub null {
77 0   0 0 1 0 $null ||= $_[0]->new('');
78 0         0 Clone::clone($null);
79             }
80              
81             ### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82             sub significant() { '' }
83              
84             =pod
85              
86             =head2 tidy
87              
88             C is a convenience method for removing unneeded whitespace.
89              
90             Specifically, it removes any whitespace from the end of a line.
91              
92             Note that this B include POD, where you may well need
93             to keep certain types of whitespace. The entire POD chunk lives
94             in its own L object.
95              
96             =cut
97              
98             sub tidy {
99 2     2 1 10 $_[0]->{content} =~ s/^\s+?(?>\n)//;
100 2         8 1;
101             }
102              
103              
104              
105              
106              
107             #####################################################################
108             # Parsing Methods
109              
110             # Build the class and commit maps
111             my %COMMITMAP = (
112             map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113             map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114             ord '#' => 'PPI::Token::Comment',
115             ord 'v' => 'PPI::Token::Number::Version',
116             );
117             my %CLASSMAP = (
118             map( { ord $_ => 'Number' } 0 .. 9 ),
119             map( { ord $_ => 'Operator' } qw" = ? | + > . ! ~ ^ " ),
120             map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121             ord ',' => 'PPI::Token::Operator',
122             ord "'" => 'Quote::Single',
123             ord '"' => 'Quote::Double',
124             ord '`' => 'QuoteLike::Backtick',
125             ord '\\' => 'Cast',
126             ord '_' => 'Word',
127             9 => 'Whitespace', # A horizontal tab
128             10 => 'Whitespace', # A newline
129             12 => 'Whitespace', # A form feed
130             13 => 'Whitespace', # A carriage return
131             32 => 'Whitespace', # A normal space
132             );
133              
134             # Words (functions and keywords) after which a following / is
135             # almost certainly going to be a regex
136             my %MATCHWORD = map { $_ => 1 } qw{
137             return
138             split
139             if
140             unless
141             grep
142             map
143             };
144              
145             sub __TOKENIZER__on_line_start {
146 51018     51018   67417 my $t = $_[1];
147 51018         68455 my $line = $t->{line};
148              
149             # Can we classify the entire line in one go
150 51018 100       216499 if ( $line =~ /^\s*$/ ) {
    100          
    100          
    100          
151             # A whitespace line
152 6652         15922 $t->_new_token( 'Whitespace', $line );
153 6652         11006 return 0;
154              
155             } elsif ( $line =~ /^\s*#/ ) {
156             # A comment line
157 4317         10933 $t->_new_token( 'Comment', $line );
158 4317         8988 $t->_finalize_token;
159 4317         6982 return 0;
160              
161             } elsif ( $line =~ /^=(\w+)/ ) {
162             # A Pod tag... change to pod mode
163 742         2265 $t->_new_token( 'Pod', $line );
164 742 50       2145 if ( $1 eq 'cut' ) {
165             # This is an error, but one we'll ignore
166             # Don't go into Pod mode, since =cut normally
167             # signals the end of Pod mode
168             } else {
169 742         1302 $t->{class} = 'PPI::Token::Pod';
170             }
171 742         1289 return 0;
172              
173             } elsif ( $line =~ /^use v6\-alpha\;/ ) {
174             # Indicates a Perl 6 block. Make the initial
175             # implementation just suck in the entire rest of the
176             # file.
177 2         3 my @perl6;
178 2         2 while ( 1 ) {
179 693         934 my $line6 = $t->_get_line;
180 693 100       911 last unless defined $line6;
181 691         819 push @perl6, $line6;
182             }
183 2         3 push @{ $t->{perl6} }, join '', @perl6;
  2         87  
184              
185             # We only sucked in the block, we don't actually do
186             # anything to the "use v6..." line. So return as if
187             # we didn't find anything at all.
188 2         43 return 1;
189             }
190              
191 39305         70103 1;
192             }
193              
194             sub __TOKENIZER__on_char {
195 378528     378528   427637 my $t = $_[1];
196 378528         540888 my $c = substr $t->{line}, $t->{line_cursor}, 1;
197 378528         426522 my $char = ord $c;
198              
199             # Do we definitely know what something is?
200 378528 100       794188 return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201              
202             # Handle the simple option first
203 260853 100       727443 return $CLASSMAP{$char} if $CLASSMAP{$char};
204              
205 19305 100       45934 if ( $char == 40 ) { # $char eq '('
    100          
    100          
    100          
    100          
    50          
206             # Finalise any whitespace token...
207 8416 100       19859 $t->_finalize_token if $t->{token};
208              
209             # Is this the beginning of a sub prototype?
210             # We are a sub prototype IF
211             # 1. The previous significant token is a bareword.
212             # 2. The one before that is the word 'sub'.
213             # 3. The one before that is a 'structure'
214              
215             # Get the three previous significant tokens
216 8416         15761 my @tokens = $t->_previous_significant_tokens(3);
217              
218             # A normal subroutine declaration
219 8416         12109 my $p1 = $tokens[1];
220 8416         10154 my $p2 = $tokens[2];
221 8416 50 100     66006 if (
      100        
      100        
      100        
      33        
      100        
222             $tokens[0]
223             and
224             $tokens[0]->isa('PPI::Token::Word')
225             and
226             $p1
227             and
228             $p1->isa('PPI::Token::Word')
229             and
230             $p1->content eq 'sub'
231             and (
232             not $p2
233             or
234             $p2->isa('PPI::Token::Structure')
235             or (
236             $p2->isa('PPI::Token::Whitespace')
237             and
238             $p2->content eq ''
239             )
240             or (
241             # Lexical subroutine
242             $p2->isa('PPI::Token::Word')
243             and
244             $p2->content =~ /^(?:my|our|state)$/
245             )
246             )
247             ) {
248             # This is a sub prototype
249 377         1314 return 'Prototype';
250             }
251              
252             # A prototyped anonymous subroutine
253 8039         10752 my $p0 = $tokens[0];
254 8039 100 100     33944 if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
      100        
      66        
      66        
255             # Maybe it's invoking a method named 'sub'
256             and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
257             ) {
258 62         218 return 'Prototype';
259             }
260              
261             # This is a normal open bracket
262 7977         26711 return 'Structure';
263              
264             } elsif ( $char == 60 ) { # $char eq '<'
265             # Finalise any whitespace token...
266 2061 100       5310 $t->_finalize_token if $t->{token};
267              
268             # This is either "less than" or "readline quote-like"
269             # Do some context stuff to guess which.
270 2061         4221 my $prev = $t->_last_significant_token;
271              
272             # The most common group of less-thans are used like
273             # $foo < $bar
274             # 1 < $bar
275             # $#foo < $bar
276 2061 100 100     10310 return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
277 1729 50 66     6557 return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
278 1729 100 100     6760 return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
279 1698 50 66     6021 return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
280              
281             # If it is <<... it's a here-doc instead
282 1698         3471 my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
283 1698 100       6239 return 'Operator' if $next_char =~ /<[^>]/;
284              
285 899 100       1901 return 'Operator' if not $prev;
286              
287             # The most common group of readlines are used like
288             # while ( <...> )
289             # while <>;
290 814         1624 my $prec = $prev->content;
291 814 100 100     9865 return 'QuoteLike::Readline'
      100        
      100        
      100        
      100        
      100        
      100        
      100        
      100        
292             if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
293             or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
294             or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
295             or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
296             or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
297              
298 706 100 100     2150 if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
299             # Could go either way... do a regex check
300             # $foo->{bar} < 2;
301             # grep { .. } ;
302 18         57 pos $t->{line} = $t->{line_cursor};
303 18 100       71 if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
304             # Almost definitely readline
305 3         13 return 'QuoteLike::Readline';
306             }
307             }
308              
309             # Otherwise, we guess operator, which has been the default up
310             # until this more comprehensive section was created.
311 703         2099 return 'Operator';
312              
313             } elsif ( $char == 47 ) { # $char eq '/'
314             # Finalise any whitespace token...
315 1054 100       3350 $t->_finalize_token if $t->{token};
316              
317             # This is either a "divided by" or a "start regex"
318             # Do some context stuff to guess ( ack ) which.
319             # Hopefully the guess will be good enough.
320 1054         2349 my $prev = $t->_last_significant_token;
321              
322             # Or as the very first thing in a file
323 1054 100       3254 return 'Regexp::Match' if not $prev;
324              
325 954         2147 my $prec = $prev->content;
326              
327             # Most times following an operator, we are a regex.
328             # This includes cases such as:
329             # , - As an argument in a list
330             # .. - The second condition in a flip flop
331             # =~ - A bound regex
332             # !~ - Ditto
333 954 100       4085 return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
334              
335             # After a symbol
336 511 100       1675 return 'Operator' if $prev->isa('PPI::Token::Symbol');
337 442 100 66     1049 if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
338 9         31 return 'Operator';
339             }
340              
341             # After another number
342 433 100       1247 return 'Operator' if $prev->isa('PPI::Token::Number');
343              
344             # After going into scope/brackets
345 395 100 100     1480 if (
      100        
346             $prev->isa('PPI::Token::Structure')
347             and (
348             $prec eq '('
349             or
350             $prec eq '{'
351             or
352             $prec eq ';'
353             )
354             ) {
355 49         167 return 'Regexp::Match';
356             }
357              
358             # Functions and keywords
359 346 100 66     1012 if (
360             $MATCHWORD{$prec}
361             and
362             $prev->isa('PPI::Token::Word')
363             ) {
364 69         239 return 'Regexp::Match';
365             }
366              
367             # What about the char after the slash? There's some things
368             # that would be highly illogical to see if it's an operator.
369 277         496 my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
370 277 50 33     795 if ( defined $next_char and length $next_char ) {
371 277 100       966 if ( $next_char =~ /(?:\^|\[|\\)/ ) {
372 10         36 return 'Regexp::Match';
373             }
374             }
375              
376             # Otherwise... erm... assume operator?
377             # Add more tests here as potential cases come to light
378 267         791 return 'Operator';
379              
380             } elsif ( $char == 120 ) { # $char eq 'x'
381             # Could be a word, the x= operator, the x operator
382             # followed by whitespace, or the x operator without any
383             # space between itself and its operand, e.g.: '$a x3',
384             # which is the same as '$a x 3'. _current_x_is_operator
385             # assumes we have a complete 'x' token, but we don't
386             # yet. We may need to split this x character apart from
387             # what follows it.
388 854 100       1827 if ( $t->_current_x_is_operator ) {
389 208         559 pos $t->{line} = $t->{line_cursor} + 1;
390 208 100       1053 return 'Operator' if $t->{line} =~ m/\G(?:
391             \d # x op with no whitespace e.g. 'x3'
392             |
393             (?!( # negative lookahead
394             => # not on left of fat comma
395             |
396             \w # not a word like "xyzzy"
397             |
398             \s # not x op plus whitespace
399             ))
400             )/gcx;
401             }
402              
403             # Otherwise, commit like a normal bareword, including x
404             # operator followed by whitespace.
405 722         1910 return PPI::Token::Word->__TOKENIZER__commit($t);
406              
407             } elsif ( $char == 45 ) { # $char eq '-'
408             # Look for an obvious operator operand context
409 6914         13473 my $context = $t->_opcontext;
410 6914 100       12432 if ( $context eq 'operator' ) {
411 4981         12836 return 'Operator';
412             } else {
413             # More logic needed
414 1933         5151 return 'Unknown';
415             }
416              
417             } elsif ( $char >= 128 ) { # Outside ASCII
418 6 100       38 return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
419 1 50       13 return 'Whitespace' if $c =~ /\s/;
420             }
421              
422              
423             # All the whitespaces are covered, so what to do
424             ### For now, die
425 1         8 PPI::Exception->throw("Encountered unexpected character '$char'");
426             }
427              
428             sub __TOKENIZER__on_line_end {
429 38924 100   38924   128763 $_[1]->_finalize_token if $_[1]->{token};
430             }
431              
432             1;
433              
434             =pod
435              
436             =head1 SUPPORT
437              
438             See the L in the main module.
439              
440             =head1 AUTHOR
441              
442             Adam Kennedy Eadamk@cpan.orgE
443              
444             =head1 COPYRIGHT
445              
446             Copyright 2001 - 2011 Adam Kennedy.
447              
448             This program is free software; you can redistribute
449             it and/or modify it under the same terms as Perl itself.
450              
451             The full text of the license can be found in the
452             LICENSE file included with this module.
453              
454             =cut