File Coverage

blib/lib/PPI/Token/Whitespace.pm
Criterion Covered Total %
statement 94 96 97.9
branch 79 86 91.8
condition 77 90 85.5
subroutine 7 8 87.5
pod 2 2 100.0
total 259 282 91.8


line stmt bran cond sub pod time code
1             package PPI::Token::Whitespace;
2              
3             =pod
4              
5             =head1 NAME
6              
7             PPI::Token::Whitespace - Tokens representing ordinary white space
8              
9             =head1 INHERITANCE
10              
11             PPI::Token::Whitespace
12             isa PPI::Token
13             isa PPI::Element
14              
15             =head1 DESCRIPTION
16              
17             As a full "round-trip" parser, PPI records every last byte in a
18             file and ensure that it is included in the L object.
19              
20             This even includes whitespace. In fact, Perl documents are seen
21             as "floating in a sea of whitespace", and thus any document will
22             contain vast quantities of C objects.
23              
24             For the most part, you shouldn't notice them. Or at least, you
25             shouldn't B to notice them.
26              
27             This means doing things like consistently using the "S for significant"
28             series of L and L methods to do things.
29              
30             If you want the nth child element, you should be using C rather
31             than C, and likewise C, C, and
32             so on and so forth.
33              
34             =head1 METHODS
35              
36             Again, for the most part you should really B need to do anything
37             very significant with whitespace.
38              
39             But there are a couple of convenience methods provided, beyond those
40             provided by the parent L and L classes.
41              
42             =cut
43              
44 65     65   66907 use strict;
  65         114  
  65         1470  
45 65     65   582 use Clone ();
  65         2162  
  65         746  
46 65     65   604 use PPI::Token ();
  65         132  
  65         91934  
47              
48             our $VERSION = '1.276';
49              
50             our @ISA = "PPI::Token";
51              
52             =pod
53              
54             =head2 null
55              
56             Because L sees documents as sitting on a sort of substrate made of
57             whitespace, there are a couple of corner cases that get particularly
58             nasty if they don't find whitespace in certain places.
59              
60             Imagine walking down the beach to go into the ocean, and then quite
61             unexpectedly falling off the side of the planet. Well it's somewhat
62             equivalent to that, including the whole screaming death bit.
63              
64             The C method is a convenience provided to get some internals
65             out of some of these corner cases.
66              
67             Specifically it create a whitespace token that represents nothing,
68             or at least the null string C<''>. It's a handy way to have some
69             "whitespace" right where you need it, without having to have any
70             actual characters.
71              
72             =cut
73              
74             my $null;
75              
76             sub null {
77 0   0 0 1 0 $null ||= $_[0]->new('');
78 0         0 Clone::clone($null);
79             }
80              
81             ### XS -> PPI/XS.xs:_PPI_Token_Whitespace__significant 0.900+
82             sub significant() { '' }
83              
84             =pod
85              
86             =head2 tidy
87              
88             C is a convenience method for removing unneeded whitespace.
89              
90             Specifically, it removes any whitespace from the end of a line.
91              
92             Note that this B include POD, where you may well need
93             to keep certain types of whitespace. The entire POD chunk lives
94             in its own L object.
95              
96             =cut
97              
98             sub tidy {
99 2     2 1 11 $_[0]->{content} =~ s/^\s+?(?>\n)//;
100 2         6 1;
101             }
102              
103              
104              
105              
106              
107             #####################################################################
108             # Parsing Methods
109              
110             # Build the class and commit maps
111             my %COMMITMAP = (
112             map( { ord $_ => 'PPI::Token::Word' } 'a' .. 'u', 'A' .. 'Z', qw" w y z _ " ), # no v or x
113             map( { ord $_ => 'PPI::Token::Structure' } qw" ; [ ] { } ) " ),
114             ord '#' => 'PPI::Token::Comment',
115             ord 'v' => 'PPI::Token::Number::Version',
116             );
117             my %CLASSMAP = (
118             map( { ord $_ => 'Number' } 0 .. 9 ),
119             map( { ord $_ => 'Operator' } qw" = ? | + > . ! ~ ^ " ),
120             map( { ord $_ => 'Unknown' } qw" * $ @ & : % " ),
121             ord ',' => 'PPI::Token::Operator',
122             ord "'" => 'Quote::Single',
123             ord '"' => 'Quote::Double',
124             ord '`' => 'QuoteLike::Backtick',
125             ord '\\' => 'Cast',
126             ord '_' => 'Word',
127             9 => 'Whitespace', # A horizontal tab
128             10 => 'Whitespace', # A newline
129             12 => 'Whitespace', # A form feed
130             13 => 'Whitespace', # A carriage return
131             32 => 'Whitespace', # A normal space
132             );
133              
134             # Words (functions and keywords) after which a following / is
135             # almost certainly going to be a regex
136             my %MATCHWORD = map { $_ => 1 } qw{
137             return
138             split
139             if
140             unless
141             grep
142             map
143             };
144              
145             sub __TOKENIZER__on_line_start {
146 51153     51153   62420 my $t = $_[1];
147 51153         71359 my $line = $t->{line};
148              
149             # Can we classify the entire line in one go
150 51153 100       218698 if ( $line =~ /^\s*$/ ) {
    100          
    100          
    100          
151             # A whitespace line
152 6697         15692 $t->_new_token( 'Whitespace', $line );
153 6697         10669 return 0;
154              
155             } elsif ( $line =~ /^\s*#/ ) {
156             # A comment line
157 4323         10787 $t->_new_token( 'Comment', $line );
158 4323         8874 $t->_finalize_token;
159 4323         7334 return 0;
160              
161             } elsif ( $line =~ /^=(\w+)/ ) {
162             # A Pod tag... change to pod mode
163 752         2135 $t->_new_token( 'Pod', $line );
164 752 50       2045 if ( $1 eq 'cut' ) {
165             # This is an error, but one we'll ignore
166             # Don't go into Pod mode, since =cut normally
167             # signals the end of Pod mode
168             } else {
169 752         1259 $t->{class} = 'PPI::Token::Pod';
170             }
171 752         1260 return 0;
172              
173             } elsif ( $line =~ /^use v6\-alpha\;/ ) {
174             # Indicates a Perl 6 block. Make the initial
175             # implementation just suck in the entire rest of the
176             # file.
177 2         3 my @perl6;
178 2         3 while ( 1 ) {
179 693         923 my $line6 = $t->_get_line;
180 693 100       886 last unless defined $line6;
181 691         802 push @perl6, $line6;
182             }
183 2         2 push @{ $t->{perl6} }, join '', @perl6;
  2         102  
184              
185             # We only sucked in the block, we don't actually do
186             # anything to the "use v6..." line. So return as if
187             # we didn't find anything at all.
188 2         33 return 1;
189             }
190              
191 39379         68938 1;
192             }
193              
194             sub __TOKENIZER__on_char {
195 379931     379931   430746 my $t = $_[1];
196 379931         544108 my $c = substr $t->{line}, $t->{line_cursor}, 1;
197 379931         442927 my $char = ord $c;
198              
199             # Do we definitely know what something is?
200 379931 100       808910 return $COMMITMAP{$char}->__TOKENIZER__commit($t) if $COMMITMAP{$char};
201              
202             # Handle the simple option first
203 261770 100       735146 return $CLASSMAP{$char} if $CLASSMAP{$char};
204              
205 19488 100       47669 if ( $char == 40 ) { # $char eq '('
    100          
    100          
    100          
    100          
    50          
206             # Finalise any whitespace token...
207 8502 100       21423 $t->_finalize_token if $t->{token};
208              
209             # Is this the beginning of a sub prototype?
210             # We are a sub prototype IF
211             # 1. The previous significant token is a bareword.
212             # 2. The one before that is the word 'sub'.
213             # 3. The one before that is a 'structure'
214              
215             # Get the three previous significant tokens
216 8502         15693 my @tokens = $t->_previous_significant_tokens(3);
217              
218             # A normal subroutine declaration
219 8502         11534 my $p1 = $tokens[1];
220 8502         10371 my $p2 = $tokens[2];
221 8502 50 100     67455 if (
      100        
      100        
      100        
      33        
      100        
222             $tokens[0]
223             and
224             $tokens[0]->isa('PPI::Token::Word')
225             and
226             $p1
227             and
228             $p1->isa('PPI::Token::Word')
229             and
230             $p1->content eq 'sub'
231             and (
232             not $p2
233             or
234             $p2->isa('PPI::Token::Structure')
235             or (
236             $p2->isa('PPI::Token::Whitespace')
237             and
238             $p2->content eq ''
239             )
240             or (
241             # Lexical subroutine
242             $p2->isa('PPI::Token::Word')
243             and
244             $p2->content =~ /^(?:my|our|state)$/
245             )
246             )
247             ) {
248             # This is a sub prototype
249 377         1318 return 'Prototype';
250             }
251              
252             # A prototyped anonymous subroutine
253 8125         11558 my $p0 = $tokens[0];
254 8125 100 100     34536 if ( $p0 and $p0->isa('PPI::Token::Word') and $p0->content eq 'sub'
      100        
      66        
      66        
255             # Maybe it's invoking a method named 'sub'
256             and not ( $p1 and $p1->isa('PPI::Token::Operator') and $p1->content eq '->')
257             ) {
258 62         241 return 'Prototype';
259             }
260              
261             # This is a normal open bracket
262 8063         28143 return 'Structure';
263              
264             } elsif ( $char == 60 ) { # $char eq '<'
265             # Finalise any whitespace token...
266 2048 100       5615 $t->_finalize_token if $t->{token};
267              
268             # This is either "less than" or "readline quote-like"
269             # Do some context stuff to guess which.
270 2048         4321 my $prev = $t->_last_significant_token;
271              
272             # The most common group of less-thans are used like
273             # $foo < $bar
274             # 1 < $bar
275             # $#foo < $bar
276 2048 100 100     10700 return 'Operator' if $prev and $prev->isa('PPI::Token::Symbol');
277 1716 50 66     6765 return 'Operator' if $prev and $prev->isa('PPI::Token::Magic');
278 1716 100 100     6481 return 'Operator' if $prev and $prev->isa('PPI::Token::Number');
279 1693 50 66     6353 return 'Operator' if $prev and $prev->isa('PPI::Token::ArrayIndex');
280              
281             # If it is <<... it's a here-doc instead
282 1693         3350 my $next_char = substr( $t->{line}, $t->{line_cursor} + 1, 2 );
283 1693 100       6095 return 'Operator' if $next_char =~ /<[^>]/;
284              
285 899 100       2401 return 'Operator' if not $prev;
286              
287             # The most common group of readlines are used like
288             # while ( <...> )
289             # while <>;
290 812         1857 my $prec = $prev->content;
291 812 100 100     10876 return 'QuoteLike::Readline'
      100        
      100        
      100        
      100        
      100        
      100        
      100        
      100        
292             if ( $prev->isa('PPI::Token::Structure') and $prec eq '(' )
293             or ( $prev->isa('PPI::Token::Structure') and $prec eq ';' )
294             or ( $prev->isa('PPI::Token::Word') and $prec eq 'while' )
295             or ( $prev->isa('PPI::Token::Operator') and $prec eq '=' )
296             or ( $prev->isa('PPI::Token::Operator') and $prec eq ',' );
297              
298 709 100 100     2221 if ( $prev->isa('PPI::Token::Structure') and $prec eq '}' ) {
299             # Could go either way... do a regex check
300             # $foo->{bar} < 2;
301             # grep { .. } ;
302 17         60 pos $t->{line} = $t->{line_cursor};
303 17 100       84 if ( $t->{line} =~ m/\G<(?!\d)\w+>/gc ) {
304             # Almost definitely readline
305 3         11 return 'QuoteLike::Readline';
306             }
307             }
308              
309             # Otherwise, we guess operator, which has been the default up
310             # until this more comprehensive section was created.
311 706         2039 return 'Operator';
312              
313             } elsif ( $char == 47 ) { # $char eq '/'
314             # Finalise any whitespace token...
315 1071 100       3637 $t->_finalize_token if $t->{token};
316              
317             # This is either a "divided by" or a "start regex"
318             # Do some context stuff to guess ( ack ) which.
319             # Hopefully the guess will be good enough.
320 1071         2451 my $prev = $t->_last_significant_token;
321              
322             # Or as the very first thing in a file
323 1071 100       2900 return 'Regexp::Match' if not $prev;
324              
325 972         2079 my $prec = $prev->content;
326              
327             # Most times following an operator, we are a regex.
328             # This includes cases such as:
329             # , - As an argument in a list
330             # .. - The second condition in a flip flop
331             # =~ - A bound regex
332             # !~ - Ditto
333 972 100       4099 return 'Regexp::Match' if $prev->isa('PPI::Token::Operator');
334              
335             # After a symbol
336 516 100       1588 return 'Operator' if $prev->isa('PPI::Token::Symbol');
337 447 100 66     1036 if ( $prec eq ']' and $prev->isa('PPI::Token::Structure') ) {
338 6         17 return 'Operator';
339             }
340              
341             # After another number
342 441 100       1289 return 'Operator' if $prev->isa('PPI::Token::Number');
343              
344             # After going into scope/brackets
345 408 100 100     1638 if (
      100        
346             $prev->isa('PPI::Token::Structure')
347             and (
348             $prec eq '('
349             or
350             $prec eq '{'
351             or
352             $prec eq ';'
353             )
354             ) {
355 55         183 return 'Regexp::Match';
356             }
357              
358             # Functions and keywords
359 353 100 66     996 if (
360             $MATCHWORD{$prec}
361             and
362             $prev->isa('PPI::Token::Word')
363             ) {
364 69         235 return 'Regexp::Match';
365             }
366              
367             # What about the char after the slash? There's some things
368             # that would be highly illogical to see if it's an operator.
369 284         586 my $next_char = substr $t->{line}, $t->{line_cursor} + 1, 1;
370 284 50 33     935 if ( defined $next_char and length $next_char ) {
371 284 100       938 if ( $next_char =~ /(?:\^|\[|\\)/ ) {
372 15         60 return 'Regexp::Match';
373             }
374             }
375              
376             # Otherwise... erm... assume operator?
377             # Add more tests here as potential cases come to light
378 269         811 return 'Operator';
379              
380             } elsif ( $char == 120 ) { # $char eq 'x'
381             # Could be a word, the x= operator, the x operator
382             # followed by whitespace, or the x operator without any
383             # space between itself and its operand, e.g.: '$a x3',
384             # which is the same as '$a x 3'. _current_x_is_operator
385             # assumes we have a complete 'x' token, but we don't
386             # yet. We may need to split this x character apart from
387             # what follows it.
388 859 100       1921 if ( $t->_current_x_is_operator ) {
389 209         595 pos $t->{line} = $t->{line_cursor} + 1;
390 209 100       1161 return 'Operator' if $t->{line} =~ m/\G(?:
391             \d # x op with no whitespace e.g. 'x3'
392             |
393             (?!( # negative lookahead
394             => # not on left of fat comma
395             |
396             \w # not a word like "xyzzy"
397             |
398             \s # not x op plus whitespace
399             ))
400             )/gcx;
401             }
402              
403             # Otherwise, commit like a normal bareword, including x
404             # operator followed by whitespace.
405 739         2112 return PPI::Token::Word->__TOKENIZER__commit($t);
406              
407             } elsif ( $char == 45 ) { # $char eq '-'
408             # Look for an obvious operator operand context
409 7002         14302 my $context = $t->_opcontext;
410 7002 100       13286 if ( $context eq 'operator' ) {
411 5034         13945 return 'Operator';
412             } else {
413             # More logic needed
414 1968         5640 return 'Unknown';
415             }
416              
417             } elsif ( $char >= 128 ) { # Outside ASCII
418 6 100       37 return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
419 1 50       18 return 'Whitespace' if $c =~ /\s/;
420             }
421              
422              
423             # All the whitespaces are covered, so what to do
424             ### For now, die
425 1         8 PPI::Exception->throw("Encountered unexpected character '$char'");
426             }
427              
428             sub __TOKENIZER__on_line_end {
429 39012 100   39012   133643 $_[1]->_finalize_token if $_[1]->{token};
430             }
431              
432             1;
433              
434             =pod
435              
436             =head1 SUPPORT
437              
438             See the L in the main module.
439              
440             =head1 AUTHOR
441              
442             Adam Kennedy Eadamk@cpan.orgE
443              
444             =head1 COPYRIGHT
445              
446             Copyright 2001 - 2011 Adam Kennedy.
447              
448             This program is free software; you can redistribute
449             it and/or modify it under the same terms as Perl itself.
450              
451             The full text of the license can be found in the
452             LICENSE file included with this module.
453              
454             =cut