File Coverage

blib/lib/HTML/Selector/Element.pm
Criterion Covered Total %
statement 206 299 68.9
branch 129 220 58.6
condition 44 105 41.9
subroutine 23 32 71.8
pod 3 17 17.6
total 405 673 60.1


line stmt bran cond sub pod time code
1             package HTML::Selector::Element;
2             our $VERSION = '0.95_02';
3              
4             ## Adapted from HTML::Selector::XPath
5             ## The parser is basically the same, the difference is in what it produces.
6              
7 10     10   80575 use Carp;
  10         24  
  10         687  
8 10     10   62 use strict;
  10         31  
  10         40331  
9              
10             sub import {
11 7     7   65 my($class) = shift;
12 7 50       32 if(@_) {
13 7         4890 require HTML::Element;
14             package # hide from pause
15             HTML::Element;
16 7         121141 local $^W; # no warnings 'redefined' doesn't work over there in Exporter
17 7         695 HTML::Selector::Element::Trait->import(@_);
18             }
19             }
20            
21             my $ident = qr/(?![0-9]|-[-0-9])[-_a-zA-Z0-9]+/;
22              
23             my $reg = {
24             # tag name/id/class. Caveat: no namespace
25             element => qr/^([#.]?)([^\s'"#.\/:@,=~>()\[\]|+]+)/i,
26             # attribute presence
27             attr1 => qr/^\[ \s* ($ident) \s* \]/x,
28             # attribute value match
29             attr2 => qr/^\[ \s* ($ident) \s*
30             ( [~|*^\$!]? = ) \s*
31             (?: ($ident) | "([^"]*)" | '([^']*)') \s* \] /x,
32             badattr => qr/^\[/,
33             pseudoN => qr/^:(not|has|is)\(/i, # we chop off the closing parenthesis below in the code
34             pseudo => qr/^:([()a-z0-9_+-]+)/i,
35             # adjacency/direct descendance (test for comma first)
36             combinator => qr/^\s*([>+~])\s*|^\s+/i, # doesn't capture matched whitespace
37             # rule separator
38             comma => qr/^\s*,\s*/i,
39             };
40              
41             sub new {
42 35     35 1 19641 my($class, @expr) = @_;
43 35         105 my $self = bless {}, $class;
44 35         117 $self->selector(@expr);
45 35         158 return $self;
46             }
47              
48             sub selector {
49 35     35 0 59 my $self = shift;
50 35 50       97 if (@_) {
51 35         64 delete @{$self}{qw(find is)};
  35         109  
52 35         151 $self->{selector} = join ', ', @_;
53 35         108 $self->{parsed} = \my @parsed;
54 35         85 foreach (@_) {
55 35         174 my($parsed, $leftover) = $self->consume($_);
56 35 50       114 length $leftover
57             and die "Invalid rule, couldn't parse '$leftover'";
58 35         135 push @parsed, @$parsed;
59             }
60             }
61 35         73 return $self->{selector};
62             }
63              
64             sub convert_attribute_match {
65 20     20 0 64 my ($left,$op,$right) = @_;
66             # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
67 20 100       98 if ($op eq '!=') {
    100          
    100          
    100          
    100          
    100          
68 1         11 $left, qr/^(?!\Q$right\E$)/
69             } elsif ($op eq '~=') { # substring attribute match
70 13         297 $left, qr/(?
71             } elsif ($op eq '*=') { # real substring attribute match
72 1         14 $left, qr/\Q$right\E/
73             } elsif ($op eq '|=') {
74 1         26 $left, qr/^\Q$right\E(?![^-])/
75             } elsif ($op eq '^=') {
76 1         15 $left, qr/^\Q$right\E/
77             } elsif ($op eq '$=') {
78 1         14 $left, qr/\Q$right\E$/
79             } else { # exact match
80 2         6 $left, $right
81             }
82             }
83              
84             # for our purpose, "siblings" includes the element itself
85             sub siblings {
86 66     66 0 103 my($this) = @_;
87 66   50     153 return children($this->{_parent}||return $this);
88             }
89              
90             sub nth_of_type {
91 11     11 0 29 my($of_type, $backward, $n, $cycle) = @_;
92             # nth_child = nth_of_type without type filter
93 11   100     34 $cycle ||= 0;
94 11 50 33     29 if($n <= 0 && $cycle > 0) {
95             # permanent correction
96 0   0     0 $n %= $cycle ||= $cycle; # first value above 0
97             }
98 66     66   2120 return sub { my($this) = @_;
99 66         113 my @sibling = siblings($this);
100 66 100       293 @sibling = grep $_->{_tag} eq $this->{_tag}, @sibling if $of_type;
101 66 50 33     300 for(my $n = # lexical scratch copy
      100        
      100        
102             $n > @sibling && $cycle < 0
103             ? ($n-@sibling) % $cycle + @sibling # first value below upper bound as modulo <= 0
104             : $n ; # no correction
105             $n > 0 && $n <= @sibling; # give up as soon as we get out of range
106             $n += $cycle || last) # loop only once if $cycle is zero
107             {
108 142 100       548 return 1 if $this == $sibling[$backward ? -$n : $n - 1];
    100          
109             }
110 41         98 return;
111             }
112 11         66 }
113              
114             sub only_child {
115 0     0 0 0 my($this) = @_;
116 0         0 return 1 == siblings($this);
117             }
118              
119             # A hacky recursive descent
120             # Only descends for :not(...) and :has(...)
121             sub consume {
122 35     35 0 141 my ($self, $rule) = @_;
123              
124 35         98 my @alt;
125 35         69 my $last_rule = '';
126 35         97 my $set = { static => my $static = [] };
127 35         96 my $hold; # last valid set
128             my $sibling_root; # root element of search space is sibling of start element
129 35         0 my($any); # flags
130 35         56 my $start_combinators = '';
131              
132 35         132 $rule =~ s/^\s+//;
133             # Loop through each "unit" of the rule
134 35         71 while() {
135             # Match elements
136 67         122 for($any = 0;; $any++) { # endless loop
137 156 100       1538 if ($rule =~ s/$reg->{element}//) {
    100          
    100          
    50          
    50          
    100          
138 70         229 my ($id_class,$name) = ($1,$2);
139 70 100 33     408 if ($id_class eq '#') { # ID
    100          
    50          
140 1         4 unshift @$static, id => $name;
141             # a condition very likely to fail, so try this first
142             } elsif ($id_class eq '.') { # class
143 12         48 push @$static, convert_attribute_match('class', '~=', $name);
144             } elsif (!$set->{tag} && $name ne '*') {
145             # we're not adding '*' yet as that's a very loose condition that seldom fails
146             # It's often not even necessary to test when we have other, more stringent conditions.
147 57         127 $set->{tag} = $name;
148 57         144 push @$static, _tag => $name;
149             }
150             }
151             # Match attribute selectors
152             elsif ($rule =~ s/$reg->{attr2}//) {
153 7         20 push @$static, convert_attribute_match( $1, $2, $^N );
154             } elsif ($rule =~ s/$reg->{attr1}//) {
155             # any value, as long as it's defined
156 1         5 push @$static, convert_attribute_match( $1, '', qr// );
157             } elsif ($rule =~ $reg->{badattr}) {
158 0         0 Carp::croak "Invalid attribute-value selector '$rule'";
159             }
160             # Match :not and :has
161             elsif ($rule =~ s/$reg->{pseudoN}//) {
162 0         0 my $which = lc $1;
163             # Now we parse the rest, and after parsing the subexpression
164             # has stopped, we must find a matching closing parenthesis:
165 0         0 my( $subset, $leftover ) = $self->consume( $rule );
166 0         0 $rule = $leftover;
167 0 0       0 $rule =~ s!^\s*\)!!
168             or die "Unbalanced parentheses at '$rule'";
169 0 0       0 if($which eq 'not') {
    0          
    0          
170 0         0 my @params = criteria($subset, undef);
171 0     0   0 push @$static, sub { not look_self($_[0], @params) };
  0         0  
172             } elsif($which eq 'is') {
173 0         0 my @params = criteria($subset, undef);
174 0     0   0 push @$static, sub { look_self($_[0], @params) };
  0         0  
175             } elsif($which eq 'has') {
176             # This is possibly very slow, especially when executed very often, so we keep this criterium for last
177 0         0 push @{$set->{has}}, find_closure($subset);
  0         0  
178             }
179             }
180             # other pseudoclasses/pseudoelements
181             # "else" because there could be more than one :not/:has
182             elsif ($rule =~ s/$reg->{pseudo}//) {
183 11         33 my $simple = ":$1";
184 11 50       33 if ( my @expr = $self->parse_pseudo($1, \$rule) ) {
    50          
    50          
    0          
    0          
    0          
185 0         0 push @$static, @expr;
186             } elsif ( $1 eq 'only-child') {
187 0         0 push @$static, only_child();
188             } elsif (my @m = $1 =~ /^((?:first|last)-(?:child|of-type)$) | ^(nth-(?:last-)?(?:child|of-type)) \((odd|even|(\d+)|(-?\d*)n([\+\-]\d+)?)\)$/x) {
189             # Matches all pseudoelements of the following lists:
190             # - first-child, last-child, first-of-type, last-of-type: without expression
191             # - nth-child, nth-of-type, nth-last-child, nth-last-of-type: with an expression between parens
192             # of one of these types: odd, even, and an+b
193             # with a lot of freedom for that last one, for example:
194             # 3, 3n, 3n+1, n+5, -n+5, -3n+5, 3n-1
195 11         33 my($pseudo, $nth, $expr, $n, $cycle, $offset) = @m;
196 11 100       22 if($nth) {
197 9 100       27 if(defined $cycle) {
    100          
198 4 100       17 $cycle .= '1' if $cycle =~ /^(-?)$/;
199 4   66     14 $n = $offset || $cycle;
200             }
201             elsif(!defined $n) {
202             # even / odd
203 3         7 $cycle = 2;
204 3 100       12 $n = $expr eq 'odd' ? 1 : 2;
205             }
206 9         12 $pseudo = $nth;
207             }
208             else {
209             # first / last
210 2         5 $n = 1;
211             }
212 11         29 my $of_type = $pseudo =~ /of-type/;
213 11         25 my $backward = $pseudo =~ /last/;
214 11         35 push @$static, nth_of_type($of_type, $backward, $n+0, $cycle);
215             } elsif ($1 =~ /^contains\($/) {
216             # not sure if this will work well in practise, in regards to whitespace
217 0 0       0 if( $rule =~ s/^\s*"([^"]*)"\s*\)// ) { # "#stupid syntax highlighter
    0          
218 0         0 my $fragment = $1;
219 0     0   0 push @$static, sub { $_[0]->as_text() =~ /\Q$fragment/ };
  0         0  
220             } elsif( $rule =~ s/^\s*'([^']*)'\s*\)// ) { #'#stupid syntax highlighter
221 0         0 my $fragment = $1;
222 0     0   0 push @$static, sub { $_[0]->as_text() =~ /\Q$fragment/ };
  0         0  
223             } else {
224 0         0 return( $set, $rule );
225 0         0 die "Malformed string in :contains(): '$rule'";
226             };
227             } elsif ( $1 eq 'root') {
228             # matches document root, or starting element
229 0         0 $set->{is_root} = 1;
230             } elsif ( $1 eq 'empty') {
231 0     0   0 push @$static, sub { (shift)->is_empty };
  0         0  
232             } else {
233 0         0 Carp::croak "Can't translate '$1' pseudo-class";
234             }
235             }
236             else {
237             # failed to match anything
238 67         149 last;
239             }
240 89         169 $any++;
241 89 50       216 die "Endless loop?"if $any > 20000;
242 89         158 undef $hold;
243             }
244              
245             # Match commas
246 67 50       334 if ($rule =~ s/$reg->{comma}//o) {
    100          
247             # ending one rule and beginning another
248 0   0     0 $set->{tag} ||= do { push @$static, _tag => qr/^(?!~)/; '*' };
  0         0  
  0         0  
249 0 0 0     0 $set->{sibling_root} ||= $sibling_root if $sibling_root;
250 0         0 push @alt, $set;
251 0         0 $set = { static => $static = [] };
252 0         0 ($any, $hold, $sibling_root) = ();
253             }
254             # Match combinators (whitespace, >, + and ~)
255             elsif ($rule =~ s/$reg->{combinator}//) {
256 32   100     147 my $combinator = $1 || ' ';
257 32 100       92 unless($any) {
258 4 50       12 unless($set->{chained}) {
259             # rule starts with a combinator
260             # add match for start element
261 4         13 $set->{is_root} = 1; # root element / start element
262             } else {
263             # 2 subsequent combinators: interject a '*'
264 0         0 push @$static, _tag => qr/^(?!~)/;
265 0         0 $set->{tag} = '*';
266             }
267             }
268             # new context
269 32         63 ($any, $hold) = ();
270 32 100       90 $hold = $set unless $1;
271 32         123 $set = { static => $static = [], chained => my $chained = $set, combinator => $combinator };
272 32 100 66     189 if($chained->{is_root} || $chained->{sibling_root}) {
273 4 100       16 if($combinator =~ /([+~])/) {
274 3   50     21 $set->{sibling_root} = ($chained->{sibling_root} || '') . $1;
275 3         8 $sibling_root = $set;
276             }
277             }
278             }
279             else {
280 35         60 last;
281             }
282             }
283             # wrap up
284             # rule ended in whitespace - This can only happen in nested rules such as :not( ... )
285 35 50       84 $set = $hold if $hold;
286 35   66     105 $set->{tag} ||= do { push @$static, _tag => qr/^(?!~)/; '*' };
  3         12  
  3         12  
287 35 100 33     113 $set->{sibling_root} ||= $sibling_root if $sibling_root;
288              
289 35         79 push @alt, $set;
290 35         122 return \@alt, $rule;
291             }
292              
293             sub criteria {
294             # returns criteria for look_down, with bound closures
295 33     33 1 80 my($set, $refroot, $strategy) = @_;
296 33   50     189 $strategy ||= { banroot => 1 };
297 33 100       109 $strategy->{banroot} = 0 if not $refroot;
298 33         51 my $recurse;
299             $recurse = sub {
300             # embeds $refroot
301 57     57   124 my($set, $banroot) = @_;
302 57 50       98 my @params = @{$set->{static}||[]};
  57         231  
303              
304 57 100       159 if($set->{is_root}) {
305 3         5 $banroot = 0;
306 3 50       9 if($refroot) {
307             # relative, top of branch
308 3         11 unshift @params, sub { $_[0] == $$refroot }; # unlikely to succeed, so fail fast
  3         13  
309             }
310             else {
311             # absolute, root of DOM
312 0         0 unshift @params, _parent => undef; # unlikely to succeed, so fail fast
313             }
314             }
315              
316 57 100 66     218 if($set->{chained}) {
    100          
317 27         46 push @params, do {
318             # Value is an anonymous sub
319             # Recurse into linked list
320 27         208 my @params = $recurse->($set->{chained}, $banroot);
321 27         60 my $combinator = $set->{combinator};
322             # we're in a chained set, so we have to wrap the criteria into a test in a closure.
323 27 100       143 if ($combinator =~ /^\s+$/) {
    100          
    50          
    0          
324 62         1875 $banroot ? sub { my($this) = @_;
325 62   100     181 my $rootparent = $$refroot->{_parent} || 0;
326 62   66     218 while($this = $this->{_parent} and $this != $rootparent) {
327 107 100       228 look_self($this, @params)
328             and return 1;
329             }
330 5         11 return 0;
331             }
332 0         0 : sub { my($this) = @_;
333 0         0 while($this = $this->{_parent}) {
334 0 0       0 look_self($this, @params)
335             and return 1;
336             }
337 0         0 return 0;
338             }
339 14 50       75 }
340             elsif ($combinator =~ />/) {
341 3         58 $banroot ? sub { my($this) = @_;
342 3   100     12 my $rootparent = $$refroot->{_parent} || 0;
343 3 50 33     18 if($this = $this->{_parent} and $this != $rootparent) {
344 3 50       19 look_self($this, @params)
345             and return 1;
346             }
347 0         0 return 0;
348             }
349 3         9 : sub { my($this) = @_;
350 3 50       19 if($this = $this->{_parent}) {
351 3 50       16 look_self($this, @params)
352             and return 1;
353             }
354 0         0 return 0;
355 6 100       36 };
356             }
357             elsif ($combinator =~ /\+/) {
358 11         542 sub { my($this) = @_;
359 11         46 my @left = $this->left;
360 11         206 while(@left) {
361 7         16 $this = pop @left;
362 7 50 33     62 ref $this && $this->{_tag} && $this->{_tag} !~ /^~/ or next;
      33        
363 7 50       30 look_self($this, @params)
364             and return 1;
365 0         0 return 0;
366             }
367 4         11 return 0;
368 7         46 };
369             }
370             elsif ($combinator =~ /\~/) {
371 0         0 sub { my($this) = @_;
372 0         0 my @left = $this->left;
373 0         0 while(@left) {
374 0         0 $this = pop @left;
375 0 0 0     0 ref $this && $this->{_tag} !~ /^~/ or next;
376 0 0       0 look_self($this, @params)
377             and return 1;
378             }
379 0         0 return 0;
380             }
381 0         0 }
382             else {
383 0         0 die "Weird combinator '$combinator'"
384             }
385             };
386             }
387             elsif($banroot && !$set->{is_root}) {
388             # if :root was not specified, $root should never match
389 24         107 push @params, sub { $_[0] != $$refroot }; # likely to succeed, so fail late
  142         4320  
390             }
391              
392             # do the :has tests last, because it's a complete subtree scan and that may be very slow.
393 57 50       143 push @params, @{$set->{has}} if $set->{has};
  0         0  
394 57 50       298 return wantarray ? @params : \@params;
395 33         205 };
396              
397              
398 33 100       108 if(ref $set eq 'ARRAY') {
399 30 50       84 if(@$set > 1) {
400 0         0 my %tags;
401 0   0     0 my @alt = map { $tags{$_->{tag}||'*'} = 1; [ $recurse->($_, !$refroot) ] } @$set;
  0         0  
  0         0  
402 0   0 0   0 my @params = sub { my($this) = @_; look_self($this, @$_) and return 1 foreach @alt; return 0 };;
  0         0  
  0         0  
  0         0  
403 0 0   0   0 unshift @params, sub { $tags{(shift)->{_tag}} } unless $tags{'*'};
  0         0  
404 0 0       0 return wantarray ? @params : \@params;
405             }
406 30         61 ($set) = @$set; # non-destructive
407             }
408 33 100       105 if(ref $set eq 'HASH') {
    50          
409 30         116 return $recurse->($set, $refroot);
410             }
411             elsif(ref $set) {
412             # assumed method call
413 3         22 return criteria($set->{parsed}, $refroot);
414             }
415             }
416              
417       11 0   sub parse_pseudo {
418             # nop, for subclassing
419             }
420              
421             sub find_closure {
422 26     26 0 48 my $sets = shift;
423 26 50       71 $sets = $sets->{parsed} if ref $sets ne 'ARRAY';
424 26         38 my $root; # The embedded variable
425 26         46 my(@down, @via_right, $right_down, @right_filter);
426 26         58 foreach my $set (@$sets) {
427 26 50       68 unless($set->{sibling_root}) {
    100          
428 25         56 push @down, $set;
429             }
430 0         0 elsif(ref $set->{sibling_root}) {
431 0         0 push @via_right, $set;
432 0         0 push @right_filter, $set->{sibling_root};
433 0         0 $right_down = 1;
434             }
435             else {
436 1         2 push @via_right, $set;
437 1         3 push @right_filter, $set;
438             }
439             }
440 26         66 foreach my $array(\@down, \@via_right, \@right_filter) {
441 78 100       193 @$array = criteria($array, \$root) if @$array;
442             }
443 26 100       67 unless(@via_right) {
444             # the most common case: down only
445             return sub {
446 25     25   52 $root = shift; return $root->look_down(@down)
  25         114  
447 25         163 };
448             }
449             else {
450             return sub {
451 2     2   4 $root = shift;
452 2         6 my($result, @result);
453 2 50       6 if(@down) {
454             # unlikely, but possible
455 0 0       0 unless(wantarray) {
456 0 0       0 $result = $root->look_down(@down) and return $result;
457             }
458             else {
459 0         0 @result = $root->look_down(@down);
460             }
461             }
462 2 50       9 if(my @right = grep { ref and look_self($_, @right_filter) } $root->right) {
  2 50       57  
463 2 50       21 unless($right_down) {
464 2 50       23 return wantarray ? (@result, @right) : shift @right;
465             }
466 0 0       0 unless(wantarray) {
467 0   0     0 $result = $_->look_down(@via_right) and return $result foreach @right;
468             }
469             else {
470 0         0 push @result, $_->look_down(@via_right) foreach @right;
471             }
472             }
473 0         0 return @result;
474 1         10 };
475             }
476             }
477              
478             # flipped
479             sub find {
480 27     27 1 69 my($self, $element) = @_;
481 27   66     104 return ($self->{find} ||= find_closure($self->{parsed}))->($element)
482             }
483              
484             package HTML::Selector::Element::Trait;
485             # core methods for trait that adds or overrides Selector support in HTML::Element
486             # use as a superclass in a subclass of HTML::Element, putting it before HTML::Element in @ISA
487             # or monkeypatch HTML::Element: import it into the HTML::Element package
488              
489             require Carp;
490              
491 10     10   106 use Exporter 'import';
  10         30  
  10         7027  
492             our @EXPORT = qw(&find is closest);
493             our @EXPORT_OK = qw(look_self siblings children &select &query);
494              
495             sub children { # child elements, no fake elements
496 66     66 0 108 my($this) = @_;
497 66 50 33     92 return grep { ref and $_->{_tag} and $_->{_tag} !~ /^~/ } @{$this->{_content}||return};
  462 50       1925  
  66         140  
498             }
499              
500             sub look_self {
501 123     123 0 182 my $this = shift;
502 123         189 my($attr, $value, $matcher);
503 123         249 while(@_) {
504             # For speed reasons, no nested scopes and no block scope lexical variables
505             ref ($attr = shift) or
506 195 0 50     1141 2 != (defined($matcher = shift) + defined($value = $this->{$attr}) || next) ? return
    100 0        
    50 66        
    100          
    50          
    100          
    100          
507             : ref $value ? # identical class and stringification or fail
508             ref $matcher eq ref $value && $matcher eq $value ? next : return
509             : ref $matcher
510             ? ref $matcher eq 'Regexp' && $value =~ $matcher ? next : return
511             : $value eq $matcher ? next : return;
512 70 50       190 ref $attr eq 'CODE' and $attr->($this) ? next : return;
    50          
513             # standard processing ends here
514 0 0       0 if(ref $attr eq 'ARRAY') {
515 0         0 my $success;
516 0         0 foreach my $rule (@$attr) {
517 0 0       0 next if ref $rule ne 'ARRAY';
518 0 0       0 $success = look_self($this, @$rule) and last;
519             }
520 0 0       0 $success and next;
521             }
522             # unknown doesn't match
523 0         0 return;
524             }
525 73         276 return $this; # matches
526             }
527              
528             my %store;
529             sub find {
530             # backward compatible with find_by_tag_name in HTMl::Element if you stick to normal tags
531             # If you do need special tags (= starting with "~"), find_by_tag_name is still available, and faster than look_down anyway
532 23     23 0 55478 my($element) = shift;
533 23   33     174 my $selector = $store{join ', ', @_} ||= HTML::Selector::Element->new(@_);
534 23         68 return $selector->find($element);
535             }
536              
537             sub is {
538 1     1 0 14176 my($element) = shift;
539 1 50       7 @_ or return;
540 1   33     16 my $selector = $store{join ', ', @_} ||= HTML::Selector::Element->new(@_);
541 1   50     10 $selector->{is} ||= [$selector->criteria];
542 1         2 return look_self($element, @{$selector->{is}});
  1         6  
543             }
544              
545             sub closest {
546 0     0 0 0 my($element) = shift;
547 0   0     0 my $selector = $store{join ', ', @_} ||= HTML::Selector::Element->new(@_);
548 0   0     0 $selector->{is} ||= [$selector->criteria];
549 0         0 return $element->look_up(@{$selector->{is}});
  0         0  
550             }
551              
552             sub select {
553             # same as find except the criteria are absolute in the DOM, instead of relative to the start element
554             # only searches down, never below siblings
555 2     2 0 26913 my($element) = shift;
556 2   33     25 my $selector = $store{join ', ', @_} ||= HTML::Selector::Element->new(@_);
557 2   50     34 $selector->{is} ||= [$selector->criteria];
558 2         5 return $element->look_down(@{$selector->{is}});
  2         28  
559             }
560              
561             # alias
562             *query = \&select;
563              
564             package HTML::Selector::Element;
565             # round up: import subs from Trait
566              
567             HTML::Selector::Element::Trait->import(qw(look_self children));
568              
569             1;