File Coverage

lib/HTML/Query.pm
Criterion Covered Total %
statement 225 237 94.9
branch 106 126 84.1
condition 31 39 79.4
subroutine 23 24 95.8
pod 9 11 81.8
total 394 437 90.1


line stmt bran cond sub pod time code
1             package HTML::Query;
2              
3             our $VERSION = '0.09';
4              
5             use Badger::Class
6 11         274 version => $VERSION,
7             debug => 0,
8             base => 'Badger::Base',
9             utils => 'blessed',
10             import => 'class CLASS',
11             vars => 'AUTOLOAD',
12             constants => 'ARRAY',
13             constant => {
14             ELEMENT => 'HTML::Element',
15             BUILDER => 'HTML::TreeBuilder',
16             },
17             exports => {
18             any => 'Query',
19             hooks => {
20             query => \&_export_query_to_element,
21             },
22             },
23             messages => {
24             no_elements => 'No elements specified to query',
25             no_query => 'No query specified',
26             no_source => 'No argument specified for source: %s',
27             bad_element => 'Invalid element specified: %s',
28             bad_source => 'Invalid source specified: %s',
29             bad_query => 'Invalid query specified: %s',
30             bad_spec => 'Invalid specification "%s" in query: %s',
31             is_empty => 'The query does not contain any elements',
32 11     11   1754546 };
  11         22275  
33              
34             our $SOURCES = {
35             text => sub {
36             class(BUILDER)->load;
37             BUILDER->new_from_content(shift);
38             },
39             file => sub {
40             class(BUILDER)->load;
41             BUILDER->new_from_file(shift);
42             },
43             tree => sub {
44             $_[0]
45             },
46             query => sub {
47             ref $_[0] eq ARRAY
48             ? @{ $_[0] }
49             : $_[0];
50             },
51             };
52              
53             sub Query (@) {
54 48     48 1 94857 CLASS->new(@_);
55             }
56              
57             sub new {
58 169     169 1 14112 my $class = shift;
59 169         212 my ($element, @elements, $type, $code, $select);
60              
61             # expand a single list ref into items
62 169 100 100     645 unshift @_, @{ shift @_ }
  2         5  
63             if @_ == 1 && ref $_[0] eq ARRAY;
64              
65 169   66     540 $class = ref $class || $class;
66              
67 169         848 my $self = {
68             error => undef,
69             suppress_errors => undef,
70             match_self => undef,
71             elements => \@elements,
72             specificity => {}
73             };
74              
75             # each element should be an HTML::Element object, although we might
76             # want to subclass this module to recognise a different kind of object,
77             # so we get the element class from the ELEMENT constant method which a
78             # subclass can re-define.
79 169         528 my $element_class = $class->ELEMENT;
80              
81 169         587 while (@_) {
82 293         386 $element = shift;
83 293         304 $class->debug("argument: ".$element) if DEBUG;
84              
85 293 100       1223 if (! ref $element) {
    50          
86             # a non-reference item is a source type (text, file, tree)
87             # followed by the source, or if it's the last argument following
88             # one or more element options or named argument pairs then it's
89             # a selection query
90 19 100       55 if (@_) {
    100          
91 12         23 $type = $element;
92 12   50     55 $code = $SOURCES->{ $type }
93             || return $class->error_msg( bad_source => $type );
94 12         20 $element = shift;
95 12         12 $class->debug("source $type: $element") if DEBUG;
96 12         48 unshift(@_, $code->($element));
97 12         53024 next;
98             }
99             elsif (@elements) {
100 5         10 $select = $element;
101 5         10 last;
102             }
103             }
104             elsif (blessed $element) {
105             # otherwise it should be an HTML::Element object or another
106             # HTML::Query object
107 274 100       962 if ($element->isa($element_class)) {
    50          
108 272         408 push(@elements, $element);
109 272         709 next;
110             }
111             elsif ($element->isa($class)) {
112 2         3 push(@elements, @{$element->get_elements});
  2         7  
113 2         6 next;
114             }
115             }
116              
117 2         18 return $class->error_msg( bad_element => $element );
118             }
119              
120 167         363 bless $self, $class;
121              
122 167 100       532 return defined $select ? $self->query($select) : $self;
123             }
124              
125             sub query {
126 131     131 1 10542 my ($self, $query) = @_;
127 131         206 my @result;
128 131         158 my $ops = 0;
129 131         145 my $pos = 0;
130              
131 131         545 $self->{error} = undef;
132              
133 131 100 100     628 return $self->error_msg('no_query')
134             unless defined $query && length $query;
135              
136             # multiple specs can be comma separated, e.g. "table tr td, li a, div.foo"
137 129         339 COMMA: while (1) {
138             # each comma-separated traversal spec is applied downward from
139             # the source elements in the $self->{elements} query
140 136         153 my @elements = @{$self->get_elements};
  136         283  
141 136         179 my $comops = 0;
142              
143 136         141 my $specificity = 0;
144 136   100     1227 my $startpos = pos($query) || 0;
145              
146 136         336 my $hack_sequence = 0; # look for '* html'
147              
148 136         135 warn "Starting new COMMA" if DEBUG;
149              
150             # for each whitespace delimited descendant spec we grok the correct
151             # parameters for look_down() and apply them to each source element
152             # e.g. "table tr td"
153 136         131 SEQUENCE: while (1) {
154 380         364 my @args;
155 380   100     1309 $pos = pos($query) || 0;
156 380         447 my $relationship = '';
157 380         348 my $leading_whitespace;
158              
159 380         342 warn "Starting new SEQUENCE" if DEBUG;
160              
161             # ignore any leading whitespace
162 380 100       1297 if ($query =~ / \G (\s+) /cgsx) {
163 104 50       246 $leading_whitespace = defined($1) ? 1 : 0;
164 104         105 warn "removing leading whitespace\n" if DEBUG;
165             }
166              
167             # grandchild selector is whitespace sensitive, requires leading whitespace
168 380 100 100     1608 if ($leading_whitespace && $comops && ($query =~ / \G (\*) \s+ /cgx)) {
      100        
169             # can't have a relationship modifier as the first part of the query
170 6         12 $relationship = $1;
171 6         6 warn "relationship = $relationship\n" if DEBUG;
172             }
173              
174             # get other relationship modifiers
175 380 100       1178 if ($query =~ / \G (>|\+) \s* /cgx) {
176             # can't have a relationship modifier as the first part of the query
177 27         45 $relationship = $1;
178 27         28 warn "relationship = $relationship\n" if DEBUG;
179 27 50       71 if (!$comops) {
180 0         0 return $self->_report_error( $self->message( bad_spec => $relationship, $query ) );
181             }
182             }
183              
184             # optional leading word is a tag name
185 380 100       1176 if ($query =~ / \G ([\w\*]+) /cgx) {
186 228         598 my $tag = $1;
187              
188 228 100       468 if ($tag =~ m/\*/) {
189 18 100 66     110 if (($leading_whitespace || $comops == 0) && ($tag eq '*')) {
      66        
190 14         21 warn "universal tag\n" if DEBUG;
191 14         60 push(@args, _tag => qr/\w+/);
192              
193 14 100       40 if ($comops == 0) { #we need to catch the case where we see '* html'
194 7         13 $hack_sequence++;
195             }
196             }
197             else {
198 4         16 return $self->_report_error( $self->message( bad_spec => $tag, $query ) );
199             }
200             }
201             else {
202 210         368 warn "html tag\n" if DEBUG;
203 210         272 $specificity += 1; # standard tags are worth 1 point
204 210         363 push( @args, _tag => $tag );
205              
206 210 100 100     1208 if ($comops == 1 && $tag eq 'html') {
207 1         3 $hack_sequence++;
208             }
209             }
210             }
211              
212             # loop to collect a description about this specific part of the rule
213 376         603 while (1) {
214 528         677 my $work = scalar @args;
215              
216             # that can be followed by (or the query can start with) a #id
217 528 100       1350 if ($query =~ / \G \# ([\w\-]+) /cgx) {
218 39         48 $specificity += 100;
219 39         96 push( @args, id => $1 );
220             }
221              
222             # and/or a .class
223 528 100       1239 if ($query =~ / \G \. ([\w\-]+) /cgx) {
224 73         102 $specificity += 10;
225 73         1766 push( @args, class => qr/ (^|\s+) $1 ($|\s+) /x );
226             }
227              
228             # and/or none or more [ ] attribute specs
229 528 100       1776 if ($query =~ / \G \[ (.*?) \] /cgx) {
230 47         124 my $attribute = $1;
231 47         61 $specificity += 10;
232              
233             #if we have an operator
234 47 100       395 if ($attribute =~ m/(.*?)\s*([\|\~]?=)\s*(.*)/) {
235 38         522 my ($name,$attribute_op,$value) = ($1,$2,$3);
236              
237 38 50 33     180 unless (defined($name) && length($name)) {
238 0         0 return $self->_report_error( $self->message( bad_spec => $name, $query ) );
239             }
240              
241 38         39 warn "operator $attribute_op" if DEBUG;
242              
243 38 50       75 if (defined $value) {
244 38         67 for ($value) {
245 38         108 s/^['"]//;
246 38         126 s/['"]$//;
247             }
248 38 100       102 if ($attribute_op eq '=') {
    100          
    50          
249 33         92 push( @args, $name => $value);
250             }
251             elsif ($attribute_op eq '|=') {
252 2         51 push(@args, $name => qr/\b${value}-?/)
253             }
254             elsif ($attribute_op eq '~=') {
255 3         41 push(@args, $name => qr/\b${value}\b/)
256             }
257             else {
258 0         0 return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
259             }
260             }
261             else {
262 0         0 return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
263             }
264             }
265             else {
266 9 100 66     55 unless (defined($attribute) && length($attribute)) {
267 2         35 return $self->_report_error( $self->message( bad_spec => $attribute, $query ) );
268             }
269              
270             # add a regex to match anything (or nothing)
271 7         39 push( @args, $attribute => qr/.*/ );
272             }
273             }
274             # and/or one or more pseudo-classes
275 526 100       1185 if ($query =~ / \G : :? ([\w\-]+) /cgx) {
276 7         12 my $pseudoclass = $1;
277 7         9 $specificity += 10;
278              
279 7 100       18 if ($pseudoclass eq 'first-child') {
    50          
280 4     21   36 push( @args, sub { ! grep { ref $_ } $_[0]->left() } );
  21         592  
  21         243  
281             } elsif ($pseudoclass eq 'last-child') {
282 3     15   12 push( @args, sub { ! grep { ref $_ } $_[0]->right() } );
  15         493  
  15         207  
283             } else {
284 0         0 warn "Pseudoclass :$pseudoclass not supported";
285 0         0 next;
286             }
287             }
288              
289             # keep going until this particular expression is fully processed
290 526 100       1555 last unless scalar(@args) > $work;
291             }
292              
293             # we must have something in @args by now or we didn't find any
294             # valid query specification this time around
295 374 100       819 last SEQUENCE unless @args;
296              
297 244         415 $self->debug(
298             'Parsed ', substr($query, $pos, pos($query) - $pos),
299             ' into args [', join(', ', @args), ']'
300             ) if DEBUG;
301              
302             # we want to skip certain hack sequences like '* html'
303 244 100       806 if ($hack_sequence == 2) {
    100          
    100          
    100          
    50          
304 1         4 @elements = []; # clear out our stored elements to match behaviour of modern browsers
305             }
306             # we're just looking for any descendent
307             elsif( !$relationship ) {
308 210 100       620 if ($self->{match_self}) {
309             # if we are re-querying, be sure to match ourselves not just descendents
310 2         5 @elements = map { $_->look_down(@args) } @elements;
  4         72  
311             } else {
312             # look_down() will match self in addition to descendents,
313             # so we explicitly disallow matches on self as we iterate
314             # thru the list. The other cases below already exclude self.
315             # https://rt.cpan.org/Public/Bug/Display.html?id=58918
316 208         208 my @accumulator;
317 208         323 foreach my $e (@elements) {
318 253 100       3762 if ($e->root() == $e) {
319 110         2337 push(@accumulator, $e->look_down(@args));
320             }
321             else {
322 143         2299 push(@accumulator, grep { $_ != $e } $e->look_down(@args));
  242         19796  
323             }
324             }
325 208         44360 @elements = @accumulator;
326             }
327             }
328             # immediate child selector
329             elsif( $relationship eq '>' ) {
330             @elements = map {
331 16         27 $_->look_down(
332             @args,
333             sub {
334 61     61   2377 my $tag = shift;
335 61         68 my $root = $_;
336              
337 61         152 return $tag->depth == $root->depth + 1;
338             }
339             )
340 36         1293 } @elements;
341             }
342             # immediate sibling selector
343             elsif( $relationship eq '+' ) {
344             @elements = map {
345 11         20 $_->parent->look_down(
346             @args,
347             sub {
348 253     253   6773 my $tag = shift;
349 253         281 my $root = $_;
350 253         617 my @prev_sibling = $tag->left;
351             # get prev next non-text sibling
352 253         12967 foreach my $sibling (reverse @prev_sibling) {
353 280 100       967 next unless ref $sibling;
354 69         225 return $sibling == $root;
355             }
356             }
357             )
358 61         1153 } @elements;
359             }
360             # grandchild selector
361             elsif( $relationship eq '*' ) {
362             @elements = map {
363 6         9 $_->look_down(
364             @args,
365             sub {
366 40     40   1688 my $tag = shift;
367 40         46 my $root = $_;
368              
369 40         89 return $tag->depth > $root->depth + 1;
370             }
371             )
372 9         259 } @elements;
373             }
374              
375             # so we can check we've done something
376 244         1389 $comops++;
377              
378             # dedup the results we've gotten
379 244         576 @elements = $self->_dedup(\@elements);
380              
381 244         598 map { warn $_->as_HTML } @elements if DEBUG;
382             }
383              
384 130 100       239 if ($comops) {
385 129         133 $self->debug(
386             'Added', scalar(@elements), ' elements to results'
387             ) if DEBUG;
388              
389 129         310 my $selector = substr ($query,$startpos, $pos - $startpos);
390 129         5827 $self->_add_specificity($selector,$specificity);
391              
392             #add in the recent pass
393 129         185 push(@result,@elements);
394              
395             # dedup the results across the result sets, necessary for comma based selectors
396 129         291 @result = $self->_dedup(\@result);
397              
398             # sort the result set...
399 129         504 @result = sort _by_address @result;
400              
401             # update op counter for complete query to include ops performed
402             # in this fragment
403 129         195 $ops += $comops;
404             }
405             else {
406             # looks like we got an empty comma section, e.g. : ",x, ,y,"
407             # so we'll ignore it
408             }
409              
410 130 100       443 last COMMA unless $query =~ / \G \s*,\s* /cgsx;
411             }
412              
413             # check for any trailing text in the query that we couldn't parse
414 123 50       274 if ($query =~ / \G (.+?) \s* $ /cgsx) {
415 0         0 return $self->_report_error( $self->message( bad_spec => $1, $query ) );
416             }
417              
418             # check that we performed at least one query operation
419 123 100       249 unless ($ops) {
420 1         9 return $self->_report_error( $self->message( bad_query => $query ) );
421             }
422              
423 122 100       400 return wantarray ? @result : $self->_new_match_self(@result);
424             }
425              
426             # return elements stored from last query
427             sub get_elements {
428 334     334 1 526 my $self = shift;
429              
430 334 50       1385 return wantarray ? @{$self->{elements}} : $self->{elements};
  0         0  
431             }
432              
433             ###########################################################################################################
434             # from CSS spec at http://www.w3.org/TR/CSS21/cascade.html#specificity
435             ###########################################################################################################
436             # A selector's specificity is calculated as follows:
437             #
438             # * count the number of ID attributes in the selector (= a)
439             # * count the number of other attributes and pseudo-classes in the selector (= b)
440             # * count the number of element names in the selector (= c)
441             # * ignore pseudo-elements.
442             #
443             # Concatenating the three numbers a-b-c (in a number system with a large base) gives the specificity.
444             #
445             # Example(s):
446             #
447             # Some examples:
448             #
449             # * {} /* a=0 b=0 c=0 -> specificity = 0 */
450             # LI {} /* a=0 b=0 c=1 -> specificity = 1 */
451             # UL LI {} /* a=0 b=0 c=2 -> specificity = 2 */
452             # UL OL+LI {} /* a=0 b=0 c=3 -> specificity = 3 */
453             # H1 + *[REL=up]{} /* a=0 b=1 c=1 -> specificity = 11 */
454             # UL OL LI.red {} /* a=0 b=1 c=3 -> specificity = 13 */
455             # LI.red.level {} /* a=0 b=2 c=1 -> specificity = 21 */
456             # #x34y {} /* a=1 b=0 c=0 -> specificity = 100 */
457             ###########################################################################################################
458              
459             # calculate and return the specificity for the provided selector
460             sub get_specificity {
461 24     24 1 29 my ($self,$selector) = @_;
462              
463 24 50       64 unless (exists $self->{specificity}->{$selector}) {
464              
465             # if the invoking tree happened to be large this could get expensive real fast
466             # instead load up an empty instance and query that.
467 24         48 local $self->{elements} = [];
468 24         43 $self->query($selector);
469             }
470              
471 24         57 return $self->{specificity}->{$selector};
472             }
473              
474             sub suppress_errors {
475 11     11 0 247 my ($self, $setting) = @_;
476              
477 11 100       29 if (defined($setting)) {
478 2         5 $self->{suppress_errors} = $setting;
479             }
480              
481 11         34 return $self->{suppress_errors};
482             }
483              
484             sub get_error {
485 6     6 0 550 my ($self) = @_;
486              
487 6         24 return $self->{error};
488             }
489              
490             sub list {
491             # return list of items or return unblessed list ref of items
492 0 0   0 1 0 return wantarray ? @{ $_[0] } : [ @{ $_[0] } ];
  0         0  
  0         0  
493             }
494              
495             sub size {
496 100     100 1 11354 my $self = shift;
497 100         131 return scalar @{$self->get_elements};
  100         202  
498             }
499              
500             sub first {
501 6     6 1 8 my $self = shift;
502              
503 6 50       6 return @{$self->get_elements} ? $self->get_elements->[0] : $self->error_msg('is_empty');
  6         14  
504             }
505              
506             sub last {
507 4     4 1 685 my $self = shift;
508              
509 4 50       6 return @{$self->get_elements} ? $self->get_elements->[-1] : $self->error_msg('is_empty');
  4         7  
510             }
511              
512             ####################################################################
513             #
514             # Everything below here is a private method subject to change
515             #
516             ####################################################################
517              
518             sub _add_specificity {
519 129     129   203 my ($self, $selector, $specificity) = @_;
520              
521 129         347 $self->{specificity}->{$selector} = $specificity;
522              
523 129         204 return();
524             }
525              
526             sub _report_error {
527 7     7   976 my ($self, $message) = @_;
528              
529 7 100       17 if ($self->suppress_errors()) {
530 6 50       15 if (defined($message)) {
531 6         10 $self->{error} = $message;
532             }
533 6         28 return undef;
534             }
535             else {
536 1         4 $self->error($message); # this will DIE
537             }
538             }
539              
540             # this Just Works[tm] because first arg is HTML::Element object
541             sub _export_query_to_element {
542 2     2   475 class(ELEMENT)->load->method(
543             query => \&Query,
544             );
545             }
546              
547             # remove duplicate elements in the case where elements are nested between multiple matching elements
548             sub _dedup {
549 373     373   498 my ($self,$elements) = @_;
550              
551 373         609 my %seen = ();
552 373         440 my @unique = ();
553              
554 373         340 foreach my $item (@{$elements}) {
  373         638  
555 925 100       2672 if (!exists($seen{$item})) {
556 830         3181165 push(@unique, $item);
557             }
558              
559 925         2280 $seen{$item}++;
560             }
561              
562 373         1389 return @unique;
563             }
564              
565             # utility method to assist in sorting of query return sets
566             sub _by_address
567             {
568 241     241   332 my $self = shift;
569              
570 241         695 my @a = split /\./, $a->address();
571 241         19158 my @b = split /\./, $b->address();
572              
573 241 100       18305 my $max = (scalar @a > scalar @b) ? scalar @a : scalar @b;
574              
575 241         795 for (my $index=0; $index<$max; $index++) {
576              
577 1098 50 66     4360 if (!defined($a[$index]) && !defined($b[$index])) {
    100          
    100          
578 0         0 return 0;
579             }
580             elsif (!defined($a[$index])) {
581 49         147 return -1;
582             }
583             elsif(!defined($b[$index])) {
584 26         76 return 1;
585             }
586              
587 1023 100       2051 if ($a[$index] == $b[$index]) {
588 863         1986 next; #move to the next
589             }
590             else {
591 160         563 return $a[$index] <=> $b[$index];
592             }
593             }
594             }
595              
596             # instantiate an instance with match_self turned on, for use with
597             # follow-up queries, so they match the top-most elements.
598             sub _new_match_self {
599 118     118   151 my $self = shift;
600              
601 118         287 my $result = $self->new(@_);
602              
603 118         197 $result->{match_self} = 1;
604 118         640 return $result;
605             }
606              
607             sub AUTOLOAD {
608 76     76   297 my $self = shift;
609 76         587 my ($method) = ($AUTOLOAD =~ /([^:]+)$/ );
610 76 50       215 return if $method eq 'DESTROY';
611              
612             # we allow Perl to catch any unknown methods that the user might
613             # try to call against the HTML::Element objects in the query
614 162         5528 my @results =
615 76         153 map { $_->$method(@_) }
616 76         92 @{$self->get_elements};
617              
618 76 50       13485 return wantarray ? @results : \@results;
619             }
620              
621             1;
622              
623             =head1 NAME
624              
625             HTML::Query - jQuery-like selection queries for HTML::Element
626              
627             =head1 SYNOPSIS
628              
629             Creating an C<HTML::Query> object using the L<Query()> constructor
630             subroutine:
631              
632             use HTML::Query 'Query';
633              
634             # using named parameters
635             $q = Query( text => $text ); # HTML text
636             $q = Query( file => $file ); # HTML file
637             $q = Query( tree => $tree ); # HTML::Element object
638             $q = Query( query => $query ); # HTML::Query object
639             $q = Query(
640             text => $text1, # or any combination
641             text => $text2, # of the above
642             file => $file1,
643             file => $file2,
644             tree => $tree,
645             query => $query,
646             );
647              
648             # passing elements as positional arguments
649             $q = Query( $tree ); # HTML::Element object(s)
650             $q = Query( $tree1, $tree2, $tree3, ... );
651              
652             # or from one or more existing queries
653             $q = Query( $query1 ); # HTML::Query object(s)
654             $q = Query( $query1, $query2, $query3, ... );
655              
656             # or a mixture
657             $q = Query( $tree1, $query1, $tree2, $query2 );
658              
659             # the final argument (in all cases) can be a selector
660             my $spec = 'ul.menu li a'; # <ul class="menu">..<li>..<a>
661              
662             $q = Query( $tree, $spec );
663             $q = Query( $query, $spec );
664             $q = Query( $tree1, $tree2, $query1, $query2, $spec );
665             $q = Query( text => $text, $spec );
666             $q = Query( file => $file, $spec );
667             $q = Query( tree => $tree, $spec );
668             $q = Query( query => $query, $spec );
669             $q = Query(
670             text => $text,
671             file => $file,
672             # ...etc...
673             $spec
674             );
675              
676             Or using the OO L<new()> constructor method (which the L<Query()>
677             subroutine maps onto):
678              
679             use HTML::Query;
680              
681             $q = HTML::Query->new(
682             # accepts the same arguments as Query()
683             )
684              
685             Or by monkey-patching a L<query()> method into L<HTML::Element>.
686              
687             use HTML::Query 'query'; # note lower case 'q'
688             use HTML::TreeBuilder;
689              
690             # build a tree
691             my $tree = HTML::TreeBuilder->new;
692             $tree->parse_file($filename);
693              
694             # call the query() method on any element
695             my $query = $tree->query($spec);
696              
697             Once you have a query, you can start selecting elements:
698              
699             @r = $q->query('a')->get_elements(); # all <a>...</a> elements
700             @r = $q->query('a#menu')->get_elements(); # all <a> with "menu" id
701             @r = $q->query('#menu')->get_elements(); # all elements with "menu" id
702             @r = $q->query('a.menu')->get_elements(); # all <a> with "menu" class
703             @r = $q->query('.menu')->get_elements(); # all elements with "menu" class
704             @r = $q->query('a[href]')->get_elements(); # all <a> with 'href' attr
705             @r = $q->query('a[href=foo]')->get_elements(); # all <a> with 'href="foo"' attr
706              
707             # you can specify elements within elements...
708             @r = $q->query('ul.menu li a')->get_elements(); # <ul class="menu">...<li>...<a>
709              
710             # and use commas to delimit multiple path specs for different elements
711             @r = $q->query('table tr td a, form input[type=submit]')->get_elements();
712              
713             # query() in scalar context returns a new query
714             $r = $q->query('table'); # find all tables
715             $s = $r->query('tr'); # find all rows in all those tables
716             $t = $s->query('td'); # and all cells in those rows...
717              
718             Inspecting query elements:
719              
720             # get number of elements in query
721             my $size = $q->size
722              
723             # get first/last element in query
724             my $first = $q->first;
725             my $last = $q->last;
726              
727             # convert query to list or list ref of HTML::Element objects
728             my $list = $q->list; # list ref in scalar context
729             my @list = $q->list; # list in list context
730              
731             All other methods are mapped onto the L<HTML::Element> objects
732             in the query:
733              
734             print $query->as_trimmed_text; # print trimmed text for each element
735             print $query->as_HTML; # print each element as HTML
736             $query->delete; # call delete() on each element
737              
738             =head1 DESCRIPTION
739              
740             The C<HTML::Query> module is an add-on for the L<HTML::Tree> module
741             set. It provides a simple way to select one or more elements from a tree using
742             a query syntax inspired by jQuery. This selector syntax will be reassuringly
743             familiar to anyone who has ever written a CSS selector.
744              
745             C<HTML::Query> is not an attempt to provide a complete (or even near-complete)
746             implementation of jQuery in Perl (see Ingy's L<pQuery> module for a
747             more ambitious attempt at that). Rather, it borrows some of the tried and
748             tested selector syntax from jQuery (and CSS) that can easily be mapped onto
749             the C<look_down()> method provided by the L<HTML::Element>
750             module.
751              
752             =head2 Creating a Query
753              
754             The easiest way to create a query is using the exportable L<Query()>
755             subroutine.
756              
757             use HTML::Query 'Query'; # note capital 'Q'
758              
759             It accepts a C<text> or C<file> named parameter and will create an
760             C<HTML::Query> object from the HTML source text or file, respectively.
761              
762             my $query = Query( text => $text );
763             my $query = Query( file => $file );
764              
765             This delegates to L<HTML::TreeBuilder> to parse the
766             HTML into a tree of L<HTML::Element> objects. The root
767             element returned is then wrapped in an C<HTML::Query> object.
768              
769             If you already have one or more L<HTML::Element> objects that
770             you want to query then you can pass them to the L<Query()> subroutine as
771             arguments. For example, you can explicitly use
772             L<HTML::TreeBuilder> to parse an HTML document into a tree:
773              
774             use HTML::TreeBuilder;
775             my $tree = HTML::TreeBuilder->new;
776             $tree->parse_file($filename);
777              
778             And then create an C<HTML::Query> object for the tree either using an
779             explicit C<tree> named parameter:
780              
781             my $query = Query( tree => $tree );
782              
783             Or implicitly using positional arguments.
784              
785             my $query = Query( $tree );
786              
787             If you want to query across multiple elements, then pass each one as a
788             positional argument.
789              
790             my $query = Query( $tree1, $tree2, $tree3 );
791              
792             You can also create a new query from one or more existing queries,
793              
794             my $query = Query( query => $query ); # named parameter
795             my $query = Query( $query1, $query2 ); # positional arguments.
796              
797             You can mix and match these different parameters and positional arguments
798             to create a query across several different sources.
799              
800             $q = Query(
801             text => $text1,
802             text => $text2,
803             file => $file1,
804             file => $file2,
805             tree => $tree,
806             query => $query,
807             );
808              
809             The L<Query()> subroutine is a simple wrapper around the L<new()>
810             constructor method. You can instantiate your objects manually if you prefer.
811             The L<new()> method accepts the same arguments as for the L<Query()>
812             subroutine (in fact, the L<Query()> subroutine simply forwards all
813             arguments to the L<new()> method).
814              
815             use HTML::Query;
816              
817             my $query = HTML::Query->new(
818             # same argument format as for Query()
819             );
820              
821             A final way to use C<HTML::Query> is to have it add a L<query()> method
822             to L<HTML::Element>. The C<query> import hook (all lower
823             case) can be specified to make this so.
824              
825             use HTML::Query 'query'; # note lower case 'q'
826             use HTML::TreeBuilder;
827              
828             my $tree = HTML::TreeBuilder->new;
829             $tree->parse_file($filename);
830              
831             # now all HTML::Elements have a query() method
832             my @items = $tree->query('ul li')->get_elements(); # find all list items
833              
834             This approach, often referred to as I<monkey-patching>, should be used
835             carefully and sparingly. It involves a violation of
836             L<HTML::Element>'s namespace that could have unpredictable
837             results with a future version of the module (e.g. one which defines its own
838             C<query()> method that does something different). Treat it as something that
839             is great to get a quick job done right now, but probably not something to be
840             used in production code without careful consideration of the implications.
841              
842             =head2 Selecting Elements
843              
844             Having created an C<HTML::Query> object by one of the methods outlined above,
845             you can now fetch descendant elements in the tree using a simple query syntax.
846             For example, to fetch all the C<< <a> >> elements in the tree, you can
847             write:
848              
849             @links = $query->query('a')->get_elements();
850              
851             Or, if you want the elements that have a specific C<class> attribute defined
852             with a value of, say C<menu>, you can write:
853              
854             @links = $query->query('a.menu')->get_elements();
855              
856             More generally, you can look for the existence of any attribute and optionally
857             provide a specific value for it.
858              
859             @links = $query->query('a[href]')->get_elements(); # any href attribute
860             @links = $query->query('a[href=index.html]')->get_elements(); # specific value
861              
862             You can also find an element (or elements) by specifying an id.
863              
864             @links = $query->query('#menu')->get_elements(); # any element with id="menu"
865             @links = $query->query('ul#menu')->get_elements(); # ul element with id="menu"
866              
867             You can provide multiple selection criteria to find elements within elements
868             within elements, and so on. For example, to find all links in a menu,
869             you can write:
870              
871             # matches:
872             @links = $query->query('ul.menu li a')->get_elements();
873              
874             You can separate different criteria using commas. For example, to fetch all
875             table rows and C<span> elements with a C<foo> class:
876              
877             @elems = $query->query('table tr, span.foo')->get_elements();
878              
879             =head2 Query Results
880              
881             When called in list context, as shown in the examples above, the L<query()>
882             method returns a list of L<HTML::Element> objects matching the
883             search criteria. In scalar context, the L<query()> method returns a new
884             C<HTML::Query> object containing the L<HTML::Element> objects
885             found. You can then call the L<query()> method against that object to further
886             refine the query. The L<query()> method applies the selection to all elements
887             stored in the query.
888              
889             my $tables = $query->query('table'); # query for tables
890             my $rows = $tables->query('tr'); # requery for all rows in those tables
891             my $cells = $rows->query('td')->get_elements(); # return back all the cells in those rows
892              
893             =head2 Inspection Methods
894              
895             The L<size()> method returns the number of elements in the query. The
896             L<first()> and L<last()> methods return the first and last items in the
897             query, respectively.
898              
899             if ($query->size) {
900             print "from ", $query->first->as_trimmed_text, " to ", $query->last->as_trimmed_text;
901             }
902              
903             If you want to extract the L<HTML::Element> objects from the
904             query you can call the L<list()> method. This returns a list of
905             L<HTML::Element> objects in list context, or a reference to a
906             list in scalar context.
907              
908             @elems = $query->list;
909             $elems = $query->list;
910              
911             =head2 Element Methods
912              
913             Any other methods are automatically applied to each element in the list. For
914             example, to call the C<as_trimmed_text()> method on all the
915             L<HTML::Element> objects in the query, you can write:
916              
917             print $query->as_trimmed_text;
918              
919             In list context, this method returns a list of the return values from
920             calling the method on each element. In scalar context it returns a
921             reference to a list of return values.
922              
923             @text_blocks = $query->as_trimmed_text;
924             $text_blocks = $query->as_trimmed_text;
925              
926             See L<HTML::Element> for further information on the methods it
927             provides.
928              
929             =head1 QUERY SYNTAX
930              
931             =head2 Basic Selectors
932              
933             =head3 element
934              
935             Matches all elements of a particular type.
936              
937             @elems = $query->query('table')->get_elements(); #
938              
939             =head3 #id
940              
941             Matches all elements with a specific id attribute.
942              
943             @elems = $query->query('#menu')->get_elements(); #
944              
945             This can be combined with an element type:
946              
947             @elems = $query->query('ul#menu')->get_elements(); #
948              
949             =head3 .class
950              
951             Matches all elements with a specific class attribute.
952              
953             @elems = $query->query('.info')->get_elements(); #
954              
955             This can be combined with an element type and/or element id:
956              
957             @elems = $query->query('p.info')->get_elements(); #

958             @elems = $query->query('p#foo.info')->get_elements(); #

959             @elems = $query->query('#foo.info')->get_elements(); #
960              
961             The selectors listed above can be combined in a whitespace delimited
962             sequence to select down through a hierarchy of elements. Consider the
963             following table:
964              
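965             <table class="search">
966               <tr class="result">
967                 <td class="value">WE WANT THIS ELEMENT</td>
968               </tr>
969               <tr class="result">
970                 <td class="value">AND THIS ONE</td>
971               </tr>
972               ...etc...
973             </table>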
974              
975             To locate the cells that we're interested in, we can write:
976              
977             @elems = $query->query('table.search tr.result td.value')->get_elements();
978              
979             =head2 Attribute Selectors
980              
981             W3C CSS 2 specification defines new constructs through which to select
982             based on specific attributes within elements. See the following link for the spec:
983             L<http://www.w3.org/TR/css3-selectors/#attribute-selectors>
984              
985             =head3 [attr]
986              
987             Matches elements that have the specified attribute, including any where
988             the attribute has no value.
989              
990             @elems = $query->query('[href]')->get_elements(); #
991              
992             This can be combined with any of the above selectors. For example:
993              
994             @elems = $query->query('a[href]')->get_elements(); #
995             @elems = $query->query('a.menu[href]')->get_elements(); #
996              
997             You can specify multiple attribute selectors. Only those elements that
998             match I<all> of them will be selected.
999              
1000             @elems = $query->query('a[href][rel]')->get_elements(); #
1001              
1002             =head3 [attr=value]
1003              
1004             Matches elements that have an attribute set to a specific value. The
1005             value can be quoted in either single or double quotes, or left unquoted.
1006              
1007             @elems = $query->query('[href=index.html]')->get_elements();
1008             @elems = $query->query('[href="index.html"]')->get_elements();
1009             @elems = $query->query("[href='index.html']")->get_elements();
1010              
1011             You can specify multiple attribute selectors. Only those elements that
1012             match I<all> of them will be selected.
1013              
1014             @elems = $query->query('a[href=index.html][rel=home]')->get_elements();
1015              
1016             =head3 [attr|=value]
1017              
1018             Matches any element whose C<attr> attribute has a hyphen-separated list of
1019             values beginning (from the left) with C<value>. The value can be quoted in either
1020             single or double quotes, or left unquoted.
1021              
1022             @elems = $query->query('[lang|=en]')->get_elements();
1023             @elems = $query->query('p[class|="example"]')->get_elements();
1024             @elems = $query->query("img[alt|='fig']")->get_elements();
1025              
1026             You can specify multiple attribute selectors. Only those elements that
1027             match I<all> of them will be selected.
1028              
1029             @elems = $query->query('p[class|="external"][lang|="en"]')->get_elements();
1030              
1031             =head3 [attr~=value]
1032              
1033             Matches any element whose C<attr> attribute value is a list of space-separated
1034             values, one of which is exactly equal to C<value>. The value can be quoted in either
1035             single or double quotes, or left unquoted.
1036              
1037             @elems = $query->query('[lang~=en]')->get_elements();
1038             @elems = $query->query('p[class~="example"]')->get_elements();
1039             @elems = $query->query("img[alt~='fig']")->get_elements();
1040              
1041             You can specify multiple attribute selectors. Only those elements that
1042             match I<all> of them will be selected.
1043              
1044             @elems = $query->query('p[class~="external"][lang~="en"]')->get_elements();
1045              
1046             KNOWN BUG: you can't have a C<]> character in the attribute value because
1047             it confuses the query parser. Fixing this is TODO.
1048              
1049             =head2 Universal Selector
1050              
1051             W3C CSS 2 specification defines a new construct through which to select
1052             any element within the document below a given hierarchy.
1053              
1054             http://www.w3.org/TR/css3-selectors/#universal-selector
1055              
1056             @elems = $query->query('*')->get_elements();
1057              
1058             =head2 Combinator Selectors
1059              
1060             W3C CSS 2 specification defines new constructs through which to select
1061             based on hierarchy within the DOM. See the following link for the spec:
1062             L<http://www.w3.org/TR/css3-selectors/#combinators>
1063              
1064             =head3 Immediate Descendents (children)
1065              
1066             When you combine selectors with whitespace elements are selected if
1067             they are descended from the parent in some way. But if you just want
1068             to select the children (and not the grandchildren, great-grandchildren,
1069             etc) then you can combine the selectors with the C<< > >> character.
1070              
1071             @elems = $query->query('a > img')->get_elements();
1072              
1073             =head3 Non-Immediate Descendents
1074              
1075             If you just want any descendents that aren't children then you can combine
1076             selectors with the C<*> character.
1077              
1078             @elems = $query->query('div * a')->get_elements();
1079              
1080             =head3 Immediate Siblings
1081              
1082             If you want to use a sibling relationship then you can join selectors
1083             with the C<+> character.
1084              
1085             @elems = $query->query('img + span')->get_elements();
1086              
1087             =head2 Pseudo-classes
1088              
1089             W3C CSS 2 and CSS 3 specifications define new concepts of pseudo-classes to
1090             permit formatting based on information that lies outside the document tree.
1091             See the following link for the most recent spec:
1092             L<http://www.w3.org/TR/css3-selectors/#pseudo-classes>
1093              
1094             HTML::Query currently has limited support for CSS 2, and no support for CSS 3.
1095              
1096             Patches are *highly* encouraged to help add support here.
1097              
1098             =head3 -child pseudo-classes
1099              
1100             If you want to return child elements within a certain position then -child
1101             pseudo-classes (:first-child, :last-child) are what you're looking for.
1102              
1103             @elems = $query->query('table td:first-child')->get_elements;
1104              
1105             =head3 Link pseudo-classes: :link and :visited
1106              
1107             Unsupported.
1108              
1109             The :link pseudo-class is to be implemented, currently unsupported.
1110              
1111             It is not possible to locate :visited outside of a browser context due to its
1112             dynamic nature.
1113              
1114             =head3 Dynamic pseudo-classes
1115              
1116             Unsupported.
1117              
1118             It is not possible to locate these classes (:hover, :active, :focus) outside
1119             of a browser context due to their dynamic nature.
1120              
1121             =head3 Language pseudo-class
1122              
1123             Unsupported.
1124              
1125             Functionality for the :lang pseudo-class is largely replicated by using an
1126             attribute selector for lang combined with a universal selector query.
1127              
1128             If this is insufficient I'd love to see a patch adding support for it.
1129              
1130             =head3 Other pseudo-classes
1131              
1132             W3C CSS 3 added a number of new behaviors that need support. At
1133             this time there is no support for them, but we should work on adding support.
1134              
1135             Patches are very welcome.
1136              
1137             =head2 Pseudo-elements
1138              
1139             W3C CSS 2 and CSS 3 specifications define new concepts of pseudo-elements to
1140             permit formatting based on information that lies outside the document tree.
1141             See the following link for the most recent spec:
1142             L<http://www.w3.org/TR/css3-selectors/#pseudo-elements>
1143              
1144             At this time there is no support for pseudo-elements, but we are working
1145             on adding support.
1146              
1147             Patches are very welcome.
1148              
1149             =head2 Combining Selectors
1150              
1151             You can combine basic and hierarchical selectors into a single query
1152             by separating each part with a comma. The query will select all matching
1153             elements for each of the comma-delimited selectors. For example, to
1154             find all C<a>, C<b> and C<i> elements in a tree:
1155              
1156             @elems = $query->query('a, b, i')->get_elements();
1157              
1158             Each of these selectors can be arbitrarily complex.
1159              
1160             @elems = $query->query(
1161             'table.search[width=100%] tr.result[valign=top] td.value,
1162             form.search input[type=submit],
1163             a[href=index.html]'
1164             )->get_elements();
1165              
1166             =head1 EXPORT HOOKS
1167              
1168             =head2 Query
1169              
1170             The C<Query()> constructor subroutine (note the capital letter) can be
1171             exported as a convenient way to create C<HTML::Query> objects. It simply
1172             forwards all arguments to the L<new()> constructor method.
1173              
1174             use HTML::Query 'Query';
1175              
1176             my $query = Query( file => $file, 'ul.menu li a' );
1177              
1178             =head2 query
1179              
1180             The C<query> export hook can be called to monkey-patch a L<query()> method
1181             into the L<HTML::Element> module.
1182              
1183             This is considered questionable behaviour in polite society which regards it
1184             as a violation of the inner sanctity of the L<HTML::Element>.
1185              
1186             But if you're the kind of person that doesn't mind a bit of occasional
1187             namespace abuse for the sake of getting the job done, then go right ahead.
1188             Just don't blame me if it all blows up later.
1189              
1190             use HTML::Query 'query'; # note lower case 'q'
1191             use HTML::TreeBuilder;
1192              
1193             # build a tree
1194             my $tree = HTML::TreeBuilder->new;
1195             $tree->parse_file($filename);
1196              
1197             # call the query() method on any element
1198             my $query = $tree->query('ul li a');
1199              
1200             =head1 METHODS
1201              
1202             The C<HTML::Query> object is a subclass of L<Badger::Base> and
1203             inherits all of its methods.
1204              
1205             =head2 new(@elements,$selector)
1206              
1207             This constructor method is used to create a new C<HTML::Query> object. It
1208             expects a list of any number (including zero) of
1209             L<HTML::Element> or C<HTML::Query> objects.
1210              
1211             # single HTML::Element object
1212             my $query = HTML::Query->new($elem);
1213              
1214             # multiple element object
1215             my $query = HTML::Query->new($elem1, $elem2, $elem3, ...);
1216              
1217             # copy elements from an existing query
1218             my $query = HTML::Query->new($another_query);
1219              
1220             # copy elements from several queries
1221             my $query = HTML::Query->new($query1, $query2, $query3);
1222              
1223             # or a mixture
1224             my $query = HTML::Query->new($elem1, $query1, $elem2, $query3);
1225              
1226             You can also use named parameters to specify an alternate source for an
1227             element.
1228              
1229             $query = HTML::Query->new( file => $file );
1230             $query = HTML::Query->new( text => $text );
1231              
1232             In this case, the L<HTML::TreeBuilder> module is used to
1233             parse the source file or text into a tree of L<HTML::Element>
1234             objects.
1235              
1236             For the sake of completeness, you can also specify element trees and queries
1237             using named parameters:
1238              
1239             $query = HTML::Query->new( tree => $tree );
1240             $query = HTML::Query->new( query => $query );
1241              
1242             You can freely mix and match elements, queries and named sources. The
1243             query will be constructed as an aggregate across them all.
1244              
1245             $q = HTML::Query->new(
1246             text => $text1,
1247             text => $text2,
1248             file => $file1,
1249             file => $file2,
1250             tree => $tree,
1251             query => $query1,
1252             );
1253              
1254             The final, optional argument can be a selector specification. This is
1255             immediately passed to the L<query()> method which will return a new query
1256             with only those elements selected.
1257              
1258             my $spec = 'ul.menu li a'; #
1259              
1260             my $query = HTML::Query->new( $tree, $spec );
1261             my $query = HTML::Query->new( text => $text, $spec );
1262             my $query = HTML::Query->new(
1263             text => $text,
1264             file => $file,
1265             $spec
1266             );
1267              
1268             The list of arguments can also be passed by reference to a list.
1269              
1270             my $query = HTML::Query->new(\@args);
1271              
1272             =head2 query($spec)
1273              
1274             This method locates the descendant elements identified by the C<$spec>
1275             argument for each element in the query. It then internally stores the results
1276             for requerying or return. See L<get_elements()>.
1277              
1278             my $query = HTML::Query->new(\@args);
1279             my $results = $query->query($spec);
1280              
1281             See L<"QUERY SYNTAX"> for the permitted syntax of the C<$spec> argument.
1282              
1283             =head2 get_elements()
1284              
1285             This method returns the stored results from a query. In list context it returns a list of
1286             matching L<HTML::Element> objects. In scalar context it returns a reference to
1287             the results array.
1288              
1289             my $query = HTML::Query->new(\@args);
1290             my $results = $query->query($spec);
1291              
1292             my @elements = $results->query($spec)->get_elements();
1293             my $elements = $results->query($spec)->get_elements();
1294              
1295             =head2 get_specificity()
1296              
1297             Calculate the specificity for any given passed selector, a critical factor in determining how best to apply the cascade
1298              
1299             A selector's specificity is calculated as follows:
1300              
1301             * count the number of ID attributes in the selector (= a)
1302             * count the number of other attributes and pseudo-classes in the selector (= b)
1303             * count the number of element names in the selector (= c)
1304             * ignore pseudo-elements.
1305              
1306             The specificity is based only on the form of the selector. In particular, a selector of the form "[id=p33]" is counted
1307             as an attribute selector (a=0, b=1, c=0), even if the id attribute is defined as an "ID" in the source document's DTD.
1308              
1309             See the following spec for additional details:
1310             L
1311              
1312             =head2 size()
1313              
1314             Returns the number of elements in the query.
1315              
1316             =head2 first()
1317              
1318             Returns the first element in the query.
1319              
1320             my $elem = $query->first;
1321              
1322             If the query is empty then an exception will be thrown. If you would rather
1323             have an undefined value returned then you can use the C<try> method inherited
1324             from L<Badger::Base>. This effectively wraps the call to
1325             C<first()> in an C<eval> block to catch any exceptions thrown.
1326              
1327             my $elem = $query->try('first') || warn "no first element\n";
1328              
1329             =head2 last()
1330              
1331             Similar to L<first()>, but returning the last element in the query.
1332              
1333             my $elem = $query->last;
1334              
1335             =head2 list()
1336              
1337             Returns a list of the L<HTML::Element> objects in the query in
1338             list context, or a reference to a list in scalar context.
1339              
1340             my @elems = $query->list;
1341             my $elems = $query->list;
1342              
1343             =head2 AUTOLOAD
1344              
1345             The C<AUTOLOAD> method maps any other method calls to the
1346             L<HTML::Element> objects in the list. When called in list
1347             context it returns a list of the values returned from calling the method on
1348             each element. In scalar context it returns a reference to a list of return
1349             values.
1350              
1351             my @text_blocks = $query->as_trimmed_text;
1352             my $text_blocks = $query->as_trimmed_text;
1353              
1354             =head1 KNOWN BUGS
1355              
1356             =head2 Attribute Values
1357              
1358             It is not possible to use C<]> in an attribute value. This is due to a
1359             limitation in the parser which will be fixed RSN.
1360              
1361             =head1 AUTHOR
1362              
1363             Andy Wardley L
1364              
1365             =head1 MAINTAINER
1366              
1367             Kevin Kamel
1368              
1369             =head1 CONTRIBUTORS
1370              
1371             Vivek Khera
1372             Michael Peters
1373             David Gray
1374              
1375             =head1 COPYRIGHT
1376              
1377             Copyright (C) 2010 Andy Wardley. All Rights Reserved.
1378              
1379             This module is free software; you can redistribute it and/or modify it
1380             under the same terms as Perl itself.
1381              
1382             =head1 SEE ALSO
1383              
1384             L, L,
1385             L, L, L
1386              
1387             =cut
1388              
1389             # Local Variables:
1390             # mode: Perl
1391             # perl-indent-level: 4
1392             # indent-tabs-mode: nil
1393             # End:
1394             #
1395             # vim: expandtab shiftwidth=4: