File Coverage

lib/HTML/Query.pm
Criterion Covered Total %
statement 225 237 94.9
branch 106 126 84.1
condition 31 39 79.4
subroutine 23 24 95.8
pod 8 11 72.7
total 393 437 89.9


line stmt bran cond sub pod time code
1             package HTML::Query;
2              
3             our $VERSION = '0.08';
4              
5             use Badger::Class
6 11         266 version => $VERSION,
7             debug => 0,
8             base => 'Badger::Base',
9             utils => 'blessed',
10             import => 'class CLASS',
11             vars => 'AUTOLOAD',
12             constants => 'ARRAY',
13             constant => {
14             ELEMENT => 'HTML::Element',
15             BUILDER => 'HTML::TreeBuilder',
16             },
17             exports => {
18             any => 'Query',
19             hooks => {
20             query => \&_export_query_to_element,
21             },
22             },
23             messages => {
24             no_elements => 'No elements specified to query',
25             no_query => 'No query specified',
26             no_source => 'No argument specified for source: %s',
27             bad_element => 'Invalid element specified: %s',
28             bad_source => 'Invalid source specified: %s',
29             bad_query => 'Invalid query specified: %s',
30             bad_spec => 'Invalid specification "%s" in query: %s',
31             is_empty => 'The query does not contain any elements',
32 11     11   1957925 };
  11         20764  
33              
34             our $SOURCES = {
35             text => sub {
36             class(BUILDER)->load;
37             BUILDER->new_from_content(shift);
38             },
39             file => sub {
40             class(BUILDER)->load;
41             BUILDER->new_from_file(shift);
42             },
43             tree => sub {
44             $_[0]
45             },
46             query => sub {
47             ref $_[0] eq ARRAY
48             ? @{ $_[0] }
49             : $_[0];
50             },
51             };
52              
53             sub Query (@) {
54 48     48 1 90876 CLASS->new(@_);
55             }
56              
57             sub new {
58 167     167 1 16020 my $class = shift;
59 167         187 my ($element, @elements, $type, $code, $select);
60              
61             # expand a single list ref into items
62 167 100 100     661 unshift @_, @{ shift @_ }
  2         7  
63             if @_ == 1 && ref $_[0] eq ARRAY;
64              
65 167   66     514 $class = ref $class || $class;
66              
67 167         798 my $self = {
68             error => undef,
69             suppress_errors => undef,
70             match_self => undef,
71             elements => \@elements,
72             specificity => {}
73             };
74              
75             # each element should be an HTML::Element object, although we might
76             # want to subclass this module to recognise a different kind of object,
77             # so we get the element class from the ELEMENT constant method which a
78             # subclass can re-define.
79 167         586 my $element_class = $class->ELEMENT;
80              
81 167         376 while (@_) {
82 287         327 $element = shift;
83 287         264 $class->debug("argument: ".$element) if DEBUG;
84              
85 287 100       1089 if (! ref $element) {
    50          
86             # a non-reference item is a source type (text, file, tree)
87             # followed by the source, or if it's the last argument following
88             # one or more element options or named argument pairs then it's
89             # a selection query
90 19 100       61 if (@_) {
    100          
91 12         18 $type = $element;
92 12   50     93 $code = $SOURCES->{ $type }
93             || return $class->error_msg( bad_source => $type );
94 12         20 $element = shift;
95 12         13 $class->debug("source $type: $element") if DEBUG;
96 12         41 unshift(@_, $code->($element));
97 12         51462 next;
98             }
99             elsif (@elements) {
100 5         9 $select = $element;
101 5         8 last;
102             }
103             }
104             elsif (blessed $element) {
105             # otherwise it should be an HTML::Element object or another
106             # HTML::Query object
107 268 100       863 if ($element->isa($element_class)) {
    50          
108 266         370 push(@elements, $element);
109 266         685 next;
110             }
111             elsif ($element->isa($class)) {
112 2         4 push(@elements, @{$element->get_elements});
  2         8  
113 2         6 next;
114             }
115             }
116              
117 2         19 return $class->error_msg( bad_element => $element );
118             }
119              
120 165         353 bless $self, $class;
121              
122 165 100       12810 return defined $select ? $self->query($select) : $self;
123             }
124              
125             sub query {
126 129     129 1 4117 my ($self, $query) = @_;
127 129         146 my @result;
128 129         165 my $ops = 0;
129 129         151 my $pos = 0;
130              
131 129         193 $self->{error} = undef;
132              
133 129 100 100     717 return $self->error_msg('no_query')
134             unless defined $query && length $query;
135              
136             # multiple specs can be comma separated, e.g. "table tr td, li a, div.foo"
137 127         156 COMMA: while (1) {
138             # each comma-separated traversal spec is applied downward from
139             # the source elements in the $self->{elements} query
140 134         141 my @elements = @{$self->get_elements};
  134         256  
141 134         174 my $comops = 0;
142              
143 134         135 my $specificity = 0;
144 134   100     465 my $startpos = pos($query) || 0;
145              
146 134         146 my $hack_sequence = 0; # look for '* html'
147              
148 134         127 warn "Starting new COMMA" if DEBUG;
149              
150             # for each whitespace delimited descendant spec we grok the correct
151             # parameters for look_down() and apply them to each source element
152             # e.g. "table tr td"
153 134         127 SEQUENCE: while (1) {
154 373         363 my @args;
155 373   100     1007 $pos = pos($query) || 0;
156 373         468 my $relationship = '';
157 373         343 my $leading_whitespace;
158              
159 373         380 warn "Starting new SEQUENCE" if DEBUG;
160              
161             # ignore any leading whitespace
162 373 100       1270 if ($query =~ / \G (\s+) /cgsx) {
163 101 50       228 $leading_whitespace = defined($1) ? 1 : 0;
164 101         104 warn "removing leading whitespace\n" if DEBUG;
165             }
166              
167             # grandchild selector is whitespace sensitive, requires leading whitespace
168 373 100 100     1192 if ($leading_whitespace && $comops && ($query =~ / \G (\*) \s+ /cgx)) {
      100        
169             # can't have a relationship modifier as the first part of the query
170 6         11 $relationship = $1;
171 6         20 warn "relationship = $relationship\n" if DEBUG;
172             }
173              
174             # get other relationship modifiers
175 373 100       917 if ($query =~ / \G (>|\+) \s* /cgx) {
176             # can't have a relationship modifier as the first part of the query
177 27         47 $relationship = $1;
178 27         27 warn "relationship = $relationship\n" if DEBUG;
179 27 50       52 if (!$comops) {
180 0         0 return $self->_report_error( $self->message( bad_spec => $relationship, $query ) );
181             }
182             }
183              
184             # optional leading word is a tag name
185 373 100       1035 if ($query =~ / \G ([\w\*]+) /cgx) {
186 223         419 my $tag = $1;
187              
188 223 100       458 if ($tag =~ m/\*/) {
189 18 100 66     119 if (($leading_whitespace || $comops == 0) && ($tag eq '*')) {
      66        
190 14         14 warn "universal tag\n" if DEBUG;
191 14         60 push(@args, _tag => qr/\w+/);
192              
193 14 100       47 if ($comops == 0) { #we need to catch the case where we see '* html'
194 7         15 $hack_sequence++;
195             }
196             }
197             else {
198 4         18 return $self->_report_error( $self->message( bad_spec => $tag, $query ) );
199             }
200             }
201             else {
202 205         208 warn "html tag\n" if DEBUG;
203 205         229 $specificity += 1; # standard tags are worth 1 point
204 205         372 push( @args, _tag => $tag );
205              
206 205 100 100     686 if ($comops == 1 && $tag eq 'html') {
207 1         2 $hack_sequence++;
208             }
209             }
210             }
211              
212             # loop to collect a description about this specific part of the rule
213 369         448 while (1) {
214 519         607 my $work = scalar @args;
215              
216             # that can be followed by (or the query can start with) a #id
217 519 100       2308 if ($query =~ / \G \# ([\w\-]+) /cgx) {
218 39         56 $specificity += 100;
219 39         105 push( @args, id => $1 );
220             }
221              
222             # and/or a .class
223 519 100       1314 if ($query =~ / \G \. ([\w\-]+) /cgx) {
224 73         100 $specificity += 10;
225 73         12316 push( @args, class => qr/ (^|\s+) $1 ($|\s+) /x );
226             }
227              
228             # and/or none or more [ ] attribute specs
229 519 100       1315 if ($query =~ / \G \[ (.*?) \] /cgx) {
230 47         95 my $attribute = $1;
231 47         49 $specificity += 10;
232              
233             #if we have an operator
234 47 100       269 if ($attribute =~ m/(.*?)\s*([\|\~]?=)\s*(.*)/) {
235 38         124 my ($name,$attribute_op,$value) = ($1,$2,$3);
236              
237 38 50 33     199 unless (defined($name) && length($name)) {
238 0         0 return $self->_report_error( $self->message( bad_spec => $name, $query ) );
239             }
240              
241 38         38 warn "operator $attribute_op" if DEBUG;
242              
243 38 50       71 if (defined $value) {
244 38         64 for ($value) {
245 38         98 s/^['"]//;
246 38         134 s/['"]$//;
247             }
248 38 100       101 if ($attribute_op eq '=') {
    100          
    50          
249 33         90 push( @args, $name => $value);
250             }
251             elsif ($attribute_op eq '|=') {
252 2         24 push(@args, $name => qr/\b${value}-?/)
253             }
254             elsif ($attribute_op eq '~=') {
255 3         39 push(@args, $name => qr/\b${value}\b/)
256             }
257             else {
258 0         0 return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
259             }
260             }
261             else {
262 0         0 return $self->_report_error( $self->message( bad_spec => $attribute_op, $query ) );
263             }
264             }
265             else {
266 9 100 66     55 unless (defined($attribute) && length($attribute)) {
267 2         25 return $self->_report_error( $self->message( bad_spec => $attribute, $query ) );
268             }
269              
270             # add a regex to match anything (or nothing)
271 7         46 push( @args, $attribute => qr/.*/ );
272             }
273             }
274             # and/or one or more pseudo-classes
275 517 100       957 if ($query =~ / \G : ([\w\-]+) /cgx) {
276 5         8 my $pseudoclass = $1;
277 5         7 $specificity += 10;
278              
279 5 100       14 if ($pseudoclass eq 'first-child') {
    50          
280 3     12   37 push( @args, sub { ! grep { ref $_ } $_[0]->left() } );
  12         347  
  12         132  
281             } elsif ($pseudoclass eq 'last-child') {
282 2     12   14 push( @args, sub { ! grep { ref $_ } $_[0]->right() } );
  12         505  
  12         175  
283             } else {
284 0         0 warn "Pseudoclass :$pseudoclass not supported";
285 0         0 next;
286             }
287             }
288              
289             # keep going until this particular expression is fully processed
290 517 100       1098 last unless scalar(@args) > $work;
291             }
292              
293             # we must have something in @args by now or we didn't find any
294             # valid query specification this time around
295 367 100       753 last SEQUENCE unless @args;
296              
297 239         234 $self->debug(
298             'Parsed ', substr($query, $pos, pos($query) - $pos),
299             ' into args [', join(', ', @args), ']'
300             ) if DEBUG;
301              
302             # we want to skip certain hack sequences like '* html'
303 239 100       611 if ($hack_sequence == 2) {
    100          
    100          
    100          
    50          
304 1         4 @elements = []; # clear out our stored elements to match behaviour of modern browsers
305             }
306             # we're just looking for any descendent
307             elsif( !$relationship ) {
308 205 100       435 if ($self->{match_self}) {
309             # if we are re-querying, be sure to match ourselves not just descendents
310 2         5 @elements = map { $_->look_down(@args) } @elements;
  4         82  
311             } else {
312             # look_down() will match self in addition to descendents,
313             # so we explicitly disallow matches on self as we iterate
314             # thru the list. The other cases below already exclude self.
315             # https://rt.cpan.org/Public/Bug/Display.html?id=58918
316 203         238 my @accumulator;
317 203         292 foreach my $e (@elements) {
318 248 100       3245 if ($e->root() == $e) {
319 108         950 push(@accumulator, $e->look_down(@args));
320             }
321             else {
322 140         1718 push(@accumulator, grep { $_ != $e } $e->look_down(@args));
  235         12438  
323             }
324             }
325 203         44929 @elements = @accumulator;
326             }
327             }
328             # immediate child selector
329             elsif( $relationship eq '>' ) {
330             @elements = map {
331 16         44 $_->look_down(
332             @args,
333             sub {
334 61     61   2238 my $tag = shift;
335 61         65 my $root = $_;
336              
337 61         146 return $tag->depth == $root->depth + 1;
338             }
339             )
340 36         1302 } @elements;
341             }
342             # immediate sibling selector
343             elsif( $relationship eq '+' ) {
344             @elements = map {
345 11         19 $_->parent->look_down(
346             @args,
347             sub {
348 253     253   6095 my $tag = shift;
349 253         256 my $root = $_;
350 253         534 my @prev_sibling = $tag->left;
351             # get prev next non-text sibling
352 253         3845 foreach my $sibling (reverse @prev_sibling) {
353 280 100       842 next unless ref $sibling;
354 69         220 return $sibling == $root;
355             }
356             }
357             )
358 61         1012 } @elements;
359             }
360             # grandchild selector
361             elsif( $relationship eq '*' ) {
362             @elements = map {
363 6         9 $_->look_down(
364             @args,
365             sub {
366 40     40   1886 my $tag = shift;
367 40         41 my $root = $_;
368              
369 40         93 return $tag->depth > $root->depth + 1;
370             }
371             )
372 9         169 } @elements;
373             }
374              
375             # so we can check we've done something
376 239         1390 $comops++;
377              
378             # dedup the results we've gotten
379 239         568 @elements = $self->_dedup(\@elements);
380              
381 239         593 map { warn $_->as_HTML } @elements if DEBUG;
382             }
383              
384 128 100       233 if ($comops) {
385 127         119 $self->debug(
386             'Added', scalar(@elements), ' elements to results'
387             ) if DEBUG;
388              
389 127         481 my $selector = substr ($query,$startpos, $pos - $startpos);
390 127         314 $self->_add_specificity($selector,$specificity);
391              
392             #add in the recent pass
393 127         179 push(@result,@elements);
394              
395             # dedup the results across the result sets, necessary for comma based selectors
396 127         476 @result = $self->_dedup(\@result);
397              
398             # sort the result set...
399 127         473 @result = sort _by_address @result;
400              
401             # update op counter for complete query to include ops performed
402             # in this fragment
403 127         198 $ops += $comops;
404             }
405             else {
406             # looks like we got an empty comma section, e.g. : ",x, ,y,"
407             # so we'll ignore it
408             }
409              
410 128 100       411 last COMMA unless $query =~ / \G \s*,\s* /cgsx;
411             }
412              
413             # check for any trailing text in the query that we couldn't parse
414 121 50       257 if ($query =~ / \G (.+?) \s* $ /cgsx) {
415 0         0 return $self->_report_error( $self->message( bad_spec => $1, $query ) );
416             }
417              
418             # check that we performed at least one query operation
419 121 100       229 unless ($ops) {
420 1         6 return $self->_report_error( $self->message( bad_query => $query ) );
421             }
422              
423 120 100       382 return wantarray ? @result : $self->_new_match_self(@result);
424             }
425              
426             # return elements stored from last query
427             sub get_elements {
428 328     328 1 362 my $self = shift;
429              
430 328 50       1390 return wantarray ? @{$self->{elements}} : $self->{elements};
  0         0  
431             }
432              
433             ###########################################################################################################
434             # from CSS spec at http://www.w3.org/TR/CSS21/cascade.html#specificity
435             ###########################################################################################################
436             # A selector's specificity is calculated as follows:
437             #
438             # * count the number of ID attributes in the selector (= a)
439             # * count the number of other attributes and pseudo-classes in the selector (= b)
440             # * count the number of element names in the selector (= c)
441             # * ignore pseudo-elements.
442             #
443             # Concatenating the three numbers a-b-c (in a number system with a large base) gives the specificity.
444             #
445             # Example(s):
446             #
447             # Some examples:
448             #
449             # * {} /* a=0 b=0 c=0 -> specificity = 0 */
450             # LI {} /* a=0 b=0 c=1 -> specificity = 1 */
451             # UL LI {} /* a=0 b=0 c=2 -> specificity = 2 */
452             # UL OL+LI {} /* a=0 b=0 c=3 -> specificity = 3 */
453             # H1 + *[REL=up]{} /* a=0 b=1 c=1 -> specificity = 11 */
454             # UL OL LI.red {} /* a=0 b=1 c=3 -> specificity = 13 */
455             # LI.red.level {} /* a=0 b=2 c=1 -> specificity = 21 */
456             # #x34y {} /* a=1 b=0 c=0 -> specificity = 100 */
457             ###########################################################################################################
458              
459             =pod
460              
461             =item get_specificity()
462              
463             Calculate the specificity for any given passed selector, a critical factor in determining how best to apply the cascade
464              
465             A selector's specificity is calculated as follows:
466              
467             * count the number of ID attributes in the selector (= a)
468             * count the number of other attributes and pseudo-classes in the selector (= b)
469             * count the number of element names in the selector (= c)
470             * ignore pseudo-elements.
471              
472             The specificity is based only on the form of the selector. In particular, a selector of the form "[id=p33]" is counted
473             as an attribute selector (a=0, b=1, c=0), even if the id attribute is defined as an "ID" in the source document's DTD.
474              
475             See the following spec for additional details:
476             L<http://www.w3.org/TR/CSS21/cascade.html#specificity>
477              
478             =back
479              
480             =cut
481              
482             sub get_specificity {
483 24     24 0 38 my ($self,$selector) = @_;
484              
485 24 50       70 unless (exists $self->{specificity}->{$selector}) {
486              
487             # if the invoking tree happened to be large this could get expensive real fast
488             # instead load up an empty instance and query that.
489 24         58 local $self->{elements} = [];
490 24         53 $self->query($selector);
491             }
492              
493 24         71 return $self->{specificity}->{$selector};
494             }
495              
496             sub suppress_errors {
497 11     11 0 228 my ($self, $setting) = @_;
498              
499 11 100       29 if (defined($setting)) {
500 2         6 $self->{suppress_errors} = $setting;
501             }
502              
503 11         31 return $self->{suppress_errors};
504             }
505              
506             sub get_error {
507 6     6 0 524 my ($self) = @_;
508              
509 6         24 return $self->{error};
510             }
511              
512             sub list {
513             # return list of items or return unblessed list ref of items
514 0 0   0 1 0 return wantarray ? @{ $_[0] } : [ @{ $_[0] } ];
  0         0  
  0         0  
515             }
516              
517             sub size {
518 98     98 1 8913 my $self = shift;
519 98         108 return scalar @{$self->get_elements};
  98         200  
520             }
521              
522             sub first {
523 6     6 1 8 my $self = shift;
524              
525 6 50       5 return @{$self->get_elements} ? $self->get_elements->[0] : $self->error_msg('is_empty');
  6         12  
526             }
527              
528             sub last {
529 4     4 1 716 my $self = shift;
530              
531 4 50       6 return @{$self->get_elements} ? $self->get_elements->[-1] : $self->error_msg('is_empty');
  4         7  
532             }
533              
534             ####################################################################
535             #
536             # Everything below here is a private method subject to change
537             #
538             ####################################################################
539              
540             sub _add_specificity {
541 127     127   201 my ($self, $selector, $specificity) = @_;
542              
543 127         353 $self->{specificity}->{$selector} = $specificity;
544              
545 127         193 return();
546             }
547              
548             sub _report_error {
549 7     7   900 my ($self, $message) = @_;
550              
551 7 100       18 if ($self->suppress_errors()) {
552 6 50       15 if (defined($message)) {
553 6         12 $self->{error} = $message;
554             }
555 6         32 return undef;
556             }
557             else {
558 1         3 $self->error($message); # this will DIE
559             }
560             }
561              
562             # this Just Works[tm] because first arg is HTML::Element object
563             sub _export_query_to_element {
564 2     2   558 class(ELEMENT)->load->method(
565             query => \&Query,
566             );
567             }
568              
569             # remove duplicate elements in the case where elements are nested between multiple matching elements
570             sub _dedup {
571 366     366   491 my ($self,$elements) = @_;
572              
573 366         528 my %seen = ();
574 366         425 my @unique = ();
575              
576 366         372 foreach my $item (@{$elements}) {
  366         667  
577 910 100       2239 if (!exists($seen{$item})) {
578 815         1011 push(@unique, $item);
579             }
580              
581 910         2162 $seen{$item}++;
582             }
583              
584 366         1517 return @unique;
585             }
586              
587             # utility method to assist in sorting of query return sets
588             sub _by_address
589             {
590 235     235   350 my $self = shift;
591              
592 235         669 my @a = split /\./, $a->address();
593 235         21960 my @b = split /\./, $b->address();
594              
595 235 100       17016 my $max = (scalar @a > scalar @b) ? scalar @a : scalar @b;
596              
597 235         726 for (my $index=0; $index<$max; $index++) {
598              
599 1071 50 66     3875 if (!defined($a[$index]) && !defined($b[$index])) {
    100          
    100          
600 0         0 return 0;
601             }
602             elsif (!defined($a[$index])) {
603 49         151 return -1;
604             }
605             elsif(!defined($b[$index])) {
606 26         80 return 1;
607             }
608              
609 996 100       1939 if ($a[$index] == $b[$index]) {
610 842         1892 next; #move to the next
611             }
612             else {
613 154         579 return $a[$index] <=> $b[$index];
614             }
615             }
616             }
617              
618             # instantiate an instance with match_self turned on, for use with
619             # follow-up queries, so they match the top-most elements.
620             sub _new_match_self {
621 116     116   147 my $self = shift;
622              
623 116         296 my $result = $self->new(@_);
624              
625 116         201 $result->{match_self} = 1;
626 116         611 return $result;
627             }
628              
629             sub AUTOLOAD {
630 74     74   296 my $self = shift;
631 74         547 my ($method) = ($AUTOLOAD =~ /([^:]+)$/ );
632 74 50       211 return if $method eq 'DESTROY';
633              
634             # we allow Perl to catch any unknown methods that the user might
635             # try to call against the HTML::Element objects in the query
636 156         5160 my @results =
637 74         149 map { $_->$method(@_) }
638 74         92 @{$self->get_elements};
639              
640 74 50       3871 return wantarray ? @results : \@results;
641             }
642              
643             1;
644              
645             =head1 NAME
646              
647             HTML::Query - jQuery-like selection queries for HTML::Element
648              
649             =head1 SYNOPSIS
650              
651             Creating an C<HTML::Query> object using the L<Query()> constructor
652             subroutine:
653              
654             use HTML::Query 'Query';
655              
656             # using named parameters
657             $q = Query( text => $text ); # HTML text
658             $q = Query( file => $file ); # HTML file
659             $q = Query( tree => $tree ); # HTML::Element object
660             $q = Query( query => $query ); # HTML::Query object
661             $q = Query(
662             text => $text1, # or any combination
663             text => $text2, # of the above
664             file => $file1,
665             file => $file2,
666             tree => $tree,
667             query => $query,
668             );
669              
670             # passing elements as positional arguments
671             $q = Query( $tree ); # HTML::Element object(s)
672             $q = Query( $tree1, $tree2, $tree3, ... );
673              
674             # or from one or more existing queries
675             $q = Query( $query1 ); # HTML::Query object(s)
676             $q = Query( $query1, $query2, $query3, ... );
677              
678             # or a mixture
679             $q = Query( $tree1, $query1, $tree2, $query2 );
680              
681             # the final argument (in all cases) can be a selector
682             my $spec = 'ul.menu li a'; # <ul class="menu">...<li>...<a>
683              
684             $q = Query( $tree, $spec );
685             $q = Query( $query, $spec );
686             $q = Query( $tree1, $tree2, $query1, $query2, $spec );
687             $q = Query( text => $text, $spec );
688             $q = Query( file => $file, $spec );
689             $q = Query( tree => $tree, $spec );
690             $q = Query( query => $query, $spec );
691             $q = Query(
692             text => $text,
693             file => $file,
694             # ...etc...
695             $spec
696             );
697              
698             Or using the OO L<new()> constructor method (which the L<Query()>
699             subroutine maps onto):
700              
701             use HTML::Query;
702              
703             $q = HTML::Query->new(
704             # accepts the same arguments as Query()
705             )
706              
707             Or by monkey-patching a L<query()> method into L<HTML::Element>.
708              
709             use HTML::Query 'query'; # note lower case 'q'
710             use HTML::TreeBuilder;
711              
712             # build a tree
713             my $tree = HTML::TreeBuilder->new;
714             $tree->parse_file($filename);
715              
716             # call the query() method on any element
717             my $query = $tree->query($spec);
718              
719             Once you have a query, you can start selecting elements:
720              
721             @r = $q->query('a')->get_elements(); # all <a>...</a> elements
722             @r = $q->query('a#menu')->get_elements(); # all <a> with "menu" id
723             @r = $q->query('#menu')->get_elements(); # all elements with "menu" id
724             @r = $q->query('a.menu')->get_elements(); # all <a> with "menu" class
725             @r = $q->query('.menu')->get_elements(); # all elements with "menu" class
726             @r = $q->query('a[href]')->get_elements(); # all <a> with 'href' attr
727             @r = $q->query('a[href=foo]')->get_elements(); # all <a> with 'href="foo"' attr
728              
729             # you can specify elements within elements...
730             @r = $q->query('ul.menu li a')->get_elements(); # <ul class="menu">...<li>...<a>
731              
732             # and use commas to delimit multiple path specs for different elements
733             @r = $q->query('table tr td a, form input[type=submit]')->get_elements();
734              
735             # query() in scalar context returns a new query
736             $r = $q->query('table'); # find all tables
737             $s = $r->query('tr'); # find all rows in all those tables
738             $t = $s->query('td'); # and all cells in those rows...
739              
740             Inspecting query elements:
741              
742             # get number of elements in query
743             my $size = $q->size;
744              
745             # get first/last element in query
746             my $first = $q->first;
747             my $last = $q->last;
748              
749             # convert query to list or list ref of HTML::Element objects
750             my $list = $q->list; # list ref in scalar context
751             my @list = $q->list; # list in list context
752              
753             All other methods are mapped onto the L<HTML::Element> objects
754             in the query:
755              
756             print $query->as_trimmed_text; # print trimmed text for each element
757             print $query->as_HTML; # print each element as HTML
758             $query->delete; # call delete() on each element
759              
760             =head1 DESCRIPTION
761              
762             The C<HTML::Query> module is an add-on for the L<HTML::Tree> module
763             set. It provides a simple way to select one or more elements from a tree using
764             a query syntax inspired by jQuery. This selector syntax will be reassuringly
765             familiar to anyone who has ever written a CSS selector.
766              
767             C<HTML::Query> is not an attempt to provide a complete (or even near-complete)
768             implementation of jQuery in Perl (see Ingy's L<pQuery> module for a
769             more ambitious attempt at that). Rather, it borrows some of the tried and
770             tested selector syntax from jQuery (and CSS) that can easily be mapped onto
771             the C<look_down()> method provided by the L<HTML::Element>
772             module.
773              
774             =head2 Creating a Query
775              
776             The easiest way to create a query is using the exportable L<Query>
777             subroutine.
778              
779             use HTML::Query 'Query'; # note capital 'Q'
780              
781             It accepts a C<text> or C<file> named parameter and will create an
782             C<HTML::Query> object from the HTML source text or file, respectively.
783              
784             my $query = Query( text => $text );
785             my $query = Query( file => $file );
786              
787             This delegates to L<HTML::TreeBuilder> to parse the
788             HTML into a tree of L<HTML::Element> objects. The root
789             element returned is then wrapped in an C<HTML::Query> object.
790              
791             If you already have one or more L<HTML::Element> objects that
792             you want to query then you can pass them to the L<Query> subroutine as
793             arguments. For example, you can explicitly use
794             L<HTML::TreeBuilder> to parse an HTML document into a tree:
795              
796             use HTML::TreeBuilder;
797             my $tree = HTML::TreeBuilder->new;
798             $tree->parse_file($filename);
799              
800             And then create an C<HTML::Query> object for the tree either using an
801             explicit C<tree> named parameter:
802              
803             my $query = Query( tree => $tree );
804              
805             Or implicitly using positional arguments.
806              
807             my $query = Query( $tree );
808              
809             If you want to query across multiple elements, then pass each one as a
810             positional argument.
811              
812             my $query = Query( $tree1, $tree2, $tree3 );
813              
814             You can also create a new query from one or more existing queries,
815              
816             my $query = Query( query => $query ); # named parameter
817             my $query = Query( $query1, $query2 ); # positional arguments.
818              
819             You can mix and match these different parameters and positional arguments
820             to create a query across several different sources.
821              
822             $q = Query(
823             text => $text1,
824             text => $text2,
825             file => $file1,
826             file => $file2,
827             tree => $tree,
828             query => $query,
829             );
830              
831             The L<Query> subroutine is a simple wrapper around the L<new()>
832             constructor method. You can instantiate your objects manually if you prefer.
833             The L<new()> method accepts the same arguments as for the L<Query>
834             subroutine (in fact, the L<Query> subroutine simply forwards all
835             arguments to the L<new()> method).
836              
837             use HTML::Query;
838              
839             my $query = HTML::Query->new(
840             # same argument format as for Query()
841             );
842              
843             A final way to use C<HTML::Query> is to have it add a L<query()> method
844             to L<HTML::Element>. The C<query> import hook (all lower
845             case) can be specified to make this so.
846              
847             use HTML::Query 'query'; # note lower case 'q'
848             use HTML::TreeBuilder;
849              
850             my $tree = HTML::TreeBuilder->new;
851             $tree->parse_file($filename);
852              
853             # now all HTML::Elements have a query() method
854             my @items = $tree->query('ul li')->get_elements(); # find all list items
855              
856             This approach, often referred to as I<monkey-patching>, should be used
857             carefully and sparingly. It involves a violation of
858             L<HTML::Element>'s namespace that could have unpredictable
859             results with a future version of the module (e.g. one which defines its own
860             C<query()> method that does something different). Treat it as something that
861             is great to get a quick job done right now, but probably not something to be
862             used in production code without careful consideration of the implications.
863              
864             =head2 Selecting Elements
865              
866             Having created an C<HTML::Query> object by one of the methods outlined above,
867             you can now fetch descendant elements in the tree using a simple query syntax.
868             For example, to fetch all the C<< <a> >> elements in the tree, you can
869             write:
870              
871             @links = $query->query('a')->get_elements();
872              
873             Or, if you want the elements that have a specific C<class> attribute defined
874             with a value of, say C<menu>, you can write:
875              
876             @links = $query->query('a.menu')->get_elements();
877              
878             More generally, you can look for the existence of any attribute and optionally
879             provide a specific value for it.
880              
881             @links = $query->query('a[href]')->get_elements(); # any href attribute
882             @links = $query->query('a[href=index.html]')->get_elements(); # specific value
883              
884             You can also find an element (or elements) by specifying an id.
885              
886             @links = $query->query('#menu')->get_elements(); # any element with id="menu"
887             @links = $query->query('ul#menu')->get_elements(); # ul element with id="menu"
888              
889             You can provide multiple selection criteria to find elements within elements
890             within elements, and so on. For example, to find all links in a menu,
891             you can write:
892              
893             # matches: <ul class="menu">...<li>...<a>
894             @links = $query->query('ul.menu li a')->get_elements();
895              
896             You can separate different criteria using commas. For example, to fetch all
897             table rows and C<span> elements with a C<foo> class:
898              
899             @elems = $query->query('table tr, span.foo')->get_elements();
900              
901             =head2 Query Results
902              
903             When called in list context, as shown in the examples above, the L<query()>
904             method returns a list of L<HTML::Element> objects matching the
905             search criteria. In scalar context, the L<query()> method returns a new
906             C<HTML::Query> object containing the L<HTML::Element> objects
907             found. You can then call the L<query()> method against that object to further
908             refine the query. The L<query()> method applies the selection to all elements
909             stored in the query.
910              
911             my $tables = $query->query('table'); # query for tables
912             my $rows = $tables->query('tr'); # requery for all rows in those tables
913             my $cells = $rows->query('td')->get_elements(); # return back all the cells in those rows
914              
915             =head2 Inspection Methods
916              
917             The L<size()> method returns the number of elements in the query. The
918             L<first()> and L<last()> methods return the first and last items in the
919             query, respectively.
920              
921             if ($query->size) {
922             print "from ", $query->first->as_trimmed_text, " to ", $query->last->as_trimmed_text;
923             }
924              
925             If you want to extract the L<HTML::Element> objects from the
926             query you can call the L<list()> method. This returns a list of
927             L<HTML::Element> objects in list context, or a reference to a
928             list in scalar context.
929              
930             @elems = $query->list;
931             $elems = $query->list;
932              
933             =head2 Element Methods
934              
935             Any other methods are automatically applied to each element in the list. For
936             example, to call the C<as_trimmed_text> method on all the
937             L<HTML::Element> objects in the query, you can write:
938              
939             print $query->as_trimmed_text;
940              
941             In list context, this method returns a list of the return values from
942             calling the method on each element. In scalar context it returns a
943             reference to a list of return values.
944              
945             @text_blocks = $query->as_trimmed_text;
946             $text_blocks = $query->as_trimmed_text;
947              
948             See L<HTML::Element> for further information on the methods it
949             provides.
950              
951             =head1 QUERY SYNTAX
952              
953             =head2 Basic Selectors
954              
955             =head3 element
956              
957             Matches all elements of a particular type.
958              
959             @elems = $query->query('table')->get_elements(); # <table>
960              
961             =head3 #id
962              
963             Matches all elements with a specific id attribute.
964              
965             @elems = $query->query('#menu')->get_elements(); # <anything id="menu">
966              
967             This can be combined with an element type:
968              
969             @elems = $query->query('ul#menu')->get_elements(); # <ul id="menu">
970              
971             =head3 .class
972              
973             Matches all elements with a specific class attribute.
974              
975             @elems = $query->query('.info')->get_elements(); # <anything class="info">
976              
977             This can be combined with an element type and/or element id:
978              
979             @elems = $query->query('p.info')->get_elements(); #

980             @elems = $query->query('p#foo.info')->get_elements(); #

981             @elems = $query->query('#foo.info')->get_elements(); #
982              
983             The selectors listed above can be combined in a whitespace delimited
984             sequence to select down through a hierarchy of elements. Consider the
985             following table:
986              
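987             <table class="search">
988               <tr class="result">
989                 <td class="value">WE WANT THIS ELEMENT</td>
990               </tr>
991               <tr class="result">
992                 <td class="value">AND THIS ONE</td>
993               </tr>
994               ...etc...
995             </table>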
996              
997             To locate the cells that we're interested in, we can write:
998              
999             @elems = $query->query('table.search tr.result td.value')->get_elements();
1000              
1001             =head2 Attribute Selectors
1002              
1003             The W3C CSS 2 specification defines new constructs through which to select
1004             based on specific attributes within elements. See the following link for the spec:
1005             L<http://www.w3.org/TR/css3-selectors/#attribute-selectors>
1006              
1007             =head3 [attr]
1008              
1009             Matches elements that have the specified attribute, including any where
1010             the attribute has no value.
1011              
1012             @elems = $query->query('[href]')->get_elements(); #
1013              
1014             This can be combined with any of the above selectors. For example:
1015              
1016             @elems = $query->query('a[href]')->get_elements(); #
1017             @elems = $query->query('a.menu[href]')->get_elements(); #
1018              
1019             You can specify multiple attribute selectors. Only those elements that
1020             match I<all> of them will be selected.
1021              
1022             @elems = $query->query('a[href][rel]')->get_elements(); #
1023              
1024             =head3 [attr=value]
1025              
1026             Matches elements that have an attribute set to a specific value. The
1027             value can be quoted in either single or double quotes, or left unquoted.
1028              
1029             @elems = $query->query('[href=index.html]')->get_elements();
1030             @elems = $query->query('[href="index.html"]')->get_elements();
1031             @elems = $query->query("[href='index.html']")->get_elements();
1032              
1033             You can specify multiple attribute selectors. Only those elements that
1034             match I<all> of them will be selected.
1035              
1036             @elems = $query->query('a[href=index.html][rel=home]')->get_elements();
1037              
1038             =head3 [attr|=value]
1039              
1040             Matches any element X whose foo attribute has a hyphen-separated list of
1041             values beginning (from the left) with bar. The value can be quoted in either
1042             single or double quotes, or left unquoted.
1043              
1044             @elems = $query->query('[lang|=en]')->get_elements();
1045             @elems = $query->query('p[class|="example"]')->get_elements();
1046             @elems = $query->query("img[alt|='fig']")->get_elements();
1047              
1048             You can specify multiple attribute selectors. Only those elements that
1049             match I<all> of them will be selected.
1050              
1051             @elems = $query->query('p[class|="external"][lang|="en"]')->get_elements();
1052              
1053             =head3 [attr~=value]
1054              
1055             Matches any element X whose foo attribute value is a list of space-separated
1056             values, one of which is exactly equal to bar. The value can be quoted in either
1057             single or double quotes, or left unquoted.
1058              
1059             @elems = $query->query('[lang~=en]')->get_elements();
1060             @elems = $query->query('p[class~="example"]')->get_elements();
1061             @elems = $query->query("img[alt~='fig']")->get_elements();
1062              
1063             You can specify multiple attribute selectors. Only those elements that
1064             match I<all> of them will be selected.
1065              
1066             @elems = $query->query('p[class~="external"][lang~="en"]')->get_elements();
1067              
1068             KNOWN BUG: you can't have a C<]> character in the attribute value because
1069             it confuses the query parser. Fixing this is TODO.
1070              
1071             =head2 Universal Selector
1072              
1073             W3C CSS 2 specification defines a new construct through which to select
1074             any element within the document below a given hierarchy.
1075              
1076             http://www.w3.org/TR/css3-selectors/#universal-selector
1077              
1078             @elems = $query->query('*')->get_elements();
1079              
1080             =head2 Combinator Selectors
1081              
1082             The W3C CSS 2 specification defines new constructs through which to select
1083             based on hierarchy within the DOM. See the following link for the spec:
1084             L<http://www.w3.org/TR/css3-selectors/#combinators>
1085              
1086             =head3 Immediate Descendents (children)
1087              
1088             When you combine selectors with whitespace elements are selected if
1089             they are descended from the parent in some way. But if you just want
1090             to select the children (and not the grandchildren, great-grandchildren,
1091             etc) then you can combine the selectors with the C<< > >> character.
1092              
1093             @elems = $query->query('a > img')->get_elements();
1094              
1095             =head3 Non-Immediate Descendents
1096              
1097             If you just want any descendents that aren't children then you can combine
1098             selectors with the C<*> character.
1099              
1100             @elems = $query->query('div * a')->get_elements();
1101              
1102             =head3 Immediate Siblings
1103              
1104             If you want to use a sibling relationship then you can join selectors
1105             with the C<+> character.
1106              
1107             @elems = $query->query('img + span')->get_elements();
1108              
1109             =head2 Pseudo-classes
1110              
1111             The W3C CSS 2 and CSS 3 specifications define new concepts of pseudo-classes to
1112             permit formatting based on information that lies outside the document tree.
1113             See the following link for the most recent spec:
1114             L<http://www.w3.org/TR/css3-selectors/#pseudo-classes>
1115              
1116             HTML::Query currently has limited support for CSS 2, and no support for CSS 3.
1117              
1118             Patches are *highly* encouraged to help add support here.
1119              
1120             =head3 -child pseudo-classes
1121              
1122             If you want to return child elements within a certain position then -child
1123             pseudo-classes (:first-child, :last-child) are what you're looking for.
1124              
1125             @elems = $query->query('table td:first-child')->get_elements;
1126              
1127             =head3 Link pseudo-classes: :link and :visited
1128              
1129             Unsupported.
1130              
1131             The :link pseudo-class is to be implemented, currently unsupported.
1132              
1133             It is not possible to locate :visited outside of a browser context due to its
1134             dynamic nature.
1135              
1136             =head3 Dynamic pseudo-classes
1137              
1138             Unsupported.
1139              
1140             It is not possible to locate these classes(:hover, :active, :focus) outside
1141             of a browser context due to their dynamic nature.
1142              
1143             =head3 Language pseudo-class
1144              
1145             Unsupported.
1146              
1147             Functionality for the :lang pseudo-class is largely replicated by using an
1148             attribute selector for lang combined with a universal selector query.
1149              
1150             If this is insufficient I'd love to see a patch adding support for it.
1151              
1152             =head3 Other pseudo-classes
1153              
1154             W3C CSS 3 added a number of new behaviors that need support. At
1155             this time there is no support for them, but we should work on adding support.
1156              
1157             Patches are very welcome.
1158              
1159             =head2 Pseudo-elements
1160              
1161             The W3C CSS 2 and CSS 3 specifications define new concepts of pseudo-elements to
1162             permit formatting based on information that lies outside the document tree.
1163             See the following link for the most recent spec:
1164             L<http://www.w3.org/TR/css3-selectors/#pseudo-elements>
1165              
1166             At this time there is no support for pseudo-elements, but we are working
1167             on adding support.
1168              
1169             Patches are very welcome.
1170              
1171             =head2 Combining Selectors
1172              
1173             You can combine basic and hierarchical selectors into a single query
1174             by separating each part with a comma. The query will select all matching
1175             elements for each of the comma-delimited selectors. For example, to
1176             find all C<a>, C<b> and C<i> elements in a tree:
1177              
1178             @elems = $query->query('a, b, i')->get_elements();
1179              
1180             Each of these selectors can be arbitrarily complex.
1181              
1182             @elems = $query->query(
1183             'table.search[width=100%] tr.result[valign=top] td.value,
1184             form.search input[type=submit],
1185             a[href=index.html]'
1186             )->get_elements();
1187              
1188             =head1 EXPORT HOOKS
1189              
1190             =head2 Query
1191              
1192             The C<Query> constructor subroutine (note the capital letter) can be
1193             exported as a convenient way to create C<HTML::Query> objects. It simply
1194             forwards all arguments to the L<new()> constructor method.
1195              
1196             use HTML::Query 'Query';
1197              
1198             my $query = Query( file => $file, 'ul.menu li a' );
1199              
1200             =head2 query
1201              
1202             The C<query> export hook can be called to monkey-patch a L<query()> method
1203             into the L<HTML::Element> module.
1204              
1205             This is considered questionable behaviour in polite society which regards it
1206             as a violation of the inner sanctity of the L<HTML::Element> namespace.
1207              
1208             But if you're the kind of person that doesn't mind a bit of occasional
1209             namespace abuse for the sake of getting the job done, then go right ahead.
1210             Just don't blame me if it all blows up later.
1211              
1212             use HTML::Query 'query'; # note lower case 'q'
1213             use HTML::TreeBuilder;
1214              
1215             # build a tree
1216             my $tree = HTML::TreeBuilder->new;
1217             $tree->parse_file($filename);
1218              
1219             # call the query() method on any element
1220             my $query = $tree->query('ul li a');
1221              
1222             =head1 METHODS
1223              
1224             The C<HTML::Query> object is a subclass of L<Badger::Base> and
1225             inherits all of its methods.
1226              
1227             =head2 new(@elements,$selector)
1228              
1229             This constructor method is used to create a new C<HTML::Query> object. It
1230             expects a list of any number (including zero) of
1231             L<HTML::Element> or C<HTML::Query> objects.
1232              
1233             # single HTML::Element object
1234             my $query = HTML::Query->new($elem);
1235              
1236             # multiple element object
1237             my $query = HTML::Query->new($elem1, $elem2, $elem3, ...);
1238              
1239             # copy elements from an existing query
1240             my $query = HTML::Query->new($another_query);
1241              
1242             # copy elements from several queries
1243             my $query = HTML::Query->new($query1, $query2, $query3);
1244              
1245             # or a mixture
1246             my $query = HTML::Query->new($elem1, $query1, $elem2, $query3);
1247              
1248             You can also use named parameters to specify an alternate source for an
1249             element.
1250              
1251             $query = HTML::Query->new( file => $file );
1252             $query = HTML::Query->new( text => $text );
1253              
1254             In this case, the L<HTML::TreeBuilder> module is used to
1255             parse the source file or text into a tree of L<HTML::Element>
1256             objects.
1257              
1258             For the sake of completeness, you can also specify element trees and queries
1259             using named parameters:
1260              
1261             $query = HTML::Query->new( tree => $tree );
1262             $query = HTML::Query->new( query => $query );
1263              
1264             You can freely mix and match elements, queries and named sources. The
1265             query will be constructed as an aggregate across them all.
1266              
1267             $q = HTML::Query->new(
1268             text => $text1,
1269             text => $text2,
1270             file => $file1,
1271             file => $file2,
1272             tree => $tree,
1273             query => $query1,
1274             );
1275              
1276             The final, optional argument can be a selector specification. This is
1277             immediately passed to the L<query()> method which will return a new query
1278             with only those elements selected.
1279              
1280             my $spec = 'ul.menu li a'; #
1281              
1282             my $query = HTML::Query->new( $tree, $spec );
1283             my $query = HTML::Query->new( text => $text, $spec );
1284             my $query = HTML::Query->new(
1285             text => $text,
1286             file => $file,
1287             $spec
1288             );
1289              
1290             The list of arguments can also be passed by reference to a list.
1291              
1292             my $query = HTML::Query->new(\@args);
1293              
1294             =head2 query($spec)
1295              
1296             This method locates the descendant elements identified by the C<$spec>
1297             argument for each element in the query. It then internally stores the results
1298             for requerying or return. See L<get_elements()>.
1299              
1300             my $query = HTML::Query->new(\@args);
1301             my $results = $query->query($spec);
1302              
1303             See L<"QUERY SYNTAX"> for the permitted syntax of the C<$spec> argument.
1304              
1305             =head2 get_elements()
1306              
1307             This method returns the stored results from a query. In list context it returns a list of
1308             matching L<HTML::Element> objects. In scalar context it returns a reference to
1309             the results array.
1310              
1311             my $query = HTML::Query->new(\@args);
1312             my $results = $query->query($spec);
1313              
1314             my @elements = $results->query($spec)->get_elements();
1315             my $elements = $results->query($spec)->get_elements();
1316              
1317             =head2 size()
1318              
1319             Returns the number of elements in the query.
1320              
1321             =head2 first()
1322              
1323             Returns the first element in the query.
1324              
1325             my $elem = $query->first;
1326              
1327             If the query is empty then an exception will be thrown. If you would rather
1328             have an undefined value returned then you can use the C<try> method inherited
1329             from L<Badger::Base>. This effectively wraps the call to
1330             C<first()> in an C<eval> block to catch any exceptions thrown.
1331              
1332             my $elem = $query->try('first') || warn "no first element\n";
1333              
1334             =head2 last()
1335              
1336             Similar to L<first()>, but returning the last element in the query.
1337              
1338             my $elem = $query->last;
1339              
1340             =head2 list()
1341              
1342             Returns a list of the L<HTML::Element> objects in the query in
1343             list context, or a reference to a list in scalar context.
1344              
1345             my @elems = $query->list;
1346             my $elems = $query->list;
1347              
1348             =head2 AUTOLOAD
1349              
1350             The C<AUTOLOAD> method maps any other method calls to the
1351             L<HTML::Element> objects in the list. When called in list
1352             context it returns a list of the values returned from calling the method on
1353             each element. In scalar context it returns a reference to a list of return
1354             values.
1355              
1356             my @text_blocks = $query->as_trimmed_text;
1357             my $text_blocks = $query->as_trimmed_text;
1358              
1359             =head1 KNOWN BUGS
1360              
1361             =head2 Attribute Values
1362              
1363             It is not possible to use C<]> in an attribute value. This is due to a
1364             limitation in the parser which will be fixed RSN.
1365              
1366             =head1 AUTHOR
1367              
1368             Andy Wardley L<http://wardley.org>
1369              
1370             =head1 MAINTAINER
1371              
1372             Kevin Kamel
1373              
1374             =head1 CONTRIBUTORS
1375              
1376             Vivek Khera
1377             Michael Peters
1378             David Gray
1379              
1380             =head1 COPYRIGHT
1381              
1382             Copyright (C) 2010 Andy Wardley. All Rights Reserved.
1383              
1384             This module is free software; you can redistribute it and/or modify it
1385             under the same terms as Perl itself.
1386              
1387             =head1 SEE ALSO
1388              
1389             L<HTML::Tree>, L<HTML::Element>,
1390             L<HTML::TreeBuilder>, L<pQuery>, L<http://jQuery.com/>
1391              
1392             =cut
1393              
1394             # Local Variables:
1395             # mode: Perl
1396             # perl-indent-level: 4
1397             # indent-tabs-mode: nil
1398             # End:
1399             #
1400             # vim: expandtab shiftwidth=4: