File Coverage

blib/lib/WWW/Mechanize/Plugin/DOM.pm
Criterion Covered Total %
statement 42 213 19.7
branch 0 66 0.0
condition 0 15 0.0
subroutine 14 46 30.4
pod 5 8 62.5
total 61 348 17.5


line stmt bran cond sub pod time code
1             package WWW::Mechanize::Plugin::DOM;
2              
3             # DOM is in a separate module from JavaScript because other scripting
4             # languages may use DOM as well. Anyone have time to implement Acme::Chef
5             # bindings for Mech? :-)
6              
7             $VERSION = '0.014';
8              
9 2     2   1703 use 5.006;
  2         8  
  2         87  
10              
11 2     2   14 use strict;
  2         3  
  2         71  
12 2     2   22 use warnings; no warnings qw 'utf8 parenthesis bareword';
  2     2   4  
  2         84  
  2         9  
  2         4  
  2         108  
13              
14 2     2   2100 use Encode qw'encode decode';
  2         25329  
  2         178  
15 2     2   1688 use Hash::Util::FieldHash::Compat 'fieldhash';
  2         5171  
  2         15  
16 2     2   2273 use HTML::DOM 0.021;
  2         287520  
  2         141  
17 2     2   1762 use HTTP::Headers::Util 'split_header_words';
  2         1749  
  2         144  
18 2     2   16 use Scalar::Util 'weaken';
  2         4  
  2         1465  
19 2     2   21 no URI();
  2         5  
  2         34  
20 2     2   2622 no WWW::Mechanize ();
  2         338870  
  2         58  
21 2     2   2059 no WWW::Mechanize::Plugin::DOM::Window ();
  2         10  
  2         4539  
22              
23             fieldhash my %parathia; # keyed by mech
24             fieldhash my %mech_per_frame; # keyed by (i)frame element
25              
26             sub init { # expected to return a plugin object that the mech object will
27             # use to communicate with the plugin.
28              
29 0     0 0   my ($package, $mech) = @_;
30              
31 0           my $self = bless {
32             script_handlers => {},
33             event_attr_handlers => {},
34             s => 1, # scriptable
35             mech => $mech,
36             }, $package;
37 0           weaken $self->{mech};
38              
39 0           $mech->set_my_handler(
40             parse_html => \&_parse_html
41             );
42             $mech->set_my_handler( get_content =>
43             sub {
44 0     0     shift;
45 0           my $mech = shift;
46 0 0         $mech->is_html or return;
47 0           my $stuff = (my $self = $mech->plugin('DOM'))
48             ->tree->innerHTML;
49 0 0         defined $$self{charset} ? encode $$self{charset}, $stuff :
50             $stuff;
51             }
52 0           );
53             $mech->set_my_handler( get_text_content =>
54             sub {
55 0     0     shift;
56 0           my $mech = shift;
57 0 0         $mech->is_html or return;
58 0           my $stuff = (my $self = $mech->plugin('DOM'))
59             ->tree->documentElement->as_text;
60 0 0         defined $$self{charset} ? encode $$self{charset}, $stuff :
61             $stuff;
62             }
63 0           );
64             $mech->set_my_handler( extract_forms =>
65             sub {
66 0     0     shift;
67 0           shift->plugin('DOM')->tree->forms
68             }
69 0           );
70             $mech->set_my_handler( extract_links => sub {
71 0     0     shift;
72 0           tie my @links, WWW'Mechanize'Plugin'DOM'Links:: =>
73             scalar shift->plugin('DOM')->tree->links
74 0           ;\@links;
75 0           });
76             $mech->set_my_handler( extract_images => sub {
77 0     0     shift;
78 0           my $doc = shift->plugin('DOM')->tree;
79             my $list = HTML::DOM::NodeList::Magic->new(
80 0           sub { grep tag $_ =~ /^i(?:mg|nput)\z/,
81             $doc->descendants },
82 0           $doc
83             );
84              
85 0           tie my @images, WWW'Mechanize'Plugin'DOM'Images:: => $list;
86 0           ;\@images;
87 0           });
88              
89 0           $self;
90             }
91              
92             sub _parse_html {
93 0     0     my (undef,$mech,undef,$src) = @_;
94 0           weaken $mech;
95 0           my $self = $mech->plugin('DOM');
96 0           weaken $self;
97              
98 0           $$self{tree} = my $tree = new HTML::DOM
99             response => $mech->response,
100             cookie_jar => $mech->cookie_jar;
101              
102 0     0     $tree->error_handler(sub{$mech->warn($@)});
  0            
103              
104             $tree->default_event_handler_for( link => sub {
105 0     0     $mech->get(shift->target->href)
106 0           });
107             $tree->default_event_handler_for( submit => sub {
108 0     0     $mech->request(shift->target->make_request);
109 0           });
110              
111 0 0 0       if(%{$$self{script_handlers}} || %{$$self{event_attr_handlers}}) {
  0            
  0            
112 0           my $script_type = $mech->response->header(
113             'Content-Script-Type');
114             defined $script_type or $tree->elem_handler(meta =>
115             sub {
116 0     0     my($tree, $elem) = @_;
117 0 0         return unless lc $elem->attr('http-equiv')
118             eq 'content-script-type';
119 0           $script_type = $elem->attr('content');
120 0 0         });
121              
122 0 0         if(%{$$self{script_handlers}}) {
  0            
123             $tree->elem_handler(script => sub {
124 0 0   0     return unless $self->{s};
125 0           my($tree, $elem) = @_;
126              
127 0           my $lang = $elem->attr('type');
128 0 0         defined $lang
129             or $lang = $elem->attr('language');
130 0 0         defined $lang or $lang = $script_type;
131              
132 0           my $uri;
133 0           my($inline, $code, $line) = 0;
134 0 0         if($uri = $elem->attr('src')) {
135 0           my $clone = $mech->clone->clear_history(1);
136 0           my $base = $mech->base;
137 0 0         $uri = URI->new_abs( $uri, $base )
138             if $base;
139 0           my $res = $clone->get($uri);
140 0 0         $res->is_success or
141             $mech->warn("couldn't get script $uri: "
142             . $res->status_line
143             );
144              
145             # Find out the encoding:
146 0           my $cs = {
147             map @$_,
148             split_header_words $res->header(
149             'Content-Type'
150             )
151             }->{charset};
152              
153 0   0       $code = decode $cs||$elem->charset
154             ||$tree->charset||'latin1',
155             $res->decoded_content(charset=>'none');
156            
157            
158 0           $line = 1;
159             }
160             else {
161 0           $code = $elem->firstChild->data;
162 0           ++$inline;
163 0           $uri = $mech->uri;
164 0           $line = _line_no(
165             $src,$elem->content_offset
166             );
167             };
168            
169             SCRIPT_HANDLER: {
170 0 0         if(defined $lang) {
  0            
171 0           while(my($lang_re,$handler) = each
  0            
172             %{$$self{script_handlers}}) {
173 0 0         next if $lang_re eq 'default';
174 0           $lang =~ $lang_re and
175             &$handler($mech, $tree, $code,
176             $uri, $line, $inline),
177             # reset iterator:
178 0 0         keys %{$$self{script_handlers}},
179             last SCRIPT_HANDLER;
180             }} # end of if-while
181 0 0         &{ $$self{script_handlers}{default} ||
  0            
182             return }($mech,$tree, $code,
183             $uri, $line, $inline);
184             } # end of S_H
185 0           });
186              
187             $tree->elem_handler(noscript => sub {
188 0 0   0     return unless $self->{s};
189 0           $_[1]->detach#->delete;
190             # ~~~ delete currently stops it from work-
191             # ing; I need to looook into this.
192 0           });
193             }
194              
195 0 0         if(%{$$self{event_attr_handlers}}) {
  0            
196             $tree->event_attr_handler(sub {
197 0 0   0     return unless $self->{s};
198 0           my($elem, $event, $code, $offset) = @_;
199 0           my $lang = $elem->attr('language');
200 0 0         defined $lang or $lang = $script_type;
201              
202 0           my $uri = $mech->uri;
203 0 0         my $line = defined $offset ? _line_no(
204             $src, $offset
205             ) : undef;
206              
207             HANDLER: {
208 0 0         if(defined $lang) {
  0            
209 0           while(my($lang_re,$handler) = each
  0            
210             %{$$self{event_attr_handlers}}) {
211 0 0         next if $lang_re eq 'default';
212 0           $lang =~ $lang_re and
213             &$handler($mech, $elem,
214             $event,$code,$uri,$line),
215             # reset the hash iterator:
216             keys
217 0 0         %{$$self{event_attr_handlers}},
218             last HANDLER;
219             }} # end of if-while
220 0 0         &{ $$self{event_attr_handlers}{default} ||
  0            
221             return }(
222             $mech,$elem,$event,$code,$uri,$line
223             );
224             } # end of HANDLER
225 0           });
226             }
227             }
228             # ~~~ Should we use the content of
229             # handler is provided but an event attribute handler *is*
230             # provided? (Now who would be crazy enough to do that?)
231             $tree->elem_handler(noscript => sub {
232 0 0 0 0     return if $self->{s} && %{$$self{script_handlers}};
  0            
233 0           $_[1]->replace_with_content->delete;
234             # ~~~ why does this need delete?
235 0           });
236              
237 0           $tree->defaultView(
238             my $view = $self->window
239             );
240 0           $tree->event_parent($view);
241 0           $view->document($tree);
242 0           $tree->set_location_object($view->location);
243              
244             $tree->elem_handler(iframe => my $frame_handler = sub {
245 0     0     my ($doc,$elem) = @_;
246 0           my $m = $mech->clone->clear_history(1);
247             # We have to have this extra reference, or the mech object
248             # won’t have any strong refs at all:
249 0           $mech_per_frame{$elem} = $m;
250 0           $elem->contentWindow(my $subwin=$m->plugin("DOM")->window);
251 0           $subwin->_set_parent($doc->defaultView);
252 0 0         defined(my $src = $elem->src) or return;
253 0           $m->get(new_abs URI $src, $mech->base);
254 0           });
255 0           $tree->elem_handler(frame => $frame_handler);
256              
257             # Find out the encoding:
258 0           $$self{charset} = my $cs = {
259             map @$_,
260             split_header_words $mech->response->header('Content-Type')
261             }->{charset};
262 0   0       $tree->charset($cs||'iso-8859-1');
263              
264 0 0         $tree->write(defined $cs ? decode $cs, $src : $src);
265 0           $tree->close;
266              
267 0           $tree->body->trigger_event('load');
268             # ~~~ Problem: Ever since JavaScript 1.0000000, the
269             # (un)load events on the body attribute have associated event
270             # handlers with the Window object. But the DOM 2 Events spec
271             # doesn’t provide for events on the window (view) at all; only
272             # on Nodes. The load event is supposed to be triggered on the
273             # document. In HTML 5 (10 June 2008 draft), what we are doing
274             # here is correct. In
275             # Safari & FF 3, the body element’s attributes create event
276             # handlers on the window, which are called with the document as
277             # the event’s target.
278              
279 0           return 1;
280             }
281              
282             sub _line_no {
283 0     0     my ($src,$offset) = @_;
284 0           return 1 + (() =
285             substr($src,0,$offset)
286             =~ /\cm\cj?|[\cj\x{2028}\x{2029}]/g
287             );
288             }
289              
290             sub options {
291 0     0 0   my($self,%opts) = @_;
292 0           for (keys %opts) {
293 0 0         if($_ eq 'script_handlers') {
    0          
294 0           %{$$self{script_handlers}} = (
  0            
295 0           %{$$self{script_handlers}}, %{$opts{$_}}
  0            
296             );
297             }
298             elsif($_ eq 'event_attr_handlers') {
299 0           %{$$self{event_attr_handlers}} = (
  0            
300 0           %{$$self{event_attr_handlers}},
301 0           %{$opts{$_}}
302             );
303             }
304             else {
305 0           require Carp;
306 0           Carp::croak(
307             "$_ is not a valid option for the DOM plugin"
308             );
309             }
310             }
311             }
312              
313             sub clone {
314 0     0 0   my $self = shift;
315 0           my $other = bless { map +($_=>$$self{$_}), qw[
316             script_handlers event_attr_handlers s
317             ]}, ref $self;
318 0           weaken($other->{mech} = shift);
319 0           $other;
320             }
321              
322 0     0 1   sub tree { $_[0]{tree} }
323             sub window {
324 0   0 0 1   $parathia{$_[0]{mech}} ||=
325             new WWW'Mechanize'Plugin'DOM'Window $_[0]{mech};
326             }
327              
328             sub scripts_enabled {
329 0     0 1   my $old = (my $self = shift)->{s};
330 0 0         if(@_) {{
  0            
331 0           $self->{s} = $_[0];
332 0   0       ($self->{tree} ||last) ->event_listeners_enabled(shift) ;
333             }}
334             $old
335 0           }
336              
337             sub check_timers {
338             # ~~~ temporary hack
339 0     0 1   shift->window->_check_timeouts;
340             }
341              
342             sub count_timers {
343             # ~~~ temporary hack
344 0     0 1   shift->window->_count_timers;
345             }
346              
347              
348             package WWW::Mechanize::Plugin::DOM::Links;
349              
350             our$ VERSION = '0.014';
351              
352 2     2   2127 use WWW::Mechanize::Link;
  2         764  
  2         427  
353              
354             sub TIEARRAY {
355 0     0     bless \(my $links = pop), shift;
356             }
357              
358             sub FETCH {
359 0     0     my $link = ${$_[0]}->[$_[1]];
  0            
360 0           return new WWW'Mechanize'Link::{
361             url => $link->attr('href'),
362             text => $link->as_text,
363             name => $link->attr('name'),
364             tag => $link->tag,
365             base => $link->ownerDocument->base,
366             attrs => {$link->all_external_attr},
367             }
368             }
369 0     0     sub FETCHSIZE { scalar @${$_[0]} }
  0            
370 0     0     sub EXISTS { exists ${$_[0]}->links->[$_[1]] }
  0            
371              
372              
373             package WWW::Mechanize::Plugin::DOM::Images;
374              
375             our$ VERSION = '0.014';
376              
377 2     2   1924 use WWW::Mechanize::Image;
  2         842  
  2         558  
378              
379             sub TIEARRAY {
380 0     0     bless \(my $links = pop), shift;
381             }
382              
383             sub FETCH {
384 0     0     my $img = ${$_[0]}->[$_[1]];
  0            
385 0           return new WWW'Mechanize'Image::{
386             url => $img->attr('src'),
387             name => $img->attr('name'),
388             tag => $img->tag,
389             base => $img->ownerDocument->base,
390             height => $img->attr('height'),
391             width => $img->attr('width'),
392             alt => $img->attr('alt'),
393             }
394             }
395 0     0     sub FETCHSIZE { scalar @${$_[0]} }
  0            
396 0     0     sub EXISTS { exists ${$_[0]}->links->[$_[1]] }
  0            
397              
398              
399             =head1 NAME
400              
401             WWW::Mechanize::Plugin::DOM - HTML Document Object Model plugin for Mech
402              
403             =head1 VERSION
404              
405             0.014 (alpha)
406              
407             THIS MODULE IS DEPRECATED. Please use L instead.
408              
409             =head1 SYNOPSIS
410              
411             use WWW::Mechanize;
412              
413             my $m = new WWW::Mechanize;
414              
415             $m->use_plugin('DOM',
416             script_handlers => {
417             default => \&script_handler,
418             qr/(?:^|\/)(?:x-)?javascript/ => \&script_handler,
419             },
420             event_attr_handlers => {
421             default => \&event_attr_handler,
422             qr/(?:^|\/)(?:x-)?javascript/ => \&event_attr_handler,
423             },
424             );
425              
426             sub script_handler {
427             my($mech, $dom_tree, $code, $url, $line, $is_inline) = @_;
428             # ... code to run the script ...
429             }
430              
431             sub event_attr_handler {
432             my($mech, $elem, $event_name, $code, $url, $line) = @_;
433             # ... code that returns a coderef ...
434             }
435              
436             $m->plugin('DOM')->tree; # DOM tree for the current page
437             $m->plugin('DOM')->window; # Window object
438              
439             =head1 DESCRIPTION
440              
441             This is a plugin for L that provides support for the HTML
442             Document Object Model. This is a part of the
443             L distribution, but it can be used on
444             its own.
445              
446             =head1 USAGE
447              
448             To enable this plugin, use Mech's C method, as shown in the
449             synopsis.
450              
451             To access the DOM tree, use C<< $mech->plugin('DOM')->tree >>, which
452             returns an HTML::DOM object.
453              
454             You may provide a subroutine that runs an inline script like this:
455              
456             $mech->use_plugin('DOM',
457             script_handlers => {
458             qr/.../ => sub { ... },
459             qr/.../ => sub { ... },
460             # etc
461             }
462             );
463              
464             And a subroutine for turning HTML event attributes into subroutines, like
465             this:
466              
467             $mech->use_plugin('DOM',
468             event_attr_handlers => {
469             qr/.../ => sub { ... },
470             qr/.../ => sub { ... },
471             # etc
472             }
473             );
474              
475             In both cases, the C should be a regular expression that matches
476             the scripting language to which the handler applies, or the string
477             'default'. The scripting language will be either a MIME type or the
478             contents of the C attribute if a script element's C
479             attribute is not present. The subroutine specified as the 'default' will be
480             used if there is no handler for the scripting language in question or if
481             there is no Content-Script-Type header and, for
482             C, the script element has no
483             'type' or 'language' attribute.
484              
485             Each time you move to another page with WWW::Mechanize, a different copy
486             of the DOM plugin object is created. So, if you must refer to it in a
487             callback
488             routine, don't use a closure, but get it from the C<$mech> object that is
489             passed as the first argument.
490              
491             =head1 METHODS
492              
493             This is the usual boring list of methods. Those that are described above
494             are listed here without descriptions.
495              
496             =item window
497              
498             This returns the window object.
499              
500             =item tree
501              
502             This returns the DOM tree (aka the document object).
503              
504             =item check_timers
505              
506             This evaluates the code associated with each timeout registered with
507             the window's C function,
508             if the appropriate interval has elapsed.
509              
510             =item count_timers
511              
512             This returns the number of timers currently registered.
513              
514             =item scripts_enabled ( $new_val )
515              
516             This returns a boolean indicating whether scripts are enabled. It is true
517             by default. You can disable scripts by passing a false value. When you
518             disable scripts, event handlers are also disabled, as is the registration
519             of event handlers by HTML event attributes.
520              
521             =head1 THE 'LOAD' EVENT
522              
523             Currently the (on)load event is triggered when the page finishes parsing.
524             This plugin assumes that you're not going to be loading any images, etc.
525              
526             =head1 THE C<%Interface> HASH
527              
528             If you are creating your own script binding, you'll probably want to access
529             the hash named C<%WWW::Mechanize::Plugin::DOM::Interface>, which lists, in
530             a machine-readable format, the interface members of the location and
531             navigator objects. It follows the same format as
532             L<%HTML::DOM::Interface|HTML::DOM::Interface>.
533              
534             See also L HASH> for
535             a list of members of the window object.
536              
537             =head1 PREREQUISITES
538              
539             L 0.021 or later
540              
541             L
542              
543             The current stable release of L does not support plugins.
544             See
545             L for more info.
546              
547             L
548              
549             L
550              
551             =head1 BUGS
552              
553             =over 4
554              
555             =item *
556              
557             The onunload event is not yet supported.
558              
559             =item *
560              
561             The location object's C method does not currently work correctly
562             if the current page is the first page. In that case it acts like an
563             assignment to C.
564              
565             =item *
566              
567             The window object's C property does not currently get updated
568             when you go back.
569              
570             =item *
571              
572             It does not hook into L's C feature to
573             run event handlers.
574              
575             =item *
576              
577             There is no support for XHTML.
578              
579             =item *
580              
581             The 'about:blank' URL is not yet supported.
582              
583             =item *
584              
585             If you try to get any of the attributes of the location object (or
586             stringify the loc object) when no browsing has happened yet, you'll get an
587             error. (This should return 'about:blank'.)
588              
589             =item *
590              
591             Fetching a URL that differs from the current page's only by the fragment
592             currently creates a brand new DOM object and scripting environment.
593              
594             =item *
595              
596             There is nothing to prevent infinite recursion when frames have circular
597             references.
598              
599             =back
600              
601             =head1 AUTHOR & COPYRIGHT
602              
603             Copyright (C) 2007-8 Father Chrysostomos
604             join '.', reverse org => 'cpan' >>E
605              
606             This program is free software; you may redistribute it and/or modify
607             it under the same terms as perl.
608              
609             =head1 SEE ALSO
610              
611             L
612              
613             L
614              
615             L
616              
617             L
618              
619             L