File Coverage

lib/WWW/Automate.pm
Criterion Covered Total %
statement 122 178 68.5
branch 15 28 53.5
condition 2 4 50.0
subroutine 19 23 82.6
pod 13 14 92.8
total 171 247 69.2


line stmt bran cond sub pod time code
1             #!/usr/bin/perl -w
2             #
3             # WWW::Automate (c) 2002 Kirrily Robert
4             # This software is distributed under the same licenses as Perl; see
5             # the file COPYING for details.
6              
7             #
8             # $Id: Automate.pm,v 1.1.1.1 2005/12/01 03:07:33 chezskud Exp $
9             #
10              
11             package WWW::Automate;
12              
13 1     1   975 use HTTP::Request;
  1         23231  
  1         43  
14 1     1   1090 use LWP::UserAgent;
  1         22771  
  1         34  
15 1     1   1035 use HTML::Form;
  1         19006  
  1         73  
16 1     1   1018 use HTML::TokeParser;
  1         12280  
  1         36  
17 1     1   683 use Clone qw(clone);
  1         15732  
  1         99  
18 1     1   12 use Carp;
  1         3  
  1         1593  
19              
20             our @ISA = qw( LWP::UserAgent );
21              
22             my $VERSION = $VERSION = "0.21";
23              
24             my $headers;
25              
26             =pod
27              
28             =head1 NAME
29              
30             WWW::Automate - automate interaction with websites
31              
32             =head1 NOTICE
33              
34             B
35              
36             Please use WWW::Mechanize instead.
37              
38             =head1 SYNOPSIS
39              
40             use WWW::Automate;
41             my $agent = WWW::Automate->new();
42              
43             $agent->get($url);
44              
45             $agent->follow($link);
46              
47             $agent->form($number);
48             $agent->field($name, $value);
49             $agent->click($button);
50              
51             $agent->back();
52              
53             $agent->add_header($name => $value);
54              
55             print "OK" if $agent->{content} =~ /$expected/;
56              
57             =head1 DESCRIPTION
58              
59             This module is intended to help you automate interaction with a website.
60             It bears a not-very-remarkable outwards resemblance to WWW::Chat, on
61             which it is based. The main difference between this module and
62             WWW::Chat is that WWW::Chat requires a pre-processing stage before you
63             can run your script, whereas WWW::Automate does not.
64              
65             WWW::Automate is a subclass of LWP::UserAgent, so anything you can do
66             with an LWP::UserAgent, you can also do with this. See
67             L for more information on the possibilities.
68              
69             =head2 new()
70              
71             Creates and returns a new WWW::Automate object, hereafter referred to as
72             the 'agent'.
73 1         41  
  1         5  
74 1     1   2 my $agent = WWW::Automate->new()
  1         21327  
  1         598  
75 1     1   6  
  1         312  
  1         4  
  1         3  
  1         22  
76 1     1   128 =begin testing
  1         2  
  1         2605  
77              
78             BEGIN: {
79 1         555 use lib qw(lib/);
80 1         333 use_ok('WWW::Automate');
81 1         451 use vars qw($agent);
82 1         395 }
83 1         516  
84 1         383 ok(WWW::Automate->can('new'), "can we call new?");
85             ok($agent = WWW::Automate->new(), "create agent object");
86             isa_ok($agent, 'WWW::Automate', "agent is a WWW::Automate");
87             can_ok($agent, 'request'); # as a subclass of LWP::UserAgent
88             like($agent->agent(), qr/WWW-Automate/, "Set user agent string");
89             like($agent->agent(), qr/$WWW::Automate::VERSION/, "Set user agent version");
90              
91             =end testing
92              
93             =cut
94              
95             our $base = "http://localhost/";
96              
97             sub new {
98 3     3 1 5 shift;
99 3         20 warn "WWW::Automate is no longer maintained. Please use WWW::Mechanize instead.\n";
100 3         28 my $self = { page_stack => [] };
101 3         5 bless $self;
102 3         25 $self->agent("WWW-Automate-$VERSION");
103 3         988 $self->env_proxy();
104 3         6906 return $self;
105             }
106              
107             =head2 $agent->get($url)
108              
109             Given a URL/URI, fetches it.
110              
111             The results are stored internally in the agent object, as follows:
112              
113             uri The current URI
114             req The current request object [HTTP::Request]
115             res The response received [HTTP::Response]
116             status The status code of the response
117             ct The content type of the response
118             base The base URI for current response
119             content The content of the response
120             forms Array of forms found in content [HTML::Form]
121             form Current form [HTML::Form]
122 1         384 links Array of links found in content
  1         10  
123 1         412  
124 1         422 You can get at them with, for example: $agent->{content}
125              
126             =begin testing
127              
128             ok($agent->get("http://google.com"), "Get google webpage");
129             isa_ok($agent->{uri}, "URI", "Set uri");
130             isa_ok($agent->{req}, 'HTTP::Request', "req should be a HTTP::Request");
131              
132             =end testing
133              
134             =cut
135              
136             sub get {
137 3     3 1 9 my ($self, $uri) = @_;
138 3         25 $self->{uri} = URI->new_abs($uri, $base);
139 3         10699 $self->{req} = HTTP::Request->new(GET => $uri);
140 3         414 $self->do_request();
141             }
142              
143             =head2 $agent->follow($string|$num)
144              
145 1         351 Follow a link. If you provide a string, the first link whose text
  1         7  
146 1         319 matches that string will be followed. If you provide a number, it will
147 1         665 be the nth link on the page.
148              
149 1         200 =begin testing
150 1         202  
151 1         406 ok(! $agent->follow(99999), "Can't follow too-high-numbered link");
152             ok($agent->follow(1), "Can follow first link");
153             ok($agent->back(), "Can go back");
154              
155             ok(! $agent->follow(qr/asdfghjksdfghj/), "Can't follow unlikely named link");
156             ok($agent->follow("Search"), "Can follow obvious named link");
157             $agent->back();
158              
159             =end testing
160              
161             =cut
162              
163             sub follow {
164 4     4 1 10 my ($self, $link) = @_;
165 4         5 my @links = @{$self->{links}};
  4         11  
166 4         6 my $thislink;
167 4 100       11 if (isnumber($link)) {
168 2 50       7 if ($link <= $#links) {
169 0         0 $thislink = $links[$link];
170             } else {
171 2         18 warn "Link number $link is greater than maximum link $#links ",
172             "on this page ($self->{uri})\n";
173 2         51 return undef;
174             }
175             } else { # user provided a regexp
176 2         5 LINK: foreach my $l (@links) {
177 2 50       20 if ($l->[1] =~ /$link/) {
178 0         0 $thislink = $l; # grab first match
179 0         0 last LINK;
180             }
181             }
182 2 50       7 unless ($thislink) {
183 2         9 warn "Can't find any link matching $link on this page ",
184             "($self->{uri})\n";
185 2         33 return undef;
186             }
187             }
188              
189 0         0 $thislink = $thislink->[0]; # we just want the URL, not the text
190              
191 0         0 $self->push_page_stack();
192             #print STDERR "thislink is $thislink, base is $self->{base}";
193 0         0 $self->{uri} = URI->new_abs($thislink, $self->{base});
194 0         0 $self->{req} = HTTP::Request->new(GET => $self->{uri});
195 0         0 $self->do_request();
196              
197 0         0 return 1;
198             }
199              
200             =head2 $agent->form($number)
201              
202             Selects the Nth form on the page as the target for subsequent calls to
203 1         2 field() and click(). Emits a warning and returns false if there is no
  1         6  
204 1         10 such form. Forms are indexed from 1, that is to say, the first form is
205 1         14 number 1 (not zero).
206 1         666  
207 1         263 =begin testing
208 1         284  
209             my $t = WWW::Automate->new();
210             $t->get("http://google.com");
211             ok($t->form(1), "Can select the first form");
212             is($t->{form}, $t->{forms}->[0], "Set the form attribute");
213             ok(! $t->form(99), "Can't select the 99th form");
214             is($t->{form}, $t->{forms}->[0], "Form is still set to 1");
215              
216             =end testing
217              
218             =cut
219              
220             sub form {
221 2     2 1 5 my ($self, $form) = @_;
222 2 50       9 if ($self->{forms}->[$form-1]) {
223 0         0 $self->{form} = $self->{forms}->[$form-1];
224 0         0 return 1;
225             } else {
226 2         480 carp "There is no form number $form";
227 2         109 return 0;
228             }
229             }
230              
231             =head2 $agent->field($name, $value, $number)
232              
233             Given the name of a field, set its value to the value specified. This
234             applies to the current form (as set by the form() method or defaulting
235             to the first form on the page).
236              
237             The optional $number parameter is used to distinguish between two fields
238             with the same name. The fields are numbered from 1.
239              
240             =cut
241              
242             sub field {
243 1     1 1 3 my ($self, $name, $value, $number) = @_;
244 1   50     10 $number ||= 1;
245 1 50       6 if ($number > 1) {
246 0         0 $form->find_input($name, $number)->value($value);
247             } else {
248 1         315 $self->{form}->value($name => $value);
249             }
250             }
251              
252             =head2 $agent->click($button, $x, $y);
253              
254 1         290 Has the effect of clicking a button on a form. This method takes an
  1         7  
255 1         6 optional method which is the name of the button to be pressed. If there
256 1         8 is only one button on the form, it simply clicks that one button.
257 0         0  
258 0         0 =begin testing
259              
260             my $t = WWW::Automate->new();
261             $t->get("http://google.com");
262             $t->field(q => "foo");
263             ok($t->click("btnG"), "Can click 'btnG' ('Google Search' button)");
264             like($t->{content}, qr/foo\s?fighters/i, "Found 'Foo Fighters'");
265              
266             =end testing
267              
268             =cut
269              
270             sub click {
271 0     0 1 0 my ($self, $button, $x, $y) = @_;
272 0 0       0 for ($x, $y) { $_ = 1 unless defined; }
  0         0  
273 0         0 $self->push_page_stack();
274 0         0 $self->{uri} = $self->{form}->uri;
275 0         0 $self->{req} = $self->{form}->click($name, $x, $y);
276 0         0 $self->do_request();
277             }
278              
279             =head2 $agent->submit()
280              
281             Shortcut for $a->click("submit")
282              
283             =cut
284              
285             sub submit {
286 0     0 1 0 my ($self) = shift;
287 0         0 $self->click("submit");
288             }
289              
290             =head2 $agent->back();
291              
292             The equivalent of hitting the "back" button in a browser. Returns to
293             the previous page. Won't go back past the first page.
294              
295             =cut
296              
297             sub back {
298 2     2 1 4 my $self = shift;
299 2         4 $self->pop_page_stack;
300             }
301              
302             =head2 $agent->add_header(name => $value)
303              
304             Sets a header for the WWW::Automate agent to use every time it gets a
305             webpage. This is *NOT* stored in the agent object (because if it were,
306             it would disappear if you went back() past where you'd set it) but in
307             the hash variable %WWW::Automate::headers, which is a hashref of all headers
308 0         0 to be set. You can manipulate this directly if you want to; the
  0         0  
309 0         0 add_header() method is just provided as a convenience function for the most
310             common case of adding a header.
311              
312             =begin testing
313              
314             $agent->add_header(foo => 'bar');
315             is($WWW::Automate::headers{'foo'}, 'bar', "set header");
316              
317             =end testing
318              
319             =cut
320              
321             sub add_header {
322 0     0 1 0 my ($self, $name, $value) = @_;
323 0         0 $WWW::Automate::headers{$name} = $value;
324             }
325              
326             =head1 INTERNAL METHODS
327              
328             These methods are only used internally. You probably don't need to
329             know about them.
330              
331             =head2 push_page_stack()
332              
333             =head2 pop_page_stack()
334              
335             The agent keeps a stack of visited pages, which it can pop when it needs
336             to go BACK and so on.
337              
338             The current page needs to be pushed onto the stack before we get a new
339             page, and the stack needs to be popped when BACK occurs.
340 0         0  
  0         0  
341 0         0 Neither of these take any arguments, they just operate on the $agent
342 0         0 object.
  0         0  
343 0         0  
344 0         0 =begin testing
  0         0  
345 0         0  
346 0         0 my $t = WWW::Automate->new();
  0         0  
347 0         0 $t->get("http://www.google.com");
348 0         0 is(scalar @{$t->{page_stack}}, 0, "Page stack starts empty");
  0         0  
349 0         0 $t->push_page_stack();
350 0         0 is(scalar @{$t->{page_stack}}, 1, "Pushed item onto page stack");
  0         0  
351 0         0 $t->push_page_stack();
352 0         0 is(scalar @{$t->{page_stack}}, 2, "Pushed item onto page stack");
  0         0  
353             $t->pop_page_stack();
354             is(scalar @{$t->{page_stack}}, 1, "Popped item from page stack");
355             $t->pop_page_stack();
356             is(scalar @{$t->{page_stack}}, 0, "Popped item from page stack");
357             $t->pop_page_stack();
358             is(scalar @{$t->{page_stack}}, 0, "Can't pop beyond end of page stack");
359              
360              
361             =end testing
362              
363             =cut
364              
365             sub push_page_stack {
366 0     0 1 0 my $self = shift;
367 0         0 $self->{page_stack} = [ @{$self->{page_stack}}, clone($self)];
  0         0  
368 0         0 return 1;
369             }
370              
371             sub pop_page_stack {
372 2     2 1 3 my $self = shift;
373 2 50       3 if (@{$self->{page_stack}}) {
  2         7  
374 0         0 $self = pop @{$self->{page_stack}};
  0         0  
375 0         0 bless $self;
376             }
377 2         6 return 1;
378             }
379              
380             =head2 extract_links()
381              
382             Extracts HREF links from the content of a webpage.
383              
384             =cut
385              
386             sub extract_links {
387 3     3 1 40 my $self = shift;
388 3         18 my $p = HTML::TokeParser->new(\$self->{content});
389 3         503 my @links;
390              
391 3         14 while (my $token = $p->get_tag("a", "frame")) {
392 3 50       937 my $url = $token->[0] eq 'a' ? $token->[1]{href} : $token->[1]{src};
393 3 50       13 next unless defined $url; # probably just a name link
394 3 50       19 my $text = $token->[0] eq 'a' ?
395             $p->get_trimmed_text("/a"):$token->[1]{name};
396 3         216 push(@links, [$url => $text]);
397             }
398 3         282 return \@links;
399             }
400              
401             =head2 do_request()
402              
403             Actually performs a request on the $self->{req} request object, and sets
404             a bunch of attributes on $self.
405              
406             =cut
407              
408             sub do_request {
409 3     3 1 6 my ($self) = @_;
410 3         14 foreach my $h (keys %WWW::Automate::headers) {
411 0         0 $self->{req}->header( $h => $WWW::Automate::headers{$h} );
412             }
413 3         22 $self->{res} = $self->request($self->{req});
414 3         176416 $self->{status} = $self->{res}->code;
415 3         46 $self->{base} = $self->{res}->base;
416 3   50     1707 $self->{ct} = $self->{res}->content_type || "";
417 3         156 $self->{content} = $self->{res}->content;
418              
419 3 50       59 if ($self->{ct} eq 'text/html') {
420 3         28 $self->{forms} = [ HTML::Form->parse($self->{content}, $self->{res}->base) ];
421 3 50       3424 $self->{form} = $self->{forms}->[0] if @{$self->{forms}};
  3         16  
422 3         16 $self->{links} = $self->extract_links();
423             }
424             }
425              
426             sub isnumber {
427 4     4 0 6 my $in = shift;
428 4 100       17 if ($in =~ /^\d+$/) {
429 2         17 return 1;
430             } else {
431 2         4 return 0;
432             }
433             }
434              
435             =head1 BUGS
436              
437             Please report any bugs via the system at http://rt.cpan.org/
438              
439             =head1 AUTHOR
440              
441             Kirrily "Skud" Robert
442              
443             =cut
444              
445             1;