File Coverage

blib/lib/HTML/TreeBuilderX/ASP_NET.pm
Criterion Covered Total %
statement 50 55 90.9
branch 6 10 60.0
condition 3 9 33.3
subroutine 14 16 87.5
pod 3 3 100.0
total 76 93 81.7


line stmt bran cond sub pod time code
1             package HTML::TreeBuilderX::ASP_NET;
2 5     5   131547 use 5.010;
  5         17  
  5         221  
3 5     5   31 use strict;
  5         18  
  5         190  
4 5     5   23 use warnings;
  5         9  
  5         151  
5              
6 5     5   4564 use Moose;
  5         2470989  
  5         44  
7 5     5   43846 use HTML::TreeBuilderX::ASP_NET::Types qw( htmlAnchorTag htmlFormTag );
  5         19  
  5         40  
8 5     5   14258 use HTTP::Request::Form;
  5         734850  
  5         191  
9 5     5   6227 use HTML::Element;
  5         534905  
  5         56  
10 5     5   224 use Carp;
  5         12  
  5         526  
11              
12             our $VERSION = '0.09';
13              
14 5     5   31 use mro 'c3';
  5         13  
  5         53  
15             with 'MooseX::Traits';
16              
17             has '+_trait_namespace' => (
18             isa => 'Str'
19             , default => 'HTML::TreeBuilderX::ASP_NET::Roles'
20             );
21              
22             has 'hrf' => (
23             isa => 'HTTP::Request::Form'
24             , is => 'ro'
25             , handles => qr/.*/
26             , lazy_build => 1
27             );
28              
29             has 'element' => (
30             isa => htmlAnchorTag
31             , is => 'ro'
32             , predicate => 'has_element'
33             );
34              
35             has 'form' => (
36             isa => htmlFormTag
37             , is => 'ro'
38             , lazy_build => 1
39             );
40              
41             has 'eventTriggerArgument' => (
42             isa => 'HashRef'
43             , is => 'ro'
44             , lazy_build => 1
45             );
46              
47             has 'baseURL' => ( isa => 'Maybe[URI]', is => 'ro' );
48              
49             has 'debug' => ( isa => 'Bool', is => 'ro', default => 0 );
50              
51             sub httpRequest {
52 0     0 1 0 my ( $self, @args ) = @_;
53 0         0 $self->press(@args);
54             }
55              
56             sub _build_eventTriggerArgument {
57 0     0   0 my $self = shift;
58              
59 0 0       0 Carp::croak
60             'User must provide an eventTriggerArgument, '
61             . ' or an element to generate one from'
62             unless $self->has_element
63             ;
64              
65 0         0 parseDoPostBack( $self->element );
66              
67             }
68              
69             sub _build_form {
70 3     3   6 my $self = shift;
71              
72 3 100       499 Carp::croak
73             'Please construct with either an HTML::Element of tag <form>'
74             . ' or with a child HTML::Element of a <form>'
75             unless $self->has_element
76             ;
77              
78 2         110 my $form = $self->element->look_up( _tag => 'form' );
79              
80 2 100       196 Carp::croak 'Please ensure there is a parent <form>'
81             . ' of the provided HTML::Element <'.$self->element->tag.'>'
82             unless defined $form
83             ;
84              
85 1         42 $form
86              
87             }
88              
89             around 'form' => sub {
90             my ( $sub, $self, @args ) = @_;
91              
92             my $form = $self->$sub( @args );
93             $form->push_content($_) for createInputElements($self->eventTriggerArgument);
94              
95             $form;
96            
97             };
98              
99             sub _build_hrf {
100 6     6   15 my $self = shift;
101              
102 6         50 HTTP::Request::Form->new(
103             $self->form
104             , $self->baseURL
105             , $self->debug
106             );
107              
108             }
109              
110             ##
111             ## END Moose, the other two funcs are helpers
112             ##
113             sub parseDoPostBack {
114 1     1 1 1193 my ($element) = @_;
115              
116             (
117 1   33     7 $element->attr('href')
118             // $element->attr('onchange')
119             ) =~ /__doPostBack\((.*)\)/;
120              
121 1         25 $1 =~ s/\\'/'/g;
122 1         4 my $args = $1;
123 1         9 my ( $eventTarget, $eventArgument ) = split /\s*,\s*/, $args;
124              
125 1 50 33     13 Carp::croak 'Please submit a valid __doPostBack'
126             unless $eventTarget && $eventArgument
127             ;
128              
129 1   33     17 s/^'// && s/'$// for ($eventTarget, $eventArgument);
130              
131 1         6 return { $eventTarget, $eventArgument };
132              
133             }
134              
135             sub createInputElements {
136 5     5 1 22 my $hash = shift;
137              
138 5 50       20 Carp::croak 'createInputElements requires a HashRef'
139             unless ref $hash eq 'HASH'
140             ;
141              
142 5         22 my ( $eventTarget, $eventArgument ) = %$hash;
143 5         45 my @elements = (
144             HTML::Element->new(
145             'input'
146             , name => '__EVENTTARGET'
147             , value => $eventTarget
148             )
149             , HTML::Element->new(
150             'input'
151             , name => '__EVENTARGUMENT'
152             , value => $eventArgument
153             )
154             );
155              
156 5         386 \@elements;
157              
158             }
159              
160 5     5   3809 no Moose;
  5         17  
  5         50  
161             __PACKAGE__->meta->make_immutable;
162              
163             1;
164              
165             __END__
166              
167             =head1 NAME
168              
169             HTML::TreeBuilderX::ASP_NET - Scrape ASP.NET/VB.NET sites which utilize Javascript POST-backs.
170              
171             =head1 SYNOPSIS
172              
173             my $ua = LWP::UserAgent->new;
174             my $resp = $ua->get('http://uniqueUrl.com/Server.aspx');
175             my $root = HTML::TreeBuilder->new_from_content( $resp->content );
176             my $a = $root->look_down( _tag => 'a', id => 'nextPage' );
177             my $aspnet = HTML::TreeBuilderX::ASP_NET->new({
178             element => $a
179             , baseURL =>$resp->request->uri ## takes into account posting redirects
180             });
181             my $resp = $ua->request( $aspnet->httpRequest );
182              
183             ## or the easy cheating way see the SEE ALSO section for links
184             my $aspnet = HTML::TreeBuilderX::ASP_NET->new_with_traits( traits => ['htmlElement'] );
185             $form->look_down(_tag=> 'a')->httpResponse
186              
187             =head1 DESCRIPTION
188              
189             Scrape ASP.NET sites which utilize the language's __VIEWSTATE, __EVENTTARGET, __EVENTARGUMENT, __LASTFOCUS, et al. This module builds the L<HTTP::Request> for the form's post-back with the use of the method C<-E<gt>httpRequest>; hand that request to your user agent to obtain the HTTP::Response.
190              
191             In this scheme many of the links on a webpage will appear to be javascript functions. The default Javascript function is C<__doPostBack(eventTarget, eventArgument)>. ASP.NET has two hidden fields which record state: __VIEWSTATE, and __LASTFOCUS. It abstracts each link with a method that utilizes an HTTP post-back to the server. The Javascript behind C<__doPostBack> simply appends __EVENTTARGET=$eventTarget&__EVENTARGUMENT=$eventArgument onto the POST request from the parent form and submits it. When the server receives this request it decodes and decompresses the __VIEWSTATE and uses it along with the new __EVENTTARGET and __EVENTARGUMENT to perform the action, which is often no more than serializing the data back into the __VIEWSTATE.
192              
193             Sometimes developers cloak the C<__doPostBack(target,arg)> with names akin to C<changepage(arg)> which simply call C<__doPostBack("target", arg)>. This module will handle this use case as well, using an explicit eventTriggerArgument in the constructor.
194              
195             This flow is a bane on RESTLESS http and makes no sense whatsoever. Thanks Microsoft.
196              
197             .-------------------------------------------------------------------.
198             | HTML FORM 1 |
199             | <form action="Server.aspx" method="post"> |
200             | <input type="hidden" name="__VIEWSTATE" value="encryptedXML-FOO"> |
201             | <a>1</a> | |
202             | <a href="javascript:__doPostBack('gotopage','2')">2</a> |
203             | ... |
204             '-------------------------------------------------------------------'
205             |
206             v
207             _________________________________
208             \ \
209             ) User clicks the link named "2" )
210             /________________________________/
211             |
212             v
213             .------------------------------------------------------------------------.
214             | POST http://aspxnonsensery/Server.aspx |
215             | Content-Length: 2659 |
216             | Content-Type: application/x-www-form-urlencoded |
217             | |
218             | __VIEWSTATE=encryptedXML-FOO&__EVENTTARGET=gotopage&__EVENTARGUMENT=2  |
219             '------------------------------------------------------------------------'
220             |
221             v
222             .----------------------------------------------------------------------.
223             | HTML FORM 2 |
224             | (different __VIEWSTATE) |
225             | <form action="Server.aspx" method="post"> |
226             | <input type="hidden" name="__VIEWSTATE" value="encryptedXML-BAR"> |
227             | <a href="javascript:__doPostBack('gotopage','1')">1</a> | |
228             | <a>2</a> |
229             | ... |
230             '----------------------------------------------------------------------'
231              
232             =head2 METHODS
233              
234             B< IN ADDITION TO ALL OF THE METHODS FROM L<HTTP::Request::Form> >
235              
236             =over 4
237              
238             =item ->new({ hashref })
239              
240             Takes a HashRef and returns a new instance. Some of the possible key/values are:
241              
242             =over 4
243              
244             =item form => $htmlElement
245              
246             optional: You explicitly send the HTML::Element representing the form. If you do not, one will be implicitly deduced from the $self->element, making element=>$htmlElement a requirement
247              
248             =item eventTriggerArgument => $hashRef
249              
250             Not needed if you supply an element. This takes a HashRef and will create HTML::Elements that mimic hidden input fields, which are then tacked onto the $self->form.
251              
252             =item element => $htmlElement
253              
254             Not needed if you send an eventTriggerArgument. Attempts to deduce the __EVENTARGUMENT and __EVENTTARGET from the 'href' attribute of the element just as if the two were supplied explicitly. It will also be used to deduce a form by looking up in the HTML tree if one is not supplied.
255              
256             =item debug => *0|1
257              
258             optional: Sends the debug flag to H:R:F (L<HTTP::Request::Form>), default is off.
259              
260             =item baseURL => $uri
261              
262             optional: Sets the base of the URL for the post action
263              
264             =back
265              
266             =item ->httpRequest
267              
268             Returns an L<HTTP::Request> object for the HTTP POST
269              
270             =item ->hrf
271              
272             Explicitly return the underlying L<HTTP::Request::Form> object. All methods fall back here anyway, but this will return that object directly.
273              
274             =back
275              
276             =head2 FUNCTIONS
277              
278             None of these are exported...
279              
280             =over 4
281              
282             =item createInputElements( {eventTarget => eventArgument} )
283              
284             Helper function takes a single key/value pair in a HashRef. Assumes the key is the __EVENTTARGET and the value the __EVENTARGUMENT, and returns an ArrayRef of two L<HTML::Element> pseudo-input fields with the information.
285              
286             =item parseDoPostBack( $element )
287              
288             Accepts an L<HTML::Element> and reads its "href" attribute (or, if that is not defined, its "onchange" attribute). It simply parses out the call to Javascript, using regexes, and makes the two args useable to perl in the form of a HashRef.
289              
290             =back
291              
292             =head1 SEE ALSO
293              
294             =over 4
295              
296             =item L<HTML::TreeBuilderX::ASP_NET::Roles::htmlElement>
297              
298             For an easy way to glue the two together
299              
300             =item L<HTTP::Request>
301              
302             For the object the method httpRequest returns
303              
304             =item L<HTTP::Request::Form>
305              
306             For a base class, to which all methods are valid
307              
308             =item L<HTML::Element>
309              
310             For the base class of all HTML tokens
311              
312             =back
313              
314             =head1 AUTHOR
315              
316             Evan Carroll, C<< <me at evancarroll.com> >>
317              
318             =head1 BUGS
319              
320             None, though *much* more support should be added to ->element. Not everything is a simple anchor tag.
321              
322             =head1 SUPPORT
323              
324             You can find documentation for this module with the perldoc command.
325              
326             perldoc HTML::TreeBuilderX::ASP_NET
327              
328              
329             You can also look for information at:
330              
331             =over 4
332              
333             =item * RT: CPAN's request tracker
334              
335             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-TreeBuilderX-ASP_NET>
336              
337             =item * AnnoCPAN: Annotated CPAN documentation
338              
339             L<http://annocpan.org/dist/HTML-TreeBuilderX-ASP_NET>
340              
341             =item * CPAN Ratings
342              
343             L<http://cpanratings.perl.org/d/HTML-TreeBuilderX-ASP_NET>
344              
345             =item * Search CPAN
346              
347             L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>
348              
349             =back
350              
351             =head1 COPYRIGHT & LICENSE
352              
353             Copyright 2008 Evan Carroll, all rights reserved.
354              
355             This program is free software; you can redistribute it and/or modify it
356             under the same terms as Perl itself.
357              
358             =cut
359