File Coverage

blib/lib/WWW/IndexParser.pm
Criterion Covered Total %
statement 41 244 16.8
branch 4 248 1.6
condition 1 51 1.9
subroutine 10 15 66.6
pod 1 1 100.0
total 57 559 10.2


line stmt bran cond sub pod time code
1             package WWW::IndexParser;
2 1     1   31483 use warnings;
  1         2  
  1         36  
3 1     1   5 use strict;
  1         3  
  1         43  
4 1     1   5780 use LWP::UserAgent;
  1         102043  
  1         42  
5 1     1   10937 use HTML::Parser;
  1         8691  
  1         45  
6 1     1   9 use Time::Local;
  1         2  
  1         79  
7 1     1   936 use WWW::IndexParser::Entry;
  1         3  
  1         34  
8 1     1   8 use URI;
  1         2  
  1         31  
9 1     1   6 use Carp;
  1         1  
  1         85  
10              
11             BEGIN {
12 1     1   7438 our $VERSION = "0.91";
13             }
14              
15             our $months = {
16             Jan => 0, January => 0,
17             Feb => 1, February => 1,
18             Mar => 2, March => 2,
19             Apr => 3, April => 3,
20             May => 4,
21             Jun => 5, June => 5,
22             Jul => 6, July => 6,
23             Aug => 7, August => 7,
24             Sep => 8, September => 8,
25             Oct => 9, October => 9,
26             Nov => 10, November => 10,
27             Dec => 11, December => 11 };
28              
29              
30              
31              
32             sub new {
33 1     1 1 12 my $proto = shift;
34 1   33     7 my $class = ref($proto)||$proto;
35 1         3 my $self = {};
36 1         2 bless $self, $class;
37 1         3 my %args = @_;
38 1         11 $self->{ua} = LWP::UserAgent->new;
39 1         13974 $self->{ua}->agent('PerlIndexParser/0.1');
40 1 50       75 if (defined $args{timeout}) {
41 0 0       0 if ($args{timeout} =~ /^\d+/) {
42 0         0 $self->{ua}->timeout($args{timeout});
43             } else {
44 0         0 carp "Invalid timeout: " . $args{timeout};
45 0         0 return;
46             }
47             } else {
48 1         7 $self->{ua}->timeout(10);
49             }
50 1 50       22 if (defined $args{proxy}) {
51 0         0 $self->{ua}->proxy('http', $args{proxy});
52             }
53 1 50       4 if (defined $args{debug}) {
54 0         0 $self->{debug} = $args{debug};
55             } else {
56 1         3 $self->{debug} = 0;
57             }
58 1         8 $self->{parser} = HTML::Parser->new( api_version => 3);
59 1         46 $self->{parser}->{debug} = $self->{debug};
60 1 50       6 if (defined $args{url}) {
61 0         0 $self->_url($args{url});
62 0 0       0 return @{$self->{files}} if defined $self->{files};
  0         0  
63 0         0 return;
64             }
65 1         5 return $self;
66             }
67              
68              
69              
70             sub _url {
71 0     0     my $self = shift;
72 0 0         if (@_) {
73 0           my $new_url = shift;
74 0 0         warn "The URL was $new_url" if $self->{debug};
75              
76 0           $self->{url} = $new_url;
77 0 0         if ($new_url =~ /^([^:]+):\/\/([^:\/]+)(:(\d+))?/) {
78 0           $self->{server} = $2;
79 0           $self->{protocol} = $1;
80 0 0         $self->{port} = $4 if defined $4;
81 0           $self->{req} = HTTP::Request->new(GET => $new_url);
82 0           $self->{res} = $self->{ua}->request($self->{req});
83 0 0         if (not $self->{res}->is_success) {
84 0 0         carp "Cannot fetch for $new_url: " . $self->{res}->status_line if $self->{debug};
85 0           return;
86             }
87             } else {
88 0           warn "Invalid URL " . $new_url;
89 0           return;
90             }
91              
92 0 0         if (ref($self->{res}->headers->{'content-type'}) eq "ARRAY") {
    0          
93 0           my $found_html = 0;
94 0           foreach (@{$self->{res}->headers->{'content-type'}}) {
  0            
95 0 0         $found_html = 1 if /^text\/html/;
96             }
97 0 0         if (not $found_html) {
98 0           warn "Not an HTML page " . $self->{res}->headers->{'content-type'};
99 0           return;
100             }
101             } elsif ($self->{res}->headers->{'content-type'} !~ /^text\/html/) {
102 0           warn "Not an HTML page " . $self->{res}->headers->{'content-type'};
103 0           return;
104             }
105              
106 0 0         if ($self->{res}->headers->{server} =~ /^Apache-Coyote/) {
    0          
    0          
    0          
107 0 0         warn "Server is Tomcat Coyote" if $self->{debug};
108 0           $self->{parser}->handler( start => \&_parse_html_tomcat, "self, tagname, attr, attrseq, text");
109 0           $self->{parser}->handler( text => \&_parse_html_tomcat, "self, tagname, attr, attrseq, text");
110             } elsif ($self->{res}->headers->{server} =~ /^Apache/) {
111 0 0         warn "Server is Apache" if $self->{debug};
112 0           $self->{parser}->handler( start => \&_parse_html_apache, "self, tagname, attr, attrseq, text");
113 0           $self->{parser}->handler( text => \&_parse_html_apache, "self, tagname, attr, attrseq, text");
114             } elsif ($self->{res}->headers->{server} =~ /^Microsoft-IIS/) {
115 0 0         warn "Server is IIS" if $self->{debug};
116 0           $self->{parser}->handler( start => \&_parse_html_iis, "self, tagname, attr, attrseq, text");
117 0           $self->{parser}->handler( text => \&_parse_html_iis, "self, tagname, attr, attrseq, text");
118             } elsif ($self->{res}->headers->{server} =~ m!^lighttpd/!) {
119 0 0         warn "Server is lighttpd" if $self->{debug};
120 0           $self->{parser}->handler( start => \&_parse_html_lighttpd, "self, tagname, attr, attrseq, text");
121 0           $self->{parser}->handler( text => \&_parse_html_lighttpd, "self, tagname, attr, attrseq, text");
122             } else {
123 0 0         warn "Unknown web server" if $self->{debug};
124 0           return;
125             }
126              
127 0           $self->{parser}->parse($self->{res}->content);
128 0           $self->{parser}->eof();
129              
130             # Add the URL to each HASH for ease of use
131 0           foreach my $entry (@{$self->{parser}->{files}}) {
  0            
132 0 0         if ($entry->filename =~ /^\//) {
133 0 0         $entry->url($self->{protocol} . "://" . $self->{server} .
134             (defined $self->{port}?':' . $self->{port}:'') .
135             $entry->filename);
136             } else {
137 0           $entry->url( URI->new_abs($entry->filename, $self->{url}) );
138             }
139             }
140             # Get this back from the parser object.
141 0           $self->{files} = $self->{parser}->{files};
142             }
143 0           return $self->{url};
144             }
145              
146             sub _parse_html_tomcat {
147 0     0     my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
148              
149 0 0 0       if (not defined $tagname) {
    0 0        
    0 0        
    0 0        
    0          
150 0 0         return unless $self->{parser_state};
151              
152 0 0         if ($self->{parser_state} == 2) {
153 0 0         warn "The title is: $origtext" if $self->{debug};
154 0 0         if ($origtext =~ /^Directory Listing For (.+)$/) {
155 0           $self->{directory} = $1;
156             }
157 0           $self->{parser_state} = 1;
158 0           return;
159             }
160 0 0 0       if ($self->{parser_state} == 1 && $origtext =~ /^([\d\.]+)(\s+(\w+))?/) {
161              
162 0           $self->{current_file}->{size} = $1;
163 0 0         $self->{current_file}->{size_units} = $3 if defined $3;
164             }
165 0 0 0       if ($self->{parser_state} == 1 && $origtext =~ /^\w+,\s+(\d+)\s+(\w+)\s+(\d+)\s+(\d+):(\d+):(\d+)\s+(\w+)/) {
166 0           my $time = timelocal($6, $5, $4, $1, $months->{$2}, $3-1900);
167 0           $self->{current_file}->{time} = $time;
168             }
169             } elsif ($tagname eq 'title') {
170 0           $self->{parser_state} = 2;
171             } elsif ($tagname eq "hr" && $self->{parser_state} && defined $self->{current_file}) {
172 0           my $entry = WWW::IndexParser::Entry->new;
173 0 0         $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename};
174 0 0         $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time};
175 0 0         $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size};
176 0 0         $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units};
177 0           push @{$self->{files}}, $entry;
  0            
178 0           delete $self->{current_file};
179             #$self->{parser_state} = 1;
180             } elsif ($tagname eq "tr" && defined $self->{parser_state}) {
181 0 0         if (defined $self->{current_file}) {
182 0           my $entry = WWW::IndexParser::Entry->new;
183 0 0         $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename};
184 0 0         $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time};
185 0 0         $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size};
186 0 0         $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units};
187 0           push @{$self->{files}}, $entry;
  0            
188 0           delete $self->{current_file};
189             }
190 0           $self->{parser_state} = 1;
191             } elsif ($tagname eq "a" && defined $self->{parser_state}) {
192 0 0         warn " file name = " . $attr->{href} if $self->{debug};
193 0 0         $self->{current_file}->{filename} = $attr->{href} if $attr->{href};
194 0           $self->{parser_state} = 1;
195             }
196             }
197              
198              
199             sub _parse_html_apache {
200 0     0     my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
201              
202 0 0 0       if (not defined $tagname) {
    0 0        
    0 0        
    0          
    0          
203 0 0         return unless $self->{parser_state};
204              
205 0 0         if ($self->{parser_state} == 2) {
206 0 0         warn "The title is: $origtext" if $self->{debug};
207 0 0         if ($origtext =~ /^Index of (.+)$/) {
208 0           $self->{directory} = $1;
209             }
210 0           $self->{parser_state} = 1;
211 0           return;
212             }
213 0 0         if ($origtext =~ /(\d\d)-(\w\w\w)-(\d{4}) (\d\d):(\d\d)\s+([\d\.]+)(\w)?/) {
    0          
    0          
214 0           my $time = timelocal(0, $5, $4, $1, $months->{$2}, $3-1900);
215 0           $self->{current_file}->{time} = $time;
216 0           $self->{current_file}->{size} = $6;
217 0 0         $self->{current_file}->{size_units} = $7 if defined $7;
218             } elsif ($origtext =~ /(\d\d)-(\w\w\w)-(\d{4}) (\d\d):(\d\d)/) {
219 0           my $time = timelocal(0, $5, $4, $1, $months->{$2}, $3-1900);
220 0           $self->{current_file}->{time} = $time;
221 0 0         warn " Found time (using Apache 2.2+ check)" if $self->{debug};
222             } elsif ($origtext =~ /^(\d[\d\.]+)(\w)?/) {
223 0 0         warn " Found size (using Apache 2.2+ check)" if $self->{debug};
224 0           $self->{current_file}->{size} = $1;
225 0 0         $self->{current_file}->{size_units} = $2 if defined $2;
226             }
227             } elsif ($tagname eq 'title') {
228 0           $self->{parser_state} = 2;
229             } elsif ($tagname eq "pre") {
230 0           $self->{parser_state} = 1;
231             } elsif (($tagname eq "img" || $tagname eq "hr") && defined $self->{parser_state}) {
232 0 0 0       if (defined $self->{current_file} && $self->{current_file}->{filename} !~ /^\?/ && $self->{current_file}->{type} !~ /Icon/) {
      0        
233 0           my $entry = WWW::IndexParser::Entry->new;
234 0 0         $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename};
235 0 0         $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time};
236 0 0         $entry->type($self->{current_file}->{type}) if defined $self->{current_file}->{type};
237 0 0         $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size};
238 0 0         $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units};
239 0           push @{$self->{files}}, $entry;
  0            
240 0 0         warn "Added " . $self->{current_file}->{filename} if $self->{debug};
241 0           delete $self->{current_file};
242             }
243 0 0         if (defined $attr->{alt}) {
244 0 0         warn "Possible new file:" . $attr->{alt} if $self->{debug};
245 0           $self->{current_file}->{type} = $attr->{alt};
246             }
247             } elsif ($tagname eq "a" && defined $self->{parser_state}) {
248 0 0         warn " file name = " . $attr->{href} if $self->{debug};
249 0 0         $self->{current_file}->{filename} = $attr->{href} if defined $attr->{href};
250             } else {
251 0 0         warn $tagname if $self->{debug};
252             }
253             }
254              
255              
256              
257              
258              
259              
260             sub _parse_html_iis {
261 0     0     my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
262              
263 0 0 0       if (not defined $tagname) {
    0 0        
    0          
    0          
    0          
264 0 0         return unless $self->{parser_state};
265              
266 0 0         if ($self->{parser_state} == 2) {
267 0 0         if ($origtext =~ /- (.+)$/) {
268 0           $self->{directory} = $1;
269             }
270 0           $self->{parser_state} = 1;
271 0           return;
272             }
273 0 0         if ($origtext =~ /\s*(\w+),\s+(\w+)\s+(\d+),\s+(\d{4})\s+(\d{1,2}):(\d\d) (AM|PM)\s+([\d\.]+)/) {
274 0           my $hour_of_day = $5;
275 0 0 0       $hour_of_day = 0 if ($7 eq 'AM' && $hour_of_day eq 12);
276 0 0 0       $hour_of_day += 12 if ($7 eq 'PM' && $hour_of_day ne 12);
277 0           my $time = timelocal(0, $6, $hour_of_day, $3, $months->{$2}, $4-1900);
278 0           $self->{current_file}->{time} = $time;
279 0           $self->{current_file}->{size} = $8;
280             }
281             } elsif ($tagname eq 'title') {
282 0           $self->{parser_state} = 2;
283             } elsif ($tagname eq "pre") {
284 0           $self->{parser_state} = 1;
285             } elsif ($tagname eq "br" && defined $self->{parser_state}) {
286 0 0         if (defined $self->{current_file}) {
287 0           my $entry = WWW::IndexParser::Entry->new;
288 0 0         $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename};
289 0 0         $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time};
290 0 0         $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size};
291 0 0         $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units};
292 0           push @{$self->{files}}, $entry;
  0            
293 0           delete $self->{current_file};
294             }
295             } elsif ($tagname eq "a" && defined $self->{parser_state}) {
296 0 0         warn " file name = " . $attr->{href} if $self->{debug};
297 0 0         $self->{current_file}->{filename} = $attr->{href} if defined $attr->{href};
298             }
299             }
300              
301              
302             sub _parse_html_lighttpd {
303 0     0     my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
304            
305 0 0 0       if (not defined $tagname) {
    0          
    0          
    0          
    0          
306 0 0         return unless $self->{parser_state};
307            
308 0 0         if ($self->{parser_state} eq 'title') {
309 0 0         warn "The title is: $origtext" if $self->{debug};
310 0 0         if ($origtext =~ m!^Index of (.+)/$!) {
311 0           $self->{directory} = $1;
312             }
313 0           $self->{parser_state} = 1;
314 0           return;
315             }
316              
317 0 0         if ($self->{parser_state} eq 'time') {
    0          
    0          
318 0 0         if ($origtext =~ /^(\d{4})-(\w\w\w)-(\d\d) (\d\d):(\d\d):(\d\d)$/) {
319 0           my $time = timelocal(0, $5, $4, $3, $months->{$2}, $1-1900);
320 0           $self->{current_file}->{time} = $time;
321             }
322             } elsif ($self->{parser_state} eq 'size') {
323 0 0         if ($origtext =~ /^([\d\.]+)(\w)?/) {
324 0           $self->{current_file}->{size} = $1;
325 0 0         $self->{current_file}->{size_units} = $2 if defined $2;
326             }
327             } elsif ($self->{parser_state} eq 'type') {
328 0 0         if ($origtext =~ /^[\w\-\/]+$/) {
329 0           $self->{current_file}->{type} = $origtext;
330             }
331             }
332             } elsif ($tagname eq 'title') {
333 0           $self->{parser_state} = 'title';
334             } elsif ($tagname eq "td") {
335 0           my %class2state = (m => 'time', s => 'size', t => 'type');
336 0           my $class = $attr->{class};
337 0           my $state = $class2state{$class};
338 0 0         $self->{parser_state} = $state if $state;
339             } elsif ($tagname eq 'tr') {
340 0 0         if (defined $self->{current_file}) {
341 0           my $entry = WWW::IndexParser::Entry->new;
342 0 0         $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename};
343 0 0         $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time};
344 0 0         $entry->type($self->{current_file}->{type}) if defined $self->{current_file}->{type};
345 0 0         $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size};
346 0 0         $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units};
347 0           push @{$self->{files}}, $entry;
  0            
348 0 0         warn "Added " . $self->{current_file}->{filename} if $self->{debug};
349 0           delete $self->{current_file};
350             }
351 0 0         warn "Possible new file row" if $self->{debug};
352 0           $self->{parser_state} = 1;
353             } elsif ($tagname eq "a" && defined $self->{parser_state}) {
354 0 0         warn " file name = " . $attr->{href} if $self->{debug};
355 0 0         $self->{current_file}->{filename} = $attr->{href} if defined $attr->{href};
356             } else {
357 0 0         warn $tagname if $self->{debug};
358             }
359             }
360              
361              
362              
363             =head1 NAME
364              
365             WWW::IndexParser - Fetch and parse the directory index from a web server
366              
367             =head1 SYNOPSIS
368              
369             use WWW::IndexParser;
370             my @files = WWW::IndexParser->new(url => 'http://www.example.com/dir/');
371             foreach my $entry (@files) {
372             printf "%s %s\n", $entry->filename,
373             scalar(localtime($entry->time)||'');
374             }
375              
376             =head1 DESCRIPTION
377              
378              
379             B is a module that uses LWP to fetch a URL from a web
380             server. It then atempts to parse this page as if it were an auto generated
381             index page. It returns an array of B objects, one
382             per entry in the directory index that it has found. Each Entry has a
383             set of methods: filename(), time(), size(), and others if supported
384             by the autoindex generated: type() and size_units().
385              
386             =head1 CONSTRUCTOR
387              
388             =over 4
389              
390             =item new ( url => $url, timeout => $seconds, proxy => $proxy_url, debug => 1 )
391              
392             When called with a URL to examine, this method does not return an object,
393             but an array of WWW::IndexParser::Entry obects, one per entry in the
394             directory listing that was accessed.
395              
396             The options to this are:
397              
398             =over 4
399              
400             =item url
401              
402             The complete URL of the index to fetch.
403              
404             =item timeout
405              
406             The timeout for the request to fetch data, default 10 seconds.
407              
408             =item proxy
409              
410             A proxy server URL, eg, 'http://proxy:3128/'.
411              
412             =item debug
413              
414             Decide if to print parsing debug information. Set to 0 (the default) to
415             disable, or anything non-false to print. Recommened you use a digit (ie, 1)
416             as this may become a numeric 'level' of debug in the future.
417              
418             =back
419              
420              
421             =back
422              
423              
424             =head1 METHODS
425              
426             All methods are private in this module. Pass only a URL to the constructor,
427             and it does everything for you itself.
428              
429             =head1 PREREQUISUTES
430              
431             This modile depends upon C, C, C.
432              
433              
434             =head1 OSNAMES
435              
436             any
437              
438             =head1 BUGS
439              
440             Currently only supports Apache, IIS and Tomcat style auto indexes. Send suggestions for new Auto-Indexes to support to the author (along with sample HTML)!
441              
442             =head1 AUTHOR
443              
444             James Bromberger Ejames@rcpt.toE
445              
446             =head1 COPYRIGHT
447              
448             Copyright (c) 2006 James Bromberger. All rights reserved. All rights
449             reserved. This program is free software; you can redistribute it and/or
450             modify it under the same terms as Perl itself.
451              
452             =cut
453              
454             1;