| blib/lib/AnyEvent/WebArchive.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 24 | 67 | 35.8 |
| branch | 5 | 26 | 19.2 |
| condition | 0 | 11 | 0.0 |
| subroutine | 6 | 11 | 54.5 |
| pod | 1 | 1 | 100.0 |
| total | 36 | 116 | 31.0 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package AnyEvent::WebArchive; | ||||||
| 2 | |||||||
| 3 | 2 | 2 | 41961 | use strict; | |||
| 2 | 6 | ||||||
| 2 | 100 | ||||||
| 4 | 2 | 2 | 7042 | use AnyEvent::HTTP; | |||
| 2 | 142919 | ||||||
| 2 | 233 | ||||||
| 5 | 2 | 2 | 16017 | use Data::Dumper; | |||
| 2 | 21687 | ||||||
| 2 | 161 | ||||||
| 6 | 2 | 2 | 17 | use base 'Exporter'; | |||
| 2 | 6 | ||||||
| 2 | 2673 | ||||||
| 7 | our $VERSION = '0.02'; | ||||||
| 8 | |||||||
| 9 | our @EXPORT = qw(restore_url); | ||||||
| 10 | my $DEBUG = 0; | ||||||
| 11 | sub restore_url { | ||||||
| 12 | 0 | 0 | 1 | 0 | my $url = shift; | ||
| 13 | 0 | 0 | my $cb = pop; | ||||
| 14 | |||||||
| 15 | 0 | 0 | $url =~ s/^www\.//; | ||||
| 16 | 0 | 0 | 0 | my $opt = ref $_[0] ? $_[0] : {@_}; | |||
| 17 | |||||||
| 18 | 0 | 0 | 0 | $AnyEvent::HTTP::USERAGENT = $opt->{'user_agent'} || 'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.5.24 Version/10.52'; | |||
| 19 | 0 | 0 | 0 | $AnyEvent::HTTP::MAX_PER_HOST ||= $opt->{'max_per_host'}; | |||
| 20 | 0 | 0 | 0 | $AnyEvent::HTTP::ACTIVE ||= $opt->{'active' }; | |||
| 21 | |||||||
| 22 | 0 | 0 | my $count; | ||||
| 23 | 0 | 0 | my $worker = {}; | ||||
| 24 | 0 | 0 | bless $worker, __PACKAGE__; | ||||
| 25 | 0 | 0 | $worker->{'domain'} = $url; | ||||
| 26 | http_get _search($url), sub { | ||||||
| 27 | 0 | 0 | 0 | $url = $url; | |||
| 28 | 0 | 0 | 0 | $DEBUG && warn "GET $url\n"; | |||
| 29 | 0 | 0 | my ($body, $headers) = @_; | ||||
| 30 | |||||||
| 31 | 0 | 0 | for my $job (grep { $_->[0] } # XXX | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 32 | 0 | 0 | map { [ /href="([^"]+)"/sg, />([^<]+)<\/a>/sg ] } map { split /( ){2}/ } |
||||
| 33 | $body =~ m{(.*?)}si | ||||||
| 34 | ) { | ||||||
| 35 | 0 | 0 | 0 | $DEBUG && warn "GET $job->[0]\n"; | |||
| 36 | 0 | 0 | $count++; | ||||
| 37 | http_get $job->[0], sub { | ||||||
| 38 | 0 | 0 | my ($body, $headers) = @_; | ||||
| 39 | 0 | 0 | 0 | if ($headers->{'Status'} == 200) { | |||
| 40 | 0 | 0 | $worker->_save_file($job->[1], $body); | ||||
| 41 | } else { | ||||||
| 42 | 0 | 0 | warn "Bad status for url $job->[0]: $_" for Dumper($headers); | ||||
| 43 | } | ||||||
| 44 | |||||||
| 45 | 0 | 0 | 0 | $cb->() unless --$count; | |||
| 46 | } | ||||||
| 47 | 0 | 0 | } | ||||
| 48 | } | ||||||
| 49 | 0 | 0 | } | ||||
| 50 | |||||||
| 51 | sub _filename { | ||||||
| 52 | 0 | 0 | 0 | my $str = shift; | |||
| 53 | |||||||
| 54 | 0 | 0 | $str =~ s/[^a-z\.\,\s\;-]/_/sig; | ||||
| 55 | |||||||
| 56 | 0 | 0 | return $str; | ||||
| 57 | } | ||||||
| 58 | |||||||
| 59 | sub _search { | ||||||
| 60 | 0 | 0 | 0 | return "http://web.archive.org/web/*sr_1nr_10000/$_*" for shift; | |||
| 61 | } | ||||||
| 62 | |||||||
| 63 | sub _save_file { | ||||||
| 64 | 0 | 0 | 0 | my ($worker, $url,$body) = @_; | |||
| 65 | |||||||
| 66 | 0 | 0 | $url = $worker->{'domain'} . $worker->_normalize_url($url); | ||||
| 67 | |||||||
| 68 | 0 | 0 | my $path; | ||||
| 69 | 0 | 0 | for (split /\//, $url) { | ||||
| 70 | 0 | 0 | 0 | 0 | last if /^\?/ || $url =~ /$_$/; | ||
| 71 | 0 | 0 | $path .= "$_/"; | ||||
| 72 | 0 | 0 | 0 | $DEBUG && warn "mkdir $path\n"; | |||
| 73 | 0 | 0 | mkdir $path; | ||||
| 74 | } | ||||||
| 75 | |||||||
| 76 | 0 | 0 | 0 | return warn "file $url already exists, skipping\n" if -e $url; | |||
| 77 | 0 | 0 | 0 | $DEBUG && warn "writing $url\n"; | |||
| 78 | |||||||
| 79 | |||||||
| 80 | 0 | 0 | 0 | open my $fh, '>', $url or warn "$!: $url"; | |||
| 81 | 0 | 0 | print $fh $worker->_normalize($body); | ||||
| 82 | } | ||||||
| 83 | |||||||
| 84 | sub _normalize { | ||||||
| 85 | 1 | 1 | 3 | my ($worker,$body) = @_; | |||
| 86 | |||||||
| 87 | 1 | 11 | $body =~ s/(?<=href=")([^"]+)(?=")/$worker->_normalize_url($1)/sieg; | ||||
| 1 | 4 | ||||||
| 88 | 1 | 8 | $body =~ s{(?<= |
).*(?=