File Coverage

lib/WWW/Flatten.pm
Criterion Covered Total %
statement 65 157 41.4
branch 10 54 18.5
condition 4 32 12.5
subroutine 17 28 60.7
pod 8 9 88.8
total 104 280 37.1


line stmt bran cond sub pod time code
1             package WWW::Flatten;
2 1     1   17658 use strict;
  1         3  
  1         36  
3 1     1   6 use warnings;
  1         2  
  1         38  
4 1     1   6 use utf8;
  1         7  
  1         7  
5 1     1   44 use 5.010;
  1         3  
6 1     1   634 use Mojo::Base 'WWW::Crawler::Mojo';
  1         10041  
  1         7  
7 1     1   200871 use Mojo::Util qw(md5_sum);
  1         2  
  1         42  
8 1     1   3 use Mojo::File;
  1         1  
  1         28  
9 1     1   381 use Mojo::Log;
  1         901  
  1         6  
10 1     1   26 use WWW::Crawler::Mojo::ScraperUtil qw{resolve_href guess_encoding};
  1         1  
  1         36  
11 1     1   4 use Encode;
  1         1  
  1         1813  
12             our $VERSION = '0.09';
13              
14             my $html_type_regex = qr{^(text|application)/(html|xml|xhtml)};
15             my $css_type_regex = qr{^text/css$};
16             my $types = {
17             'application/atom+xml' => 'atom',
18             'application/font-woff' => 'woff',
19             'application/javascript' => 'js',
20             'application/json' => 'json',
21             'application/pdf' => 'pdf',
22             'application/rss+xml' => 'rss',
23             'application/x-gzip' => 'gz',
24             'application/xml' => 'xml',
25             'application/zip' => 'zip',
26             'audio/mpeg' => 'mp3',
27             'audio/ogg' => 'ogg',
28             'image/gif' => 'gif',
29             'image/jpeg' => 'jpg',
30             'image/png' => 'png',
31             'image/svg+xml' => 'svg',
32             'image/x-icon' => 'ico',
33             'text/cache-manifest' => 'appcache',
34             'text/css' => 'css',
35             'text/html' => 'html',
36             'text/plain' => 'txt',
37             'text/xml' => 'xml',
38             'video/mp4' => 'mp4',
39             'video/ogg' => 'ogv',
40             'video/webm' => 'webm',
41             };
42              
43             has depth => 10;
44             has filenames => sub { {} };
45             has 'basedir';
46             has is_target => sub { sub { 1 } };
47             has 'normalize';
48             has asset_name => sub { asset_number_generator(6) };
49             has _retrys => sub { {} };
50             has max_retry => 3;
51             has 'log_name';
52             has 'log';
53              
54             sub asset_number_generator {
55 0   0 0 1 0 my $digit = (shift || 6);
56 0         0 my $num = 0;
57             return sub {
58 0     0   0 return sprintf("%0${digit}d", $num++);
59 0         0 };
60             }
61              
62             sub asset_hash_generator {
63 0   0 0 1 0 my $len = (shift || 6);
64 0         0 my %uniq;
65             return sub {
66 0     0   0 my $md5 = md5_sum(shift->url);
67 0         0 my $len = $len;
68 0         0 my $key;
69 0         0 do { $key = substr($md5, 0, $len++) } while (exists $uniq{$key});
  0         0  
70 0         0 $uniq{$key} = undef;
71 0         0 return $key;
72 0         0 };
73             }
74              
75             sub init {
76 0     0 1 0 my ($self) = @_;
77            
78 0 0       0 $self->log(Mojo::Log->new(
79             path => $self->basedir. $self->log_name)) if $self->log_name;
80            
81 0         0 for (keys %{$self->filenames}) {
  0         0  
82 0         0 $self->enqueue($_);
83 0 0       0 if (!ref $_) {
84 0         0 my $val = $self->filenames->{$_};
85 0         0 delete $self->filenames->{$_};
86 0         0 $self->filenames->{Mojo::URL->new($_)} = $val;
87             }
88             }
89            
90             $self->on(res => sub {
91 0     0   0 my ($self, $scrape, $job, $res) = @_;
92              
93 0         0 $self->say_progress;
94            
95 0 0       0 return unless $res->code == 200;
96            
97 0         0 for my $job2 ($scrape->()) {
98            
99 0 0       0 next unless ($self->is_target->($job2));
100 0 0       0 next unless ($job2->depth <= $self->depth);
101            
102 0         0 my $url = $job2->url;
103            
104 0 0       0 if (my $cb = $self->normalize) {
105 0         0 $job2->url($url = $cb->($url));
106             }
107            
108 0 0       0 next if ($self->filenames->{$url});
109            
110 0         0 my $new_id = $self->asset_name->($job2);
111 0   0     0 my $type = $self->ua->head($url)->res->headers->content_type || '';
112 0         0 my $ext1 = $types->{ lc(($type =~ /([^;]*)/)[0]) };
113 0         0 my $ext2 = ($url->path =~ qr{\.(\w+)$})[0];
114 0   0     0 my $ext = $ext1 || $ext2;
115 0 0       0 $new_id .= ".$ext" if $ext;
116 0         0 $self->filenames->{$url} = $new_id;
117            
118 0 0 0     0 next if $type !~ $html_type_regex && $type !~ $css_type_regex
      0        
119             && -f $self->basedir. $new_id;
120            
121 0         0 $self->enqueue($job2);
122             }
123            
124 0         0 my $url = $job->url;
125 0         0 my $type = $res->headers->content_type;
126 0         0 my $original = $job->original_url;
127 0         0 my $save_file = $self->filenames->{$original};
128            
129 0 0 0     0 if ($type && $type =~ $html_type_regex) {
    0 0        
130 0   0     0 my $encode = guess_encoding($res) || 'UTF-8';
131 0         0 my $cont = Mojo::DOM->new(Encode::decode($encode, $res->body));
132 0         0 my $base = $url;
133            
134 0 0       0 if (my $base_tag = $cont->at('base')) {
135 0         0 $base = resolve_href($base, $base_tag->attr('href'));
136             }
137            
138 0         0 $self->flatten_html($cont, $base, $save_file);
139 0         0 $cont = $cont->to_string;
140 0         0 $self->save($original, $cont, $encode);
141             } elsif ($type && $type =~ $css_type_regex) {
142 0   0     0 my $encode = guess_encoding($res) || 'UTF-8';
143 0         0 my $cont = $self->flatten_css($res->body, $url, $save_file);
144 0         0 $self->save($original, $cont, $encode);
145             } else {
146 0         0 $self->save($original, $res->body);
147             }
148            
149 0 0       0 $self->log->info(
150             sprintf('created: %s => %s ', $save_file, $original)) if $self->log;
151 0         0 });
152            
153             $self->on(error => sub {
154 0     0   0 my ($self, $msg, $job) = @_;
155            
156 0 0       0 $self->log->error("$msg: ". $job->url) if $self->log;
157            
158 0         0 my $md5 = md5_sum($job->url->to_string);
159 0 0       0 if (++$self->_retrys->{$md5} < $self->max_retry) {
160 0         0 $self->requeue($job);
161 0 0       0 $self->log->warn("Re-scheduled: ". $job->url) if $self->log;
162             } else {
163 0         0 my $times = $self->max_retry;
164 0 0       0 $self->log->error("Failed $times times: ". $job->url) if $self->log;
165             }
166 0         0 });
167            
168 0         0 $self->SUPER::init;
169             }
170              
171             sub get_href {
172 24     24 1 220 my ($self, $base, $url, $ref_path) = @_;
173 24   50     160 my $fragment = ($url =~ qr{(#.+)})[0] || '';
174 24         61 my $abs = resolve_href($base, $url);
175 24 50       6394 if (my $cb = $self->normalize) {
176 0         0 $abs = $cb->($abs);
177             }
178 24         101 my $file = $self->filenames->{$abs};
179 24 100       2213 return $abs. $fragment unless $file;
180 21   50     60 my $path = Mojo::File->new($file)->to_rel(
181             Mojo::File->new($ref_path || '')->dirname);
182 21 50       2433 $path =~ s{\\}{/}g if $^O eq 'MSWin32';
183 21 50       63 return $path. $fragment if ($file);
184             }
185              
186             sub flatten_html {
187 1     1 1 1862 my ($self, $dom, $base, $ref_path) = @_;
188            
189 1         9 state $handlers = $self->html_handlers();
190 1         10 $dom->find(join(',', keys %{$handlers}))->each(sub {
191 16     16   3119 my $dom = shift;
192 16         18 for ('href', 'ping','src','data') {
193 64 100       1070 $dom->{$_} = $self->get_href($base, $dom->{$_}, $ref_path) if ($dom->{$_});
194             }
195 1         50 });
196            
197             $dom->find('meta[content]')->each(sub {
198 3 50 66 3   615 if ($_[0] =~ qr{http\-equiv="?Refresh"?}i && $_[0]->{content}) {
199             $_[0]->{content} =~
200 2         104 s{URL=(.+)}{ 'URL='. $self->get_href($base, $1, $ref_path) }e;
  2         28  
201             }
202 1         81 });
203              
204            
205 1     0   19 $dom->find('base')->each(sub {shift->remove});
  0         0  
206            
207             $dom->find('style')->each(sub {
208 1     1   357 my $dom = shift;
209 1         5 my $cont = $dom->content;
210 1         51 $dom->content($self->flatten_css($cont, $base, $ref_path));
211 1         361 });
212            
213             $dom->find('[style]')->each(sub {
214 1     1   491 my $dom = shift;
215 1         2 my $cont = $dom->{style};
216 1         13 $dom->{style} = $self->flatten_css($dom->{style}, $base, $ref_path);
217 1         114 });
218 1         25 return $dom
219             }
220              
221             sub flatten_css {
222 3     3 1 36 my ($self, $cont, $base, $ref_path) = @_;
223 3         24 $cont =~ s{url\((.+?)\)}{
224 9         99 my $url = $1;
225 9 100       57 $url =~ s/^(['"])// && $url =~ s/$1$//;
226 9         18 'url('. $self->get_href($base, $url, $ref_path). ')';
227             }egi;
228 3         37 return $cont;
229             }
230              
231             sub save {
232 0     0 1   my ($self, $url, $content, $encode) = @_;
233 0           my $path = Mojo::File->new($self->basedir. $self->filenames->{$url});
234 0 0         $path->dirname->make_path unless -d $path->dirname;
235 0 0         $content = Encode::encode($encode, $content) if $encode;
236 0           $path->spurt($content);
237             }
238              
239             sub say_start {
240 0     0 1   my $self = shift;
241            
242 0           my $content = <<"EOF";
243             ----------------------------------------
244 0           Crawling is starting with @{[ $self->queue->next->url ]}
245 0           Max Connection : @{[ $self->max_conn ]}
246 0           User Agent : @{[ $self->ua_name ]}
247             ----------------------------------------
248             EOF
249 0           say $content;
250              
251 0 0         if ($self->log) {
252 0           $self->log->append($content);
253            
254 0           my $path = Mojo::File->new($self->log->path)->to_abs;
255            
256 0           say <
257             This could take a while. You can run the following command on another shell to track the status:
258              
259             tail -f $path
260             EOF
261             }
262             }
263              
264             sub say_progress {
265 0     0 0   local $| = 1;
266 0           my $self = shift;
267 0           my $len = $self->queue->length;
268 0           print sprintf('%s jobs left', $len), ' ' x 30, "\r";
269             }
270              
271             1;
272              
273             =head1 NAME
274              
275             WWW::Flatten - Flatten a web pages deeply and make it portable
276              
277             =head1 SYNOPSIS
278              
279             use strict;
280             use warnings;
281             use utf8;
282             use 5.010;
283             use WWW::Flatten;
284            
285             my $basedir = './github/';
286             mkdir($basedir);
287            
288             my $bot = WWW::Flatten->new(
289             basedir => $basedir,
290             max_conn => 1,
291             max_conn_per_host => 1,
292             depth => 3,
293             filenames => {
294             'https://github.com' => 'index.html',
295             },
296             is_target => sub {
297             my $uri = shift->url;
298            
299             if ($uri =~ qr{\.(css|png|gif|jpeg|jpg|pdf|js|json)$}i) {
300             return 1;
301             }
302            
303             if ($uri->host eq 'assets-cdn.github.com') {
304             return 1;
305             }
306            
307             return 0;
308             },
309             normalize => sub {
310             my $uri = shift;
311             ...
312             return $uri;
313             }
314             );
315            
316             $bot->crawl;
317              
318             =head1 DESCRIPTION
319              
320             WWW::Flatten is a web crawling tool for freezing pages into standalone.
321              
322             This software is considered to be alpha quality and isn't recommended for regular usage.
323              
324             =head1 ATTRIBUTES
325              
326             =head2 depth
327              
328             Depth limitation. Defaults to 10.
329              
330             $ua->depth(10);
331              
332             =head2 filenames
333              
334             URL-Filename mapping table. This well automatically be increased during crawling
335             but you can pre-define some beforehand.
336              
337             $bot->finenames({
338             'http://example.com/index.html' => 'index.html',
339             'http://example.com/index2.html' => 'index2.html',
340             })
341              
342             =head2 basedir
343              
344             A directory path for output files.
345              
346             $bot->basedir('./out');
347              
348             =head2 is_target
349              
350             Set the condition which indecates whether the job is flatten target or not.
351              
352             $bot->is_target(sub {
353             my $job = shift;
354             ...
355             return 1 # or 0
356             });
357              
358             =head2 'normalize'
359              
360             A code reference which perform normalization for URLs. The callback will take
361             L instance.
362              
363             $bot->normalize(sub {
364             my $url = shift;
365             my $modified = ...;
366             return $modified;
367             });
368              
369             =head2 asset_name
370              
371             A code reference that generates asset names. Defaults to a preset generator
372             asset_number_generator, which generates 6 digit number. There provides
373             another option asset_hash_generator, which generates 6 character hash.
374              
375             $bot->asset_name(WWW::Flatten::asset_hash_generator(6));
376              
377             =head2 max_retry
378              
379             Max attempt limit of retry in case the server in inresponsible. Defaults to 3.
380              
381             =head1 METHODS
382              
383             =head2 asset_number_generator
384              
385             Numeric file name generating closure with self containing storage. See also
386             L attribute.
387              
388             $bot->asset_name(WWW::Flatten::asset_number_generator(3));
389              
390             =head2 asset_hash_generator
391              
392             Hash-based file name generating closure with self containing storage. See also
393             L attribute. This function automatically avoid name collision by
394             extending the given length.
395              
396             If you want the names as short as possible, use the following setting.
397              
398             $bot->asset_name(WWW::Flatten::asset_hash_generator(1));
399              
400             =head2 init
401              
402             Initialize the crawler
403              
404             =head2 get_href
405              
406             Generate new href with old one.
407              
408             =head2 flatten_html
409              
410             Replace URLs in a Mojo::DOM instance, according to filenames attribute.
411              
412             =head2 flatten_css
413              
414             Replace URLs in a CSS string, according to filenames attribute.
415              
416             =head2 save
417              
418             Save HTTP response into a file.
419              
420             =head1 AUTHOR
421              
422             Sugama Keita, Esugama@jamadam.comE
423              
424             =head1 COPYRIGHT AND LICENSE
425              
426             Copyright (C) jamadam
427              
428             This program is free software; you can redistribute it and/or
429             modify it under the same terms as Perl itself.
430              
431             =cut