File Coverage

lib/WWW/Flatten.pm
Criterion Covered Total %
statement 64 151 42.3
branch 8 50 16.0
condition 4 32 12.5
subroutine 17 27 62.9
pod 8 8 100.0
total 101 268 37.6


line stmt bran cond sub pod time code
1             package WWW::Flatten;
2 1     1   18427 use strict;
  1         2  
  1         26  
3 1     1   3 use warnings;
  1         1  
  1         22  
4 1     1   3 use utf8;
  1         4  
  1         6  
5 1     1   26 use 5.010;
  1         3  
6 1     1   491 use Mojo::Base 'WWW::Crawler::Mojo';
  1         6933  
  1         5  
7 1     1   219519 use Mojo::Util qw(md5_sum);
  1         2  
  1         39  
8 1     1   3 use Mojo::File;
  1         2  
  1         38  
9 1     1   649 use Mojo::Log;
  1         986  
  1         6  
10 1     1   28 use WWW::Crawler::Mojo::ScraperUtil qw{resolve_href guess_encoding};
  1         2  
  1         38  
11 1     1   4 use Encode;
  1         1  
  1         1919  
12             our $VERSION = '0.08';
13              
14             my $html_type_regex = qr{^(text|application)/(html|xml|xhtml)};
15             my $css_type_regex = qr{^text/css$};
16             my $types = {
17             'application/atom+xml' => 'atom',
18             'application/font-woff' => 'woff',
19             'application/javascript' => 'js',
20             'application/json' => 'json',
21             'application/pdf' => 'pdf',
22             'application/rss+xml' => 'rss',
23             'application/x-gzip' => 'gz',
24             'application/xml' => 'xml',
25             'application/zip' => 'zip',
26             'audio/mpeg' => 'mp3',
27             'audio/ogg' => 'ogg',
28             'image/gif' => 'gif',
29             'image/jpeg' => 'jpg',
30             'image/png' => 'png',
31             'image/svg+xml' => 'svg',
32             'image/x-icon' => 'ico',
33             'text/cache-manifest' => 'appcache',
34             'text/css' => 'css',
35             'text/html' => 'html',
36             'text/plain' => 'txt',
37             'text/xml' => 'xml',
38             'video/mp4' => 'mp4',
39             'video/ogg' => 'ogv',
40             'video/webm' => 'webm',
41             };
42              
43             has depth => 10;
44             has filenames => sub { {} };
45             has 'basedir';
46             has is_target => sub { sub { 1 } };
47             has 'normalize';
48             has asset_name => sub { asset_number_generator(6) };
49             has _retrys => sub { {} };
50             has max_retry => 3;
51             has 'log_name';
52             has 'log';
53              
54             sub asset_number_generator {
55 0   0 0 1 0 my $digit = (shift || 6);
56 0         0 my $num = 0;
57             return sub {
58 0     0   0 return sprintf("%0${digit}d", $num++);
59 0         0 };
60             }
61              
62             sub asset_hash_generator {
63 0   0 0 1 0 my $len = (shift || 6);
64 0         0 my %uniq;
65             return sub {
66 0     0   0 my $md5 = md5_sum(shift->url);
67 0         0 my $len = $len;
68 0         0 my $key;
69 0         0 do { $key = substr($md5, 0, $len++) } while (exists $uniq{$key});
  0         0  
70 0         0 $uniq{$key} = undef;
71 0         0 return $key;
72 0         0 };
73             }
74              
75             sub init {
76 0     0 1 0 my ($self) = @_;
77            
78 0 0       0 $self->log(Mojo::Log->new(
79             path => $self->basedir. $self->log_name)) if $self->log_name;
80            
81 0         0 for (keys %{$self->filenames}) {
  0         0  
82 0         0 $self->enqueue($_);
83 0 0       0 if (!ref $_) {
84 0         0 my $val = $self->filenames->{$_};
85 0         0 delete $self->filenames->{$_};
86 0         0 $self->filenames->{Mojo::URL->new($_)} = $val;
87             }
88             }
89            
90             $self->on(res => sub {
91 0     0   0 my ($self, $scrape, $job, $res) = @_;
92            
93 0 0       0 return unless $res->code == 200;
94            
95 0         0 for my $job2 ($scrape->()) {
96            
97 0 0       0 next unless ($self->is_target->($job2));
98 0 0       0 next unless ($job2->depth <= $self->depth);
99            
100 0         0 my $url = $job2->url;
101            
102 0 0       0 if (my $cb = $self->normalize) {
103 0         0 $job2->url($url = $cb->($url));
104             }
105            
106 0 0       0 next if ($self->filenames->{$url});
107            
108 0         0 my $new_id = $self->asset_name->($job2);
109 0   0     0 my $type = $self->ua->head($url)->res->headers->content_type || '';
110 0         0 my $ext1 = $types->{ lc(($type =~ /([^;]*)/)[0]) };
111 0         0 my $ext2 = ($url->path =~ qr{\.(\w+)$})[0];
112 0   0     0 my $ext = $ext1 || $ext2;
113 0 0       0 $new_id .= ".$ext" if $ext;
114 0         0 $self->filenames->{$url} = $new_id;
115            
116 0 0 0     0 next if $type !~ $html_type_regex && $type !~ $css_type_regex
      0        
117             && -f $self->basedir. $new_id;
118            
119 0         0 $self->enqueue($job2);
120             }
121            
122 0         0 my $url = $job->url;
123 0         0 my $type = $res->headers->content_type;
124 0         0 my $original = $job->original_url;
125 0         0 my $save_file = $self->filenames->{$original};
126            
127 0 0 0     0 if ($type && $type =~ $html_type_regex) {
    0 0        
128 0   0     0 my $encode = guess_encoding($res) || 'UTF-8';
129 0         0 my $cont = Mojo::DOM->new(Encode::decode($encode, $res->body));
130 0         0 my $base = $url;
131            
132 0 0       0 if (my $base_tag = $cont->at('base')) {
133 0         0 $base = resolve_href($base, $base_tag->attr('href'));
134             }
135            
136 0         0 $self->flatten_html($cont, $base, $save_file);
137 0         0 $cont = $cont->to_string;
138 0         0 $self->save($original, $cont, $encode);
139             } elsif ($type && $type =~ $css_type_regex) {
140 0   0     0 my $encode = guess_encoding($res) || 'UTF-8';
141 0         0 my $cont = $self->flatten_css($res->body, $url, $save_file);
142 0         0 $self->save($original, $cont, $encode);
143             } else {
144 0         0 $self->save($original, $res->body);
145             }
146            
147 0 0       0 $self->log->info(
148             sprintf('created: %s => %s ', $save_file, $original)) if $self->log;
149 0         0 });
150            
151             $self->on(error => sub {
152 0     0   0 my ($self, $msg, $job) = @_;
153            
154 0 0       0 $self->log->error("$msg: ". $job->url) if $self->log;
155            
156 0         0 my $md5 = md5_sum($job->url->to_string);
157 0 0       0 if (++$self->_retrys->{$md5} < $self->max_retry) {
158 0         0 $self->requeue($job);
159 0 0       0 $self->log->warn("Re-scheduled: ". $job->url) if $self->log;
160             } else {
161 0         0 my $times = $self->max_retry;
162 0 0       0 $self->log->error("Failed $times times: ". $job->url) if $self->log;
163             }
164 0         0 });
165            
166 0         0 $self->SUPER::init;
167             }
168              
169             sub get_href {
170 24     24 1 247 my ($self, $base, $url, $ref_path) = @_;
171 24   50     191 my $fragment = ($url =~ qr{(#.+)})[0] || '';
172 24         72 my $abs = resolve_href($base, $url);
173 24 50       7096 if (my $cb = $self->normalize) {
174 0         0 $abs = $cb->($abs);
175             }
176 24         113 my $file = $self->filenames->{$abs};
177 24   50     2704 my $refdir = Mojo::File->new($ref_path || '')->dirname;
178 24 100       843 return (Mojo::File->new($file)->to_rel($refdir)). $fragment if ($file);
179 3         8 return $abs. $fragment;
180             }
181              
182             sub flatten_html {
183 1     1 1 2323 my ($self, $dom, $base, $ref_path) = @_;
184            
185 1         11 state $handlers = $self->html_handlers();
186 1         18 $dom->find(join(',', keys %{$handlers}))->each(sub {
187 16     16   4562 my $dom = shift;
188 16         20 for ('href', 'ping','src','data') {
189 64 100       1660 $dom->{$_} = $self->get_href($base, $dom->{$_}, $ref_path) if ($dom->{$_});
190             }
191 1         76 });
192            
193             $dom->find('meta[content]')->each(sub {
194 3 50 66 3   835 if ($_[0] =~ qr{http\-equiv="?Refresh"?}i && $_[0]->{content}) {
195             $_[0]->{content} =~
196 2         125 s{URL=(.+)}{ 'URL='. $self->get_href($base, $1, $ref_path) }e;
  2         30  
197             }
198 1         96 });
199              
200            
201 1     0   94 $dom->find('base')->each(sub {shift->remove});
  0         0  
202            
203             $dom->find('style')->each(sub {
204 1     1   428 my $dom = shift;
205 1         5 my $cont = $dom->content;
206 1         66 $dom->content($self->flatten_css($cont, $base, $ref_path));
207 1         484 });
208            
209             $dom->find('[style]')->each(sub {
210 1     1   559 my $dom = shift;
211 1         3 my $cont = $dom->{style};
212 1         12 $dom->{style} = $self->flatten_css($dom->{style}, $base, $ref_path);
213 1         130 });
214 1         51 return $dom
215             }
216              
217             sub flatten_css {
218 3     3 1 13 my ($self, $cont, $base, $ref_path) = @_;
219 3         91 $cont =~ s{url\((.+?)\)}{
220 9         445 my $url = $1;
221 9 100       63 $url =~ s/^(['"])// && $url =~ s/$1$//;
222 9         19 'url('. $self->get_href($base, $url, $ref_path). ')';
223             }egi;
224 3         248 return $cont;
225             }
226              
227             sub save {
228 0     0 1   my ($self, $url, $content, $encode) = @_;
229 0           my $path = Mojo::File->new($self->basedir. $self->filenames->{$url});
230 0 0         $path->dirname->make_path unless -d $path->dirname;
231 0 0         $content = Encode::encode($encode, $content) if $encode;
232 0           $path->spurt($content);
233             }
234              
235             sub say_start {
236 0     0 1   my $self = shift;
237            
238 0           my $content = <<"EOF";
239             ----------------------------------------
240 0           Crawling is starting with @{[ $self->queue->next->url ]}
241 0           Max Connection : @{[ $self->max_conn ]}
242 0           User Agent : @{[ $self->ua_name ]}
243             ----------------------------------------
244             EOF
245 0           say $content;
246              
247 0 0         if ($self->log) {
248 0           $self->log->append($content);
249 0           say <
250             This could take a while. You can run the following command on another shell to track the status:
251              
252 0           tail -f @{[ $self->log ]}
253             EOF
254             }
255             }
256              
257             1;
258              
259             =head1 NAME
260              
261             WWW::Flatten - Flatten a web pages deeply and make it portable
262              
263             =head1 SYNOPSIS
264              
265             use strict;
266             use warnings;
267             use utf8;
268             use 5.010;
269             use WWW::Flatten;
270            
271             my $basedir = './github/';
272             mkdir($basedir);
273            
274             my $bot = WWW::Flatten->new(
275             basedir => $basedir,
276             max_conn => 1,
277             max_conn_per_host => 1,
278             depth => 3,
279             filenames => {
280             'https://github.com' => 'index.html',
281             },
282             is_target => sub {
283             my $uri = shift->url;
284            
285             if ($uri =~ qr{\.(css|png|gif|jpeg|jpg|pdf|js|json)$}i) {
286             return 1;
287             }
288            
289             if ($uri->host eq 'assets-cdn.github.com') {
290             return 1;
291             }
292            
293             return 0;
294             },
295             normalize => sub {
296             my $uri = shift;
297             ...
298             return $uri;
299             }
300             );
301            
302             $bot->crawl;
303              
304             =head1 DESCRIPTION
305              
306             WWW::Flatten is a web crawling tool for freezing pages into standalone.
307              
308             This software is considered to be alpha quality and isn't recommended for regular usage.
309              
310             =head1 ATTRIBUTES
311              
312             =head2 depth
313              
314             Depth limitation. Defaults to 10.
315              
316             $ua->depth(10);
317              
318             =head2 filenames
319              
320             URL-Filename mapping table. This well automatically be increased during crawling
321             but you can pre-define some beforehand.
322              
323             $bot->finenames({
324             'http://example.com/index.html' => 'index.html',
325             'http://example.com/index2.html' => 'index2.html',
326             })
327              
328             =head2 basedir
329              
330             A directory path for output files.
331              
332             $bot->basedir('./out');
333              
334             =head2 is_target
335              
336             Set the condition which indecates whether the job is flatten target or not.
337              
338             $bot->is_target(sub {
339             my $job = shift;
340             ...
341             return 1 # or 0
342             });
343              
344             =head2 'normalize'
345              
346             A code reference which perform normalization for URLs. The callback will take
347             L instance.
348              
349             $bot->normalize(sub {
350             my $url = shift;
351             my $modified = ...;
352             return $modified;
353             });
354              
355             =head2 asset_name
356              
357             A code reference that generates asset names. Defaults to a preset generator
358             asset_number_generator, which generates 6 digit number. There provides
359             another option asset_hash_generator, which generates 6 character hash.
360              
361             $bot->asset_name(WWW::Flatten::asset_hash_generator(6));
362              
363             =head2 max_retry
364              
365             Max attempt limit of retry in case the server in inresponsible. Defaults to 3.
366              
367             =head1 METHODS
368              
369             =head2 asset_number_generator
370              
371             Numeric file name generating closure with self containing storage. See also
372             L attribute.
373              
374             $bot->asset_name(WWW::Flatten::asset_number_generator(3));
375              
376             =head2 asset_hash_generator
377              
378             Hash-based file name generating closure with self containing storage. See also
379             L attribute. This function automatically avoid name collision by
380             extending the given length.
381              
382             If you want the names as short as possible, use the following setting.
383              
384             $bot->asset_name(WWW::Flatten::asset_hash_generator(1));
385              
386             =head2 init
387              
388             Initialize the crawler
389              
390             =head2 get_href
391              
392             Generate new href with old one.
393              
394             =head2 flatten_html
395              
396             Replace URLs in a Mojo::DOM instance, according to filenames attribute.
397              
398             =head2 flatten_css
399              
400             Replace URLs in a CSS string, according to filenames attribute.
401              
402             =head2 save
403              
404             Save HTTP response into a file.
405              
406             =head1 AUTHOR
407              
408             Sugama Keita, Esugama@jamadam.comE
409              
410             =head1 COPYRIGHT AND LICENSE
411              
412             Copyright (C) jamadam
413              
414             This program is free software; you can redistribute it and/or
415             modify it under the same terms as Perl itself.
416              
417             =cut