File Coverage

blib/lib/HTML/Copy.pm
Criterion Covered Total %
statement 214 243 88.0
branch 67 94 71.2
condition 4 6 66.6
subroutine 39 42 92.8
pod 16 27 59.2
total 340 412 82.5


line stmt bran cond sub pod time code
1             package HTML::Copy;
2              
3 2     2   21410 use 5.008;
  2         8  
  2         68  
4 2     2   8 use strict;
  2         3  
  2         61  
5 2     2   13 use warnings;
  2         6  
  2         46  
6 2     2   9 use File::Spec;
  2         2  
  2         42  
7 2     2   7 use File::Basename;
  2         3  
  2         178  
8 2     2   9 use File::Path;
  2         3  
  2         108  
9 2     2   1687 use utf8;
  2         16  
  2         8  
10 2     2   1780 use Encode;
  2         21355  
  2         177  
11 2     2   1418 use Encode::Guess;
  2         228130  
  2         16  
12 2     2   146 use Carp;
  2         5  
  2         182  
13              
14 2     2   2044 use HTML::Parser 3.40;
  2         19002  
  2         66  
15 2     2   1994 use HTML::HeadParser;
  2         2928  
  2         56  
16 2     2   104918 use URI::file;
  2         642416  
  2         79  
17              
18 2     2   41 use base qw(HTML::Parser Class::Accessor::Fast);
  2         5  
  2         2141  
19              
20             __PACKAGE__->mk_accessors(qw(link_attributes
21             has_base));
22              
23             #use Data::Dumper;
24              
25             our @default_link_attributes = ('src', 'href', 'background', 'csref', 'livesrc', 'user');
26             # 'livesrc', 'user' and 'csref' are uesed in Adobe GoLive
27              
28             =head1 NAME
29              
30             HTML::Copy - copy a HTML file without breaking links.
31              
32             =head1 VERSION
33              
34             Version 1.31
35              
36             =cut
37              
38             our $VERSION = '1.31';
39              
40             =head1 SYMPOSIS
41              
42             use HTML::Copy;
43              
44             HTML::Copy->htmlcopy($source_path, $destination_path);
45              
46             # or
47              
48             $p = HTML::Copy->new($source_path);
49             $p->copy_to($destination_path);
50              
51             # or
52              
53             open my $in, "<", $source_path;
54             $p = HTML::Copy->new($in)
55             $p->source_path($source_path); # can be omitted,
56             # when $source_path is in cwd.
57              
58             $p->destination_path($destination_path) # can be omitted,
59             # when $source_path is in cwd.
60             open my $out, ">", $source_path;
61             $p->copy_to($out);
62              
63             =head1 DESCRIPTION
64              
65             This module is to copy a HTML file without beaking links in the file. This module is a sub class of HTML::Parser.
66              
67             =head1 REQUIRED MODULES
68              
69             =over 2
70              
71             =item L
72              
73             =back
74              
75             =head1 CLASS METHODS
76              
77             =head2 htmlcopy
78              
79             HTML::Copy->htmlcopy($source_path, $destination_path);
80              
81             Parse contents of $source_path, change links and write into $destination_path.
82              
83             =cut
84              
85             sub htmlcopy($$$) {
86 1     1 1 1179 my ($class, $source_path, $destination_path) = @_;
87 1         12 my $p = $class->new($source_path);
88 1         14 return $p->copy_to($destination_path);
89             }
90              
91             =head2 parse_file
92              
93             $html_text = HTML::Copy->parse_file($source_path,
94             $destination_path);
95              
96             Parse contents of $source_path and change links to copy into $destination_path. But don't make $destination_path. Just return modified HTML. The encoding of strings is converted into utf8.
97              
98             =cut
99              
100             sub parse_file($$$) {
101 1     1 1 874 my ($class, $source, $destination) = @_;
102 1         7 my $p = $class->new($source);
103 1         23 return $p->parse_to($destination);
104             }
105              
106              
107             =head1 CONSTRUCTOR METHODS
108              
109             =head2 new
110              
111             $p = HTML::Copy->new($source);
112              
113             Make an instance of this module with specifying a source of HTML.
114              
115             The argument $source can be a file path or a file handle. When a file handle is passed, you may need to indicate a file path of the passed file handle by the method L<"source_path">. If calling L<"source_path"> is omitted, it is assumed that the location of the file handle is the current working directory.
116              
117             =cut
118              
119             sub new {
120 11     11 1 23324 my $class = shift @_;
121 11         139 my $self = $class->SUPER::new();
122 11 50       862 if (@_ > 1) {
123 0         0 my %args = @_;
124 0         0 my @keys = keys %args;
125 0         0 @$self{@keys} = @args{@keys};
126             } else {
127 11         34 my $file = shift @_;
128 11         31 my $ref = ref($file);
129 11 50 66     156 if ($ref =~ /^Path::Class::File/) {
    100          
130 0         0 $self->source_path($file);
131             } elsif (! $ref && (ref(\$file) ne 'GLOB')) {
132 8         28 $self->source_path($file);
133             } else {
134 3         18 $self->source_handle($file);
135             }
136             }
137            
138 11         150 $self->link_attributes(\@default_link_attributes);
139 11         158 $self->has_base(0);
140 11         166 $self->attr_encoded(1);
141 11         117 return $self;
142             }
143              
144              
145             =head1 INSTANCE METHODS
146              
147             =head2 copy_to
148              
149             $p->copy_to($destination)
150              
151             Parse contents of $source given in new method, change links and write into $destination.
152              
153             The argument $destination can be a file path or a file handle. When $destination is a file handle, you may need to indicate the location of the file handle by a method L<"destination_path">. L<"destination_path"> must be called before calling L<"copy_to">. When calling L<"destination_path"> is omitted, it is assumed that the locaiton of the file handle is the current working directory.
154              
155             =cut
156              
157             sub copy_to {
158 16     16 1 6034 my ($self, $destination) = @_;
159 16         95 my $io_layer = $self->io_layer();
160 16         24 my $fh;
161 16 100 66     149 if (!ref($destination) && (ref(\$destination) ne "GLOB")) {
162 7         37 $destination = $self->set_destination($destination);
163 7 50       1621 open $fh, ">$io_layer", $destination
164             or croak "can't open $destination.";
165             } else {
166 9         16 $fh = $destination;
167 9         84 binmode($fh, $io_layer);
168             }
169            
170 16         540 $self->{'output_handle'} = $fh;
171 16         371 $self->SUPER::parse($self->{'source_html'});
172 16         121 $self->eof;
173 16         687 close $fh;
174 16         84 $self->source_handle(undef);
175 16         82 return $self->destination_path;
176             }
177              
178             =head2 parse_to
179              
180             $p->parse_to($destination_path)
181              
182             Parse contents of $source_path given in new method, change links and return HTML contents to wirte $destination_path. Unlike copy_to, $destination_path will not created and just return modified HTML. The encoding of strings is converted into utf8.
183              
184             =cut
185              
186             sub parse_to {
187 7     7 1 747 my ($self, $destination_path) = @_;
188 7         45 $destination_path = $self->destination_path($destination_path);
189            
190 7         21 my $output = '';
191 7     1   620 open my $fh, ">", \$output;
  1         18  
  1         5  
  1         26  
192 7         2254 $self->copy_to($fh);
193 7         27 return Encode::decode($self->encoding, $output);
194             }
195              
196             =head1 ACCESSOR METHODS
197              
198             =head2 source_path
199              
200             $p->source_path
201             $p->source_path($path)
202              
203             Get and set a source location. Usually source location is specified with the L<"new"> method. When a file handle is passed to L<"new"> and the location of the file handle is not the current working directory, you need to use this method.
204              
205             =cut
206              
207             sub source_path {
208 20     20 1 68 my $self = shift @_;
209            
210 20 100       78 if (@_) {
211 8         13 my $path = shift @_;
212 8         40 $self->{'source_path'} = $path;
213 8         52 $self->source_uri(URI::file->new_abs($path));
214             }
215            
216 20         247 return $self->{'source_path'};
217             }
218              
219              
220             =head2 destination_path
221              
222             $p->destination_path
223             $p->destination_path($path)
224              
225             Get and set a destination location. Usually destination location is specified with the L<"copy_to">. When a file handle is passed to L<"copy_to"> and the location of the file handle is not the current working directory, you need to use this method before L<"copy_to">.
226              
227             =cut
228              
229             sub destination_path {
230 32     32 1 170 my $self = shift @_;
231            
232 32 100       119 if (@_) {
233 16         31 my $path = shift @_;
234 16         66 $self->{'destination_path'} = $path;
235 16         130 $self->destination_uri(URI::file->new_abs($path));
236             }
237            
238 32         356 return $self->{'destination_path'};
239             }
240              
241             =head2 enchoding
242              
243             $p->encoding;
244              
245             Get an encoding of a source HTML.
246              
247             =cut
248              
249             sub encoding {
250 19     19 0 49 my ($self) = @_;
251 19 100       124 if ($self->{'encoding'}) {
252 8         131 return $self->{'encoding'};
253             }
254 11         48 my $in = $self->source_handle;
255 11         21 my $data = do {local $/; <$in>;};
  11         43  
  11         295  
256 11         362 my $p = HTML::HeadParser->new;
257 11         216787 $p->utf8_mode(1);
258 11         214 $p->parse($data);
259 11         2966 my $content_type = $p->header('content-type');
260 11         708 my $encoding = '';
261 11 100       58 if ($content_type) {
262 9 50       216 if ($content_type =~ /charset\s*=(.+)/) {
263 9         59 $encoding = $1;
264             }
265             }
266            
267 11 100       44 unless ($encoding) {
268 2         3 my $decoder;
269 2 100       13 if (my @suspects = $self->encode_suspects) {
270 1         44 $decoder = Encode::Guess->guess($data, @suspects);
271             }
272             else {
273 1         28 $decoder = Encode::Guess->guess($data);
274             }
275            
276 2 50       558 ref($decoder) or
277             die("Can't guess encoding of ".$self->source_path);
278            
279 2         26 $encoding = $decoder->name;
280             }
281            
282 11         253 $self->{'source_html'} = Encode::decode($encoding, $data);
283 11         2751 $self->{'encoding'} = $encoding;
284 11         156 return $encoding;
285             }
286              
287             =head2 io_layer
288              
289             $p->io_layer;
290             $p->io_layer(':utf8');
291              
292             Get and set PerlIO layer to read the source path and to write the destination path. Usually it was automatically determined by $source_path's charset tag. If charset is not specified, Encode::Guess module will be used.
293              
294             =cut
295              
296             sub io_layer {
297 27     27 1 383 my $self = shift @_;
298 27 50       89 if (@_) {
299 0         0 $self->{'io_layer'} = shift @_;
300             }
301             else {
302 27 100       146 unless ($self->{'io_layer'}) {
303 11         95 $self->{'io_layer'} = $self->check_io_layer();
304             }
305             }
306            
307 27         614 return $self->{'io_layer'};
308             }
309              
310             =head2 encode_suspects
311              
312             @suspects = $p->encode_sustects;
313             $p->encode_suspects(qw/shiftjis euc-jp/);
314              
315             Add suspects of text encoding to guess the text encoding of the source HTML. If the source HTML have charset tag, it is not required to add suspects.
316              
317             =cut
318              
319             sub encode_suspects {
320 4     4 1 352 my $self = shift @_;
321            
322 4 100       103 if (@_) {
323 2         16 my @suspects = @_;
324 2         25 $self->{'EncodeSuspects'} = \@suspects;
325             }
326            
327 4 100       24 if (my $suspects_ref = $self->{'EncodeSuspects'}) {
328 3         27 return @$suspects_ref;
329             }
330             else {
331 1         6 return ();
332             }
333             }
334              
335             =head2 source_html
336              
337             $p->source_html;
338              
339             Obtain source HTML's contents
340              
341             =cut
342              
343             sub source_html {
344 0     0 1 0 my ($self) = @_;
345 0         0 $self->io_layer;
346 0         0 return $self->{'source_html'};
347             }
348              
349             =head1 NOTE
350              
351             Cleanuped pathes should be given to HTML::Copy and it's instances. For example, a verbose path like '/aa/bb/../cc' may cause converting links wrongly. This is a limitaion of the URI module's rel method. To cleanup pathes, Cwd::realpath is useful.
352              
353              
354             =head1 AUTHOR
355              
356             Tetsuro KURITA
357              
358             =cut
359              
360             ##== overriding methods of HTML::Parser
361              
362 16     16 1 390 sub declaration { $_[0]->output("") }
363 0     0 1 0 sub process { $_[0]->output($_[2]) }
364 66     66 1 184 sub end { $_[0]->output($_[2]) }
365 178     178 1 519 sub text { $_[0]->output($_[1]) }
366              
367             sub comment {
368 0     0 1 0 my ($self, $comment) = @_;
369 0 0       0 if ($comment =~ /InstanceBegin template="([^"]+)"/) {
370 0         0 my $uri = URI->new($1);
371 0         0 my $newlink = $self->change_link($uri);
372 0         0 $comment = " InstanceBegin template=\"$newlink\" ";
373             }
374            
375 0         0 $self->output("");
376             }
377              
378             sub process_link {
379 71     71 0 109 my ($self, $link_path)= @_;
380 71 50       225 return undef if ($link_path =~ /^\$/);
381 71 50       187 return undef if ($link_path =~ /^\[%.*%\]$/);
382 71         248 my $uri = URI->new($link_path);
383 71 100       9034 return undef if ($uri->scheme);
384 70         929 return $self->change_link($uri);
385             }
386              
387             sub start {
388 128     128 1 297 my ($self, $tag, $attr_dict, $attr_names, $tag_text) = @_;
389            
390 128 100       517 unless ($self->has_base) {
391 114 100       1337 if ($tag eq 'base') {
392 1         10 $self->has_base(1);
393             }
394            
395 114         148 my $is_changed = 0;
396 114         121 foreach my $an_attr (@{$self->link_attributes}) {
  114         403  
397 684 100       2526 if (exists($attr_dict->{$an_attr})){
398 71         260 my $newlink = $self->process_link($attr_dict->{$an_attr});
399 71 100       898 next unless ($newlink);
400 70         253 $attr_dict->{$an_attr} = $newlink;
401 70         163 $is_changed = 1;
402             }
403             }
404            
405 114 50       398 if ($tag eq 'param') {
406 0 0       0 if ($attr_dict->{'name'} eq 'src') {
407 0         0 my $newlink = $self->process_link($attr_dict->{'value'});
408 0 0       0 if ($newlink) {
409 0         0 $attr_dict->{'value'} = $newlink;
410 0         0 $is_changed = 1;
411             }
412             }
413             }
414            
415 114 100       274 if ($is_changed) {
416 70         175 my $attrs_text = $self->build_attributes($attr_dict, $attr_names);
417 70         449 $tag_text = "<$tag $attrs_text>";
418             }
419             }
420            
421 128         438 $self->output($tag_text);
422             }
423              
424             ##== private functions
425              
426             sub complete_destination_path {
427 1     1 0 7 my ($self, $dir) = @_;
428 1 50       15 my $source_path = $self->source_path
429             or croak "Can't resolve a file name of the destination, because a source path is not given.";
430 1 50       241 my $filename = basename($source_path)
431             or croak "Can't resolve a file name of the destination, because given source path is a directory.";
432 1         58 return File::Spec->catfile($dir, $filename);
433            
434             }
435            
436             sub set_destination {
437 7     7 0 13 my ($self, $destination_path) = @_;
438            
439 7 100       247 if (-d $destination_path) {
440 1         14 $destination_path = $self->complete_destination_path($destination_path);
441             } else {
442 6         380 my ($name, $dir) = fileparse($destination_path);
443 6 50       22 unless ($name) {
444 0         0 $destination_path = $self->complete_destination_path($destination_path);
445             }
446            
447 6         410 mkpath($dir);
448             }
449              
450 7         31 return $self->destination_path($destination_path);
451             }
452              
453             sub check_io_layer {
454 11     11 0 29 my ($self) = @_;
455 11         292 my $encoding = $self->encoding;
456 11 50       44 return '' unless ($encoding);
457            
458 11         60 my $io_layer = '';
459 11 100       34 if (grep {/$encoding/} ('utf8', 'utf-8', 'UTF-8') ) {
  33         353  
460 7         27 $io_layer = ":utf8";
461             }
462             else {
463 4         25 $io_layer = ":encoding($encoding)";
464             }
465 11         55 return $io_layer;
466             }
467              
468             sub build_attributes {
469 70     70 0 128 my ($self, $attr_dict, $attr_names) = @_;
470 70         110 my @attrs = ();
471 70         78 foreach my $attr_name (@{$attr_names}) {
  70         189  
472 70 50       141 if ($attr_name eq '/') {
473 0         0 push @attrs, '/';
474             } else {
475 70         160 my $attr_value = $attr_dict->{$attr_name};
476 70         380 push @attrs, "$attr_name=\"$attr_value\"";
477             }
478             }
479 70         228 return join(' ', @attrs);
480             }
481              
482             sub change_link {
483 70     70 0 105 my ($self, $uri) = @_;
484 70         81 my $result_uri;
485 70         164 my $abs_uri = $uri->abs( $self->source_uri );
486 70         16026 my $abs_path = $abs_uri->file;
487              
488 70 50       10290 if (-e $abs_path) {
489 70         203 $result_uri = $abs_uri->rel($self->destination_uri);
490             } else {
491 0         0 warn("$abs_path is not found.\nThe link to this path is not changed.\n");
492 0         0 return "";
493             }
494            
495 70         19310 return $result_uri->as_string;
496             }
497              
498             sub output {
499 388     388 0 915 my ($self, $out_text) = @_;
500 388         472 print {$self->{'output_handle'}} $out_text;
  388         3956  
501             }
502              
503             sub source_handle {
504 30     30 0 64 my $self = shift @_;
505            
506 30 100       114 if (@_) {
    100          
507 19         51 $self->{'source_handle'} = shift @_;
508             } elsif (!$self->{'source_handle'}) {
509 8 50       103 my $path = $self->source_path or croak "source_path is undefined.";
510 8 50       611 open my $in, "<", $path or croak "Can't open $path.";
511 8         64 $self->{'source_handle'} = $in;
512             }
513            
514 30         299 return $self->{'source_handle'}
515             }
516              
517             sub source_uri {
518 78     78 0 59723 my $self = shift @_;
519 78 100       477 if (@_) {
    100          
520 8         84 $self->{'source_uri'} = shift @_;
521             } elsif (!$self->{'source_uri'}) {
522 3         9 $self->{'source_uri'} = do {
523 3 50       14 if (my $path = $self->source_path) {
524 0         0 URI::file->new_abs($path);
525             } else {
526 3         22 URI::file->cwd;
527             }
528             }
529             }
530            
531 78         16024 return $self->{'source_uri'}
532             }
533              
534             sub destination_uri {
535 86     86 0 111195 my $self = shift @_;
536            
537 86 100       760 if (@_) {
    50          
538 16         172 $self->{'destination_uri'} = shift @_;
539             } elsif (!$self->{'destination_uri'}) {
540 0         0 $self->{'destination_uri'} = do {
541 0 0       0 if (my $path = $self->destination_path) {
542 0         0 URI::file->new_abs($path);
543             } else {
544 0         0 URI::file->cwd;
545             }
546             }
547             }
548            
549 86         690 return $self->{'destination_uri'};
550             }
551              
552              
553              
554             1;