File Coverage

blib/lib/HTML/FormatExternal.pm
Criterion Covered Total %
statement 34 139 24.4
branch 3 66 4.5
condition 0 26 0.0
subroutine 10 18 55.5
pod 4 4 100.0
total 51 253 20.1


line stmt bran cond sub pod time code
1             # Copyright 2008, 2009, 2010, 2011, 2012, 2013, 2015 Kevin Ryde
2              
3             # HTML-FormatExternal is free software; you can redistribute it and/or
4             # modify it under the terms of the GNU General Public License as published
5             # by the Free Software Foundation; either version 3, or (at your option) any
6             # later version.
7             #
8             # HTML-FormatExternal is distributed in the hope that it will be useful, but
9             # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
10             # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11             # for more details.
12             #
13             # You should have received a copy of the GNU General Public License along
14             # with HTML-FormatExternal. If not, see .
15              
16              
17              
18             # Maybe:
19             # capture error output
20             # errors_to => \$var
21             # combine error messages
22             #
23              
24              
25             package HTML::FormatExternal;
26 5     5   1498 use 5.006;
  5         10  
  5         131  
27 5     5   16 use strict;
  5         4  
  5         96  
28 5     5   13 use warnings;
  5         5  
  5         147  
29 5     5   17 use Carp;
  5         7  
  5         234  
30 5     5   16 use File::Spec 0.80; # version 0.80 of perl 5.6.0 or thereabouts for devnull()
  5         82  
  5         76  
31 5     5   3642 use IPC::Run;
  5         144701  
  5         467  
32              
33             # uncomment this to run the ### lines
34             # use Smart::Comments;
35              
36             our $VERSION = 23;
37              
38             sub new {
39 14     14 1 11220 my ($class, %self) = @_;
40 14         45 return bless \%self, $class;
41             }
42             sub format {
43 0     0 1 0 my ($self, $html) = @_;
44 0 0       0 if (ref $html) { $html = $html->as_HTML; }
  0         0  
45 0         0 return $self->format_string ($html, %$self);
46             }
47              
48 5     5   34 use constant _WIDE_INPUT_CHARSET => 'UTF-8';
  5         7  
  5         287  
49 5     5   19 use constant _WIDE_OUTPUT_CHARSET => 'UTF-8';
  5         6  
  5         4491  
50              
51             # format_string() takes the easy approach of putting the string in a temp
52             # file and letting format_file() do the real work. The formatter programs
53             # can generally read stdin and write stdout, so might do that with select()
54             # to simultaneously write and read back.
55             #
56             sub format_string {
57 0     0 1 0 my ($class, $html_str, %options) = @_;
58              
59 0         0 my $fh = _tempfile();
60 0         0 my $input_wide = eval { utf8::is_utf8($html_str) };
  0         0  
61 0         0 _output_wide(\%options, $input_wide);
62              
63             # insert while in wide chars
64 0 0       0 if (defined $options{'base'}) {
65 0         0 $html_str = _base_prefix(\%options, $html_str, $input_wide);
66             }
67              
68 0 0       0 if ($input_wide) {
69 0 0       0 if (! $options{'input_charset'}) {
70 0         0 $options{'input_charset'} = $class->_WIDE_INPUT_CHARSET;
71             }
72             ### input_charset for wide: $options{'input_charset'}
73 0 0       0 if ($options{'input_charset'} eq 'entitize') {
74 0         0 $html_str = _entitize($html_str);
75 0         0 delete $options{'input_charset'};
76             } else {
77 0         0 my $layer = ":encoding($options{'input_charset'})";
78 0 0       0 binmode ($fh, $layer) or die 'Cannot add layer ',$layer;
79             }
80             }
81              
82 0 0       0 do {
83 0 0       0 print $fh $html_str
84             and close($fh)
85             } || die 'Cannot write temp file: ',$!;
86              
87 0         0 return $class->format_file ($fh->filename, %options);
88             }
89              
90             # Left margin is synthesized by adding spaces afterwards because the various
91             # programs have pretty variable support for a specified margin.
92             # * w3m doesn't seem to have a left margin option at all
93             # * lynx has one but it's too well hidden in its style sheet or something
94             # * elinks has document.browse.margin_width but it's limited to 8 or so
95             # * netrik doesn't seem to have one at all
96             # * vilistextum has a "spaces" internally for lists etc but no apparent
97             # way to initialize from the command line
98             #
99             sub format_file {
100 0     0 1 0 my ($class, $filename, %options) = @_;
101              
102             # If neither leftmargin nor rightmargin are specified then '_width' is
103             # unset and the _make_run() funcs leave it to the program defaults.
104             #
105             # If either leftmargin or rightmargin are set then '_width' is established
106             # and the _make_run() funcs use it and and zero left margin, then the
107             # actual left margin is applied below.
108             #
109             # The DEFAULT_LEFTMARGIN and DEFAULT_RIGHTMARGIN establish the defaults
110             # when just one of the two is set. Not good hard coding those values,
111             # but the programs don't have anything to set one but not the other.
112             #
113 0         0 my $leftmargin = $options{'leftmargin'};
114 0         0 my $rightmargin = $options{'rightmargin'};
115 0 0 0     0 if (defined $leftmargin || defined $rightmargin) {
116 0 0       0 if (! defined $leftmargin) { $leftmargin = $class->DEFAULT_LEFTMARGIN; }
  0         0  
117 0 0       0 if (! defined $rightmargin) { $rightmargin = $class->DEFAULT_RIGHTMARGIN; }
  0         0  
118 0         0 $options{'_width'} = $rightmargin - $leftmargin;
119             }
120              
121 0         0 _output_wide(\%options, 0); # file input is reckoned as not wide
122 0 0       0 if ($options{'output_wide'}) {
123 0   0     0 $options{'output_charset'} ||= $class->_WIDE_OUTPUT_CHARSET;
124             }
125              
126 0         0 my $tempfh;
127 0 0       0 if (defined $options{'base'}) {
128             # insert by copying to a temp file
129              
130             # File::Copy rudely calls eq() to compare $from and $to. Need either
131             # File::Temp 0.18 to have that work on $tempfh, or File::Copy 2.??? for
132             # it to check an overload method exists first. Newer File::Temp is
133             # available from cpan, where File::Copy may not be, so ask for
134             # File::Temp 0.18.
135 0         0 require File::Temp;
136 0         0 File::Temp->VERSION(0.18);
137              
138             # must sysread()/syswrite() because that's what File::Copy does (as of
139             # its version 2.30) so anything held in the perl buffering by the normal
140             # read() is lost.
141              
142 0         0 my $initial;
143             my $fh;
144 0 0       0 do {
145 0 0 0     0 open $fh, '<', $filename
146             and binmode $fh
147             and defined (sysread $fh, $initial, 4)
148             } || croak "Cannot open $filename: $!";
149             ### $initial
150              
151 0         0 $initial = _base_prefix(\%options, $initial, 0);
152              
153 0         0 $tempfh = _tempfile();
154 0         0 $tempfh->autoflush(1);
155 0         0 require File::Copy;
156 0 0       0 do {
157 0 0 0     0 defined(syswrite($tempfh, $initial))
      0        
158             and File::Copy::copy($fh, $tempfh)
159             and close $tempfh
160             and close $fh
161             } || croak "Cannot copy $filename to temp file: $!";
162              
163              
164 0         0 $filename = $tempfh->filename;
165             }
166              
167             # # dump the file being crunched
168             # print "Bytes passed to program:\n";
169             # IPC::Run::run(['hd'], '<',$filename, '|',['cat']);
170              
171             # _make_run() can set $options{'ENV'} too
172 0         0 my ($command_aref, @run) = $class->_make_run($filename, \%options);
173 0   0     0 my $env = $options{'ENV'} || {};
174              
175             ### $command_aref
176             ### @run
177             ### $env
178              
179 0 0       0 if (! @run) {
180 0         0 push @run, '<', File::Spec->devnull;
181             }
182              
183 0         0 my $str;
184             {
185 0         0 local %ENV = (%ENV, %$env); # overrides from _make_command()
  0         0  
186 0         0 eval { IPC::Run::run($command_aref,
  0         0  
187             @run,
188             '>', \$str,
189             # FIXME: what to do with stderr ?
190             # '2>', File::Spec->devnull,
191             ) };
192             }
193 0         0 _die_on_insecure();
194             ### $str
195              
196             ### final output_wide: $options{'output_wide'}
197 0 0       0 if ($options{'output_wide'}) {
198 0         0 require Encode;
199 0         0 $str = Encode::decode ($options{'output_charset'}, $str);
200             }
201              
202 0 0       0 if ($leftmargin) {
203 0         0 my $fill = ' ' x $leftmargin;
204 0         0 $str =~ s/^(.)/$fill$1/mg; # non-empty lines only
205             }
206 0         0 return $str;
207             }
208              
209             # most program running errors are quietly ignored for now, but re-throw
210             # "Insecure $ENV{PATH}" when cannot run due to taintedness.
211             sub _die_on_insecure {
212 0 0   0   0 if ($@ =~ /^Insecure/) {
213 0         0 die $@;
214             }
215             }
216              
217             sub _run_version {
218 39     39   53 my ($self_or_class, $command_aref, @ipc_options) = @_;
219             ### _run_version() ...
220             ### $command_aref
221             ### @ipc_options
222              
223 39 100       77 if (! @ipc_options) {
224 29         102 @ipc_options = ('2>', File::Spec->devnull);
225             }
226              
227 39         32 my $version; # left undef if any exec/slurp problem
228 39         35 eval { IPC::Run::run($command_aref,
  39         120  
229             '<', File::Spec->devnull,
230             '>', \$version,
231             @ipc_options) };
232              
233             # strip blank lines at end of lynx, maybe others
234 39 50       33619 if (defined $version) { $version =~ s/\n+$/\n/s; }
  0         0  
235 39         90 return $version;
236             }
237              
238             # return a File::Temp filehandle object
239             sub _tempfile {
240 0     0     require File::Temp;
241 0           my $fh = File::Temp->new (TEMPLATE => 'HTML-FormatExternal-XXXXXX',
242             SUFFIX => '.html',
243             TMPDIR => 1);
244 0 0         binmode($fh) or die 'Oops, cannot set binmode() on temp file';
245              
246             ### tempfile: $fh->filename
247             # $fh->unlink_on_destroy(0); # to preserve for debugging ...
248              
249 0           return $fh;
250             }
251              
252             sub _output_wide {
253 0     0     my ($options, $input_wide) = @_;
254 0 0 0       if (! defined $options->{'output_wide'}
255             || $options->{'output_wide'} eq 'as_input') {
256 0           $options->{'output_wide'} = $input_wide;
257             }
258             }
259              
260             # $str is HTML or some initial bytes.
261             # Return a new string with at the start.
262             #
263             sub _base_prefix {
264 0     0     my ($options, $str, $input_wide) = @_;
265 0           my $base = delete $options->{'base'};
266             ### _base_prefix: $base
267              
268 0           $base = "$base"; # stringize possible URI object
269 0           $base = _entitize($base); # probably shouldn't be any non-ascii in a url
270 0           $base = "\n";
271              
272 0           my $pos = 0;
273 0 0         unless ($input_wide) {
274             # encode $base in the input_charset, and possibly after a BOM.
275             #
276             # Lynx recognises a BOM, if it doesn't have other -assume_charset. It
277             # recognises it only at the start of the file, so must insert
278             # after it here to preserve that feature of Lynx.
279             #
280             # If input_charset is utf-32 or utf-16 then it seems reasonable to step
281             # over any BOM. But Lynx for some reason doesn't like a BOM together
282             # with utf-32 or utf-16 specified. Dunno if that's a bug or a feature
283             # on its part.
284              
285 0           my $input_charset = $options->{'input_charset'};
286 0 0 0       if (! defined $input_charset || lc($input_charset) eq 'utf-32') {
287 0 0         if ($str =~ /^\000\000\376\377/) {
    0          
288 0           $input_charset = 'utf-32be';
289 0           $pos = 4;
290             } elsif ($str =~ /^\377\376\000\000/) {
291 0           $input_charset = 'utf-32le';
292 0           $pos = 4;
293             }
294             }
295 0 0 0       if (! defined $input_charset || lc($input_charset) eq 'utf-16') {
296 0 0         if ($str =~ /^\376\377/) {
    0          
297 0           $input_charset = 'utf-16be';
298 0           $pos = 4;
299             } elsif ($str =~ /^\377\376/) {
300 0           $input_charset = 'utf-16le';
301 0           $pos = 2;
302             }
303             }
304 0 0         if (defined $input_charset) {
305             # encode() errors out if unknown charset, and doesn't exist for older
306             # Perl, in which case leave $base as ascii. May not be right, but
307             # ought to work with the various ASCII superset encodings.
308 0           eval {
309 0           require Encode;
310 0           $base = Encode::encode ($input_charset, $base);
311             };
312             }
313             }
314 0           substr($str, $pos,0, $base); # insert $base at $pos
315 0           return $str;
316             }
317              
318             # return $str with non-ascii replaced by { entities
319             sub _entitize {
320 0     0     my ($str) = @_;
321 0           $str =~ s{([^\x20-\x7E])}{'&#'.ord($1).';'}eg;
  0            
322             ### $str
323 0           return $str;
324             }
325              
326             1;
327             __END__