File Coverage

blib/lib/SWISH/Filters/Base.pm
Criterion Covered Total %
statement 59 133 44.3
branch 15 62 24.1
condition 1 8 12.5
subroutine 11 23 47.8
pod 13 16 81.2
total 99 242 40.9


line stmt bran cond sub pod time code
1             package SWISH::Filters::Base;
2 2     2   13 use strict;
  2         24  
  2         77  
3 2     2   11 use Carp;
  2         4  
  2         144  
4 2     2   15 use vars qw( $VERSION );
  2         2  
  2         976  
5              
6             $VERSION = '0.190';
7              
8             =pod
9              
10             =head1 NAME
11              
12             SWISH::Filters::Base - base class for SWISH::Filters
13              
14             =head1 DESCRIPTION
15              
16             Each filter is a subclass of SWISH::Filters::Base. A number of methods
17             are available by default (and some can be overridden). Others are useful
18             when writing your new() constructor.
19              
20              
21             =head1 METHODS
22              
23             =head2 filter
24              
25             You B override this method in your filter subclass.
26              
27             =cut
28              
29             sub filter {
30 0     0 1 0 my $class = ref( shift(@_) );
31 0         0 croak "$class must implement a filter() method";
32             }
33              
34             =head2 parent_filter
35              
36             This method is no longer supported.
37              
38             =cut
39              
40             sub parent_filter {
41 0     0 1 0 croak "parent_filter is no longer supported";
42             }
43              
44             =head2 type
45              
46             This method fetches the type of the filter. The value returned sets the
47             primary sort key for sorting the filters. You can override this in your
48             filter, or just set it as an attribute in your object. The default is 2.
49              
50             The idea of the "type" is to create groups of filters, if needed.
51             For example, you might have a set of filters that are used for uncompressing
52             some documents before passing on to another group for filtering.
53              
54             =cut
55              
56 0 0   0 1 0 sub type { $_[0]->{type} || 2 }
57              
58             =head2 priority
59              
60             This method fetches the priority of the filter. The value returned sets the
61             secondary sort key for sorting the filters. You can override this in your
62             filter, or just set it as an attribute in your object. The default method
63             returns 50.
64              
65             The priority is useful if you have multiple filters for the same content type that
66             use different methods for filtering (say one uses wvWare and another uses catdoc for
67             filtering MS Word files). You might give the wvWare filter a lower priority number
68             so it runs before the catdoc filter if both wvWare AND catdoc happen to be installed
69             at the same time.
70              
71             A lower priority value is given preference over a higher priority value.
72              
73             =cut
74              
75 0 0   0 1 0 sub priority { $_[0]->{priority} || 50 }
76              
77             =head2 mimetypes
78              
79             Returns the list of mimetypes (as regular expressions) set for the filter.
80              
81             =cut
82              
83             sub mimetypes {
84 0     0 1 0 my $self = shift;
85 0 0       0 croak "Filter [$self] failed to set 'mimetypes' in new() constructor\n"
86             if !$self->{mimetypes};
87              
88 0 0       0 croak "Filter [$self] 'mimetypes' entry is not an array reference\n"
89             unless ref $self->{mimetypes} eq 'ARRAY';
90              
91 0         0 return @{ $self->{mimetypes} };
  0         0  
92             }
93              
94             =head2 can_filter_mimetype( I )
95              
96             Returns true if passed in content type matches one of the filter's mimetypes
97             Returns the pattern that matched.
98              
99             =cut
100              
101             sub can_filter_mimetype {
102 0     0 1 0 my ( $self, $content_type ) = @_;
103              
104 0 0       0 croak "Must supply content_type to can_filter_mimetype()"
105             unless $content_type;
106 0         0 for my $pattern ( $self->mimetypes ) {
107 0 0       0 return $pattern if $content_type =~ /$pattern/;
108             }
109 0         0 return;
110             }
111              
112             =head2 mywarn( I )
113              
114             Prints I on STDERR if debugging is set with FILTER_DEBUG environment
115             variable.
116              
117             =cut
118              
119             sub mywarn {
120 76     76 1 89 my $self = shift;
121              
122 76 50       205 print STDERR "Filter: $self: ", @_, "\n" if $ENV{FILTER_DEBUG};
123             }
124              
125             =head2 set_programs( @I );
126              
127             Creates a method for each
128             program with the "run_" prefix. Returns undef if B program cannot
129             be found.
130              
131             If all the programs listed in @I are found
132             and can be executed as the current user,
133             set_programs() returns $self, so you can chain methods together.
134              
135             For example, in your constructor you might do:
136              
137             return $self->set_programs( qw/ pdftotext pdfinfo / );
138              
139             Then in your filter() method:
140              
141             my $content = $self->run_pdfinfo( $doc->fetch_filename, [options] );
142              
143             =cut
144              
145             sub set_programs {
146 6     6 1 16 my ( $self, @progs ) = @_;
147              
148 6         11 for my $prog (@progs) {
149 6         39 my $path = $self->find_binary($prog);
150 6 50       16 unless ($path) {
151 6         16 $self->mywarn(
152             "Can not use Filter: failed to find $prog. Maybe need to install?"
153             );
154 6         70 return;
155             }
156              
157 0 0       0 if ( !$self->can("run_${prog}") ) {
158 2     2   11 no strict 'refs';
  2         4  
  2         188  
159 0         0 *{"run_$prog"} = sub {
160 0     0   0 return shift->run_program( $path, @_ ); # closure
161 0         0 };
162             }
163             }
164              
165 0         0 return $self;
166             }
167              
168             =head2 find_binary( I );
169              
170             Use in a filter's new() method to test for a necesary program located in C<$ENV{PATH}>.
171             Returns the path to the program if I exists and passes the built-in C<-x> test.
172             Returns undefined otherwise.
173              
174             =cut
175              
176 2     2   11 use Config;
  2         4  
  2         1790  
177             my @path_segments;
178              
179             sub find_binary {
180 6     6 1 11 my ( $self, $prog ) = @_;
181              
182 6 100       15 unless (@path_segments) {
183 1   50     14 my $path_sep = $Config{path_sep} || ':';
184              
185 1         16 @path_segments = split /\Q$path_sep/, $ENV{PATH};
186              
187 1 50       4 if ( my $libexecdir = get_libexec() ) {
188 1         2 push @path_segments, $libexecdir;
189             }
190             }
191              
192 6         53 $self->mywarn( "Find path of [$prog] in " . join ':', @path_segments );
193              
194 6         13 for (@path_segments) {
195 48         74 my $path = "$_/$prog";
196              
197             # For buggy Windows98 that accepts forward slashes if the filename isn't too long
198 48 50       216 $path =~ s[/][\\]g if $^O =~ /Win32/;
199              
200 48 50       771 if ( -x $path ) {
201              
202 0         0 $self->mywarn(" * Found program at: [$path]\n");
203 0         0 return $path;
204             }
205 48         121 $self->mywarn(" Not found at path [$path]");
206              
207             # ok, try Windows extenstions
208 48 50       138 if ( $^O =~ /Win32/ ) {
209 0         0 for my $extension (qw/ exe bat /) {
210 0 0       0 if ( -x "$path.$extension" ) {
211 0         0 $self->mywarn(
212             " * Found program at: [$path.$extension]\n");
213 0         0 return "$path.$extension";
214             }
215 0         0 $self->mywarn(" Not found at path [$path.$extension]");
216             }
217             }
218              
219             }
220 6         13 return;
221             }
222              
223             # Try and return libexecdir in case programs are installed there (the case with Windows)
224             # Assumes that we are running from libexecdir or bindir
225             # The other option under Windows would be to fetch libexecdir from the Windows registry,
226             # but that could break if a new (another) swish install was done since the registry
227             # would then point to the new install location.
228              
229             sub get_libexec {
230              
231             # karman changed to return just 'swish-e' and rely on PATH to find it
232 1     1 0 5 return 'swish-e';
233             }
234              
235             =head2 use_modules( @I );
236              
237             Attempts to load each of the modules listed and call its import() method.
238              
239             Use to test and load required modules within a filter without aborting.
240              
241             return unless $self->use_modules( qw/ Spreadsheet::ParseExcel HTML::Entities / );
242              
243             If the module name is an array reference, the first item is considered the module
244             name and the second the minimum version required.
245              
246             return unless $self->use_modules( [ 'Foo::Bar' => '0.123' ] );
247              
248             Returns undef if any module is unavailable.
249             A warning message is displayed if the FILTER_DEBUG environment variable is true.
250              
251             Returns C<$self> on success.
252              
253              
254             =cut
255              
256             sub use_modules {
257 8     8 1 22 my ( $self, @modules ) = @_;
258              
259 8         16 for my $module (@modules) {
260 8         18 my $req_vers = 0;
261 8         13 my $mod;
262 8 50       20 if ( ref $module ) {
263 0         0 ( $mod, $req_vers ) = @$module;
264             }
265             else {
266 8         15 $mod = $module;
267             }
268              
269 8         66 $self->mywarn("trying to load [$mod $req_vers]");
270              
271 8 50       21 if ($req_vers) {
272 0 0       0 eval { eval "use $mod $req_vers"; die "$@\n" if $@; };
  0         0  
  0         0  
273             }
274             else {
275 8 100       12 eval { eval "require $mod" or die "$!\n" };
  8         497  
276             }
277              
278 8 100       228967 if ($@) {
279 6         9 my $err = $@;
280 6         15 my $caller = caller();
281 6         140 $self->mywarn(
282             "Can not use Filter $caller -- need to install $mod $req_vers: $err"
283             );
284 6         224 return;
285             }
286              
287 2         18 $self->mywarn(" ** Loaded $mod **");
288              
289             # Export back to caller
290 2 50       464 $mod->export_to_level(1) if $mod->can('export_to_level');
291             }
292 2         34 return $self;
293             }
294              
295             =head2 run_program( I, @I );
296              
297             Runs I with @I. Must pass in @args.
298              
299             Under Windows calls IPC::Open2, which may pass data through the shell. Double-quotes are
300             escaped (backslashed) and each parameter is wrapped in double-quotes.
301              
302             On other platforms a fork() and exec() is used to avoid passing any data through the shell.
303              
304             Returns a reference to a scalar containing the output from your program, or croaks.
305              
306             This method is intended to read output from a program that converts one format into text.
307             The output is read back in text mode -- on systems like Windows this means \r\n (CRLF) will
308             be convertet to \n.
309              
310             =cut
311              
312             sub run_program {
313 0     0 1   my $self = shift;
314              
315 0 0         croak "No arguments passed to run_program()\n"
316             unless @_;
317              
318 0 0         croak "Must pass arguments to program '$_[0]'\n"
319             unless @_ > 1;
320              
321 0 0 0       my $fh
322             = $^O =~ /Win32/i || $^O =~ /VMS/i
323             ? $self->windows_fork(@_)
324             : $self->real_fork(@_);
325              
326 0           local $/ = undef;
327 0           my $output = <$fh>;
328 0           close $fh;
329              
330             # When using IPC::Open3 need to reap the processes.
331 0 0         waitpid delete $self->{pid}, 0 if $self->{pid};
332              
333 0           return $output;
334             }
335              
336             #==================================================================
337             # Run swish-e by forking
338             #
339              
340 2     2   20 use Symbol;
  2         3  
  2         1233  
341              
342             sub real_fork {
343 0     0 0   my ( $self, @args ) = @_;
344              
345             # Run swish
346 0           my $fh = gensym;
347 0           my $pid = open( $fh, '-|' );
348              
349 0 0         croak "Failed to fork: $!\n" unless defined $pid;
350              
351 0 0         return $fh if $pid;
352              
353 0           delete $self->{temp_file}; # in child, so don't want to delete on destroy.
354              
355 0 0         exec @args or exit; # die "Failed to exec '$args[0]': $!\n";
356             }
357              
358             #=====================================================================================
359             # Need
360             #
361             sub windows_fork {
362 0     0 0   my ( $self, @args ) = @_;
363              
364 0           require IPC::Open2;
365 0           my ( $rdrfh, $wtrfh );
366              
367 0           my @command = map { s/"/\\"/g; qq["$_"] } @args;
  0            
  0            
368              
369 0           my $pid = IPC::Open2::open2( $rdrfh, $wtrfh, @command );
370              
371             # IPC::Open3 uses binmode for some reason (5.6.1)
372             # Assume that the output from the program will be in text
373             # Maybe an invalid assumption if running through a binary filter
374              
375 0           binmode $rdrfh, ':crlf'; # perhpaps: unless delete $self->{binary_output};
376              
377 0           $self->{pid} = $pid;
378              
379 0           return $rdrfh;
380             }
381              
382             =head2 escapeXML( I )
383              
384             Escapes the 5 primary XML characters & < > ' and ", plus all ASCII control
385             characters. Returns the escaped string.
386              
387             =cut
388              
389             sub escapeXML {
390 0     0 1   my $self = shift;
391 0           my $str = shift;
392              
393 0 0         return '' unless defined $str;
394              
395 0           $str =~ s/[\x00-\x1f]/\n/go; # converts all low chars to LF
396              
397 0           for ($str) {
398 0           s/&/&/go;
399 0           s/"/"/go;
400 0           s/
401 0           s/>/>/go;
402 0           s/'/'/go;
403             }
404 0           return $str;
405             }
406              
407             =head2 format_meta_headers( I )
408              
409             Returns XHTML-compliant C tags as a scalar, suitable for inserting into the C
410             tagset of HTML or anywhere in an XML doc.
411              
412             I should be a hash ref of name/content pairs. Both name and content
413             will be run through escapeXML for you, so do B escape them yourself or you
414             run the risk of double-escaped text.
415              
416             =cut
417              
418             sub format_meta_headers {
419 0     0 1   my $self = shift;
420 0 0         my $m = shift or croak "need meta hash ref";
421 0 0 0       croak "$m is not a hash ref" unless ref $m and ref $m eq 'HASH';
422              
423 0           my $metas = join "\n", map {
424 0           ' 425             . $self->escapeXML($_)
426             . '" content="'
427             . $self->escapeXML( $m->{$_} ) . '"/>';
428              
429             } sort keys %$m;
430              
431 0           return $metas;
432             }
433              
434             1;
435              
436             __END__