File Coverage

blib/lib/SWISH/Filter.pm
Criterion Covered Total %
statement 48 144 33.3
branch 4 62 6.4
condition 2 13 15.3
subroutine 12 21 57.1
pod 8 9 88.8
total 74 249 29.7


line stmt bran cond sub pod time code
1             package SWISH::Filter;
2              
3 2     2   20143 use 5.005;
  2         6  
  2         64  
4 2     2   8 use strict;
  2         3  
  2         58  
5 2     2   9 use File::Basename;
  2         2  
  2         164  
6 2     2   9 use Carp;
  2         3  
  2         107  
7 2     2   645 use SWISH::Filter::MIMETypes;
  2         4  
  2         77  
8 2     2   894 use SWISH::Filter::Document;
  2         4  
  2         58  
9 2     2   769 use SWISH::Filters::Base;
  2         2  
  2         72  
10             use Module::Pluggable
11 2         14 search_path => 'SWISH::Filters',
12             except => 'SWISH::Filters::Base',
13             sub_name => 'filters_found',
14             require => 1,
15 2     2   935 instantiate => 'new';
  2         14274  
16              
17 2     2   154 use vars qw/ $VERSION %extra_methods /;
  2         3  
  2         2506  
18              
19             $VERSION = '0.191';
20              
21             # Define the available parameters
22             %extra_methods = map { $_ => 1 } qw( meta_data name user_data );
23              
24             # For testing only
25              
26             if ( $0 =~ 'Filter.pm' && @ARGV >= 2 && shift =~ /^test/i ) {
27             die "Please use the 'swish-filter-test' program.\n";
28             }
29              
30             =head1 NAME
31              
32             SWISH::Filter - filter documents for indexing with Swish-e
33              
34             =head1 SYNOPSIS
35              
36             use SWISH::Filter;
37              
38             # load available filters into memory
39             my $filter = SWISH::Filter->new;
40              
41              
42             # convert a document
43              
44             my $doc = $filter->convert(
45             document => \$scalar_ref, # path or ref to a doc
46             content_type => $content_type, # content type if doc reference
47             name => $real_path, # optional name for this file (useful for debugging)
48             user_data => $whatever, # optional data to make available to filters
49             );
50              
51             return unless $doc; # empty doc, zero size, or no filters installed
52              
53             # Was the document converted by a filter?
54             my $was_filtered = $doc->was_filtered;
55              
56             # Skip if the file is not text
57             return if $doc->is_binary;
58              
59             # Print out the doc
60             my $doc_ref = $doc->fetch_doc;
61             print $$doc_ref;
62              
63             # Fetch the final content type of the document
64             my $content_type = $doc->content_type;
65              
66             # Fetch Swish-e parser type (TXT*, XML*, HTML*, or undefined)
67             my $doc_type = $doc->swish_parser_type;
68              
69             =head1 DESCRIPTION
70              
71             SWISH::Filter provides a unified way to convert documents into a type that
72             Swish-e can index. Individual filters are installed as separate subclasses (modules).
73             For example, there might be a filter that converts from PDF format to HTML
74             format.
75              
76             SWISH::Filter is a framework that relies on other packages to do the heavy lifting
77             of converting non-text documents to text. B
78             programs or Perl modules may need to be installed to use SWISH::Filter to filter
79             documents.> For example, to filter PDF documents you must install the C
80             package.
81              
82             The filters are automatically loaded when Cnew()> is
83             called. Filters define a type and priority that determines the processing
84             order of the filter. Filters are processed in this sort order until a filter
85             accepts the document for filtering. The filter uses the document's content type
86             to determine if the filter should handle the current document. The
87             content-type is determined by the files suffix if not supplied by the calling
88             program.
89              
90             The individual filters are not designed to be used as separate modules. All
91             access to the filters is through this SWISH::Filter module.
92              
93             Normally, once a document is filtered processing stops. Filters can filter the
94             document and then set a flag saying that filtering should continue (for example
95             a filter that uncompresses a MS Word document before passing on to the filter
96             that converts from MS Word to text). All this should be transparent to the end
97             user. So, filters can be pipe-lined.
98              
99             The idea of SWISH::Filter is that new filters can be created, and then
100             downloaded and installed to provide new filtering capabilities. For example,
101             if you needed to index MS Excel documents you might be able to download a
102             filter from the Swish-e site and magically next time you run indexing MS Excel
103             docs would be indexed.
104              
105             The SWISH::Filter setup can be used with -S prog or -S http. It works best
106             with the -S prog method because the filter modules only need to be loaded and
107             compiled one time. The -S prog program F will automatically use
108             SWISH::Filter when spidering with default settings (using "default" as the
109             first parameter to spider.pl).
110              
111             The -S http indexing method uses a Perl helper script called F.
112             F has been updated to work with SWISH::Filter, but (unlike
113             spider.pl) does not contain a "use lib" line to point to the location of
114             SWISH::Filter. This means that by default F will B use
115             SWISH::Filter for filtering. The reason for this is because F
116             runs for every URL fetched, and loading the Filters for each document can be
117             slow. The recommended way of spidering is using -S prog with spider.pl, but if
118             -S http is desired the way to enable SWISH::Filter is to set PERL5LIB before
119             running swish so that F will be able to locate the SWISH::Filter
120             module. Here's one way to set the PERL5LIB with the bash shell:
121              
122             $ export PERL5LIB=`swish-filter-test -path`
123              
124              
125              
126             =head1 METHODS
127              
128             =head2 new( %I )
129              
130             new() creates a SWISH::Filter object. You may pass in options as a list or a hash reference.
131              
132             =head3 Options
133              
134             There is currently only one option that can be passed in to new():
135              
136             =over 4
137              
138             =item ignore_filters
139              
140             Pass in a reference to a list of filter names to ignore. For example, if you have two filters installed
141             "Pdf2HTML" and "Pdf2XML" and want to avoid using "Pdf2XML":
142              
143             my $filter = SWISH::Filter->new( ignore_filters => ['Pdf2XML'];
144              
145             =back
146              
147             =cut
148              
149             sub new {
150 1     1 1 313 my $class = shift;
151 1   33     10 $class = ref($class) || $class;
152              
153 1 0       3 my %attr = ref $_[0] ? %{ $_[0] } : @_ if @_;
  0 50       0  
154              
155 1         2 my $self = bless {}, $class;
156              
157 1         5 $self->{skip_filters} = {};
158              
159 1 50       3 $self->ignore_filters( delete $attr{ignore_filters} )
160             if $attr{ignore_filters};
161              
162 1         3 warn "Unknown SWISH::Filter->new() config setting '$_'\n" for keys %attr;
163              
164 1         6 $self->{mimetypes} = SWISH::Filter::MIMETypes->new;
165              
166 1         2 $self->create_filter_list(%attr);
167              
168 1   50     9 $self->{doc_class} ||= 'SWISH::Filter::Document';
169              
170 1         8 return $self;
171             }
172              
173             sub ignore_filters {
174 0     0 1 0 my ( $self, $filters ) = @_;
175              
176 0 0       0 unless ($filters) {
177 0 0       0 return unless $self->{ignore_filter_list};
178 0         0 return @{ $self->{ignore_filter_list} };
  0         0  
179             }
180              
181 0         0 @{ $self->{ignore_filter_list} } = @$filters;
  0         0  
182              
183             # create lookup hash for filters to skip
184 0         0 $self->{skip_filters} = { map { $_, 1 } @$filters };
  0         0  
185             }
186              
187             =head2 doc_class
188              
189             If you subclass SWISH::Filter::Document with your own class, indicate your class
190             name in the new() method with the C param. The return value of doc_class()
191             is used in convert() for instatiating the Document object. The default value is
192             C.
193              
194             =cut
195              
196             sub doc_class {
197 0     0 1 0 return $_[0]->{doc_class};
198             }
199              
200             =head2 convert
201              
202             This method filters a document. Returns an object belonging to doc_class()
203             on success. If passed an empty document, a filename that cannot be read off disk, or
204             if no filters have been loaded, returns undef.
205              
206             See the SWISH::Filter::Document documentation.
207              
208             You must pass in a hash (or hash reference) of parameters to the convert() method. The
209             possible parameters are:
210              
211             =over 8
212              
213             =item document
214              
215             This can be either a path to a file, or a scalar reference to a document in memory.
216             This is required.
217              
218             =item content_type
219              
220             The MIME type of the document. This is only required when passing in a scalar
221             reference to a document. The content type string is what the filters use to
222             match a document type.
223              
224             When passing in a file name and C is not set, then the content type will
225             be determined from the file's extension by using the MIME::Types Perl module (available on CPAN).
226              
227             =item name
228              
229             Optional name to pass in to filters that will be used in error and warning messages.
230              
231             =item user_data
232              
233             Optional data structure that all filters may access.
234             This can be fetched in a filter by:
235              
236             my $user_data = $doc_object->user_data;
237              
238             And used in the filter as:
239              
240             if ( ref $user_data && $user_data->{pdf2html}{title} ) {
241             ...
242             }
243              
244             It's up to the filter author to use a unique first-level hash key for a given filter.
245              
246             =item meta_data
247              
248             Optional data structure intended for meta name/content pairs for HTML
249             or XML output. See SWISH::Filter::Document for discussion of this data.
250              
251             =back
252              
253             Example of using the convert() method:
254              
255             $doc_object = $filter->convert(
256             document => $doc_ref,
257             content-type => 'application/pdf',
258             );
259              
260             =cut
261              
262             sub convert {
263 0     0 1 0 my $self = shift;
264 0 0       0 my %attr = ref $_[0] ? %{ $_[0] } : @_ if @_;
  0 0       0  
265              
266             # Any filters?
267 0 0       0 return unless $self->filter_list;
268              
269 0   0     0 my $doc = delete $attr{document}
270             || die
271             "Failed to supply document attribute 'document' when calling filter()\n";
272              
273 0         0 my $content_type = delete $attr{content_type};
274              
275 0 0       0 if ( ref $content_type ) {
276 0         0 my $type = $self->decode_content_type($$content_type);
277              
278 0 0       0 unless ($type) {
279 0         0 warn
280             "Failed to set content type for file reference '$$content_type'\n";
281 0         0 return;
282             }
283 0         0 $content_type = $type;
284             }
285              
286 0 0       0 if ( ref $doc ) {
287 0 0       0 die
288             "Must supply a content type when passing in a reference to a document\n"
289             unless $content_type;
290             }
291             else {
292 0   0     0 $content_type ||= $self->decode_content_type($doc);
293 0 0       0 unless ($content_type) {
294 0         0 warn "Failed to set content type for document '$doc'\n";
295 0         0 return;
296             }
297              
298 0   0     0 $attr{name} ||= $doc; # Set default name of document
299             }
300              
301 0         0 $self->mywarn(
302             "\n>> Starting to process new document: $attr{name} -> $content_type"
303             );
304              
305             ## Create a new document object
306              
307 0         0 my $doc_object = $self->doc_class->new( $doc, $content_type );
308 0 0       0 return unless $doc_object; # fails on empty doc or doc not readable
309              
310 0         0 $self->_set_extra_methods( $doc_object, {%attr} );
311              
312             # Now run through the filters
313 0         0 for my $filter ( $self->filter_list ) {
314              
315 0         0 $self->mywarn(" ++Checking filter [$filter] for $content_type");
316              
317             # can this filter handle this content type?
318 0 0       0 next unless $filter->can_filter_mimetype( $doc_object->content_type );
319              
320 0         0 my $start_content_type = $doc_object->content_type;
321 0         0 my ( $filtered_doc, $metadata );
322              
323             # run the filter
324 0         0 eval {
325 0         0 local $SIG{__DIE__};
326 0         0 ( $filtered_doc, $metadata ) = $filter->filter($doc_object);
327             };
328              
329 0 0       0 if ($@) {
330 0         0 $self->mywarn(
331             "Problems with filter '$filter'. Filter disabled:\n -> $@");
332 0         0 $self->filter_list(
333 0         0 [ grep { $_ != $filter } $self->filter_list ] );
334 0         0 next;
335             }
336              
337 0 0       0 $self->mywarn( " ++ $content_type "
338             . ( $filtered_doc ? '*WAS*' : 'was not' )
339             . " filtered by $filter\n" );
340              
341             # save the working filters in this list
342              
343 0 0       0 if ($filtered_doc) { # either a file name or a reference to the doc
344              
345             # Track chain of filters
346              
347 0         0 push @{ $doc_object->{filters_used} },
  0         0  
348             {
349             name => $filter,
350             start_content_type => $start_content_type,
351             end_content_type => $doc_object->content_type,
352             };
353              
354             # and save it (filename or reference)
355 0         0 $doc_object->cur_doc($filtered_doc);
356              
357             # set meta_data explicitly since %attr only has what we originally had
358 0         0 $doc_object->set_meta_data($metadata);
359 0         0 delete $attr{'meta_data'};
360              
361             # All done?
362 0 0       0 last unless $doc_object->continue(0);
363              
364 0         0 $self->_set_extra_methods( $doc_object, {%attr} );
365              
366 0         0 $content_type = $doc_object->content_type();
367             }
368             }
369              
370 0 0       0 $doc_object->dump_filters_used if $ENV{FILTER_DEBUG};
371              
372 0         0 return $doc_object;
373              
374             }
375              
376             sub _set_extra_methods {
377 0     0   0 my ( $self, $doc_object, $attr ) = @_;
378              
379 0         0 local $SIG{__DIE__};
380 0         0 local $SIG{__WARN__};
381              
382             # Look for left over config settings that we do not know about
383              
384 0         0 for my $setting ( keys %extra_methods ) {
385 0 0       0 next unless $attr->{$setting};
386 0         0 my $method = "set_" . $setting;
387 0         0 $doc_object->$method( delete $attr->{$setting} );
388              
389             # if given a document name then use that in error messages
390              
391 0 0       0 if ( $setting eq 'name' ) {
392             $SIG{__DIE__}
393 0     0   0 = sub { die "$$ Error- ", $doc_object->name, ": ", @_ };
  0         0  
394             $SIG{__WARN__}
395 0     0   0 = sub { warn "$$ Warning - ", $doc_object->name, ": ", @_ };
  0         0  
396             }
397             }
398              
399 0         0 warn "Unknown filter config setting '$_'\n" for keys %$attr;
400              
401             }
402              
403             =head2 mywarn
404              
405             Internal method used for writing warning messages to STDERR if
406             $ENV{FILTER_DEBUG} is set. Set the environment variable FILTER_DEBUG before
407             running to see extra messages while processing.
408              
409             =cut
410              
411             sub mywarn {
412 0     0 1 0 my $self = shift;
413              
414 0 0       0 print STDERR @_, "\n" if $ENV{FILTER_DEBUG};
415             }
416              
417             =head2 filter_list
418              
419             Returns a list of filter objects installed.
420              
421             =cut
422              
423             sub filter_list {
424 1     1 1 3 my ( $self, $filter_ref ) = @_;
425              
426 1 50       4 unless ($filter_ref) {
427 0 0       0 return ref $self->{filters} ? @{ $self->{filters} } : ();
  0         0  
428             }
429              
430 1         4 $self->{filters} = $filter_ref;
431             }
432              
433             # Creates the list of filters
434             sub create_filter_list {
435 1     1 0 2 my $self = shift;
436 1         2 my %attr = @_;
437              
438 1         2 my @filters = grep {defined} $self->filters_found(%attr);
  1         31  
439              
440 1 50       8 unless (@filters) {
441 0         0 warn "No SWISH filters found\n";
442 0         0 return;
443             }
444              
445             # Now sort the filters in order.
446 1 0       3 @filters = sort { $a->type <=> $b->type || $a->priority <=> $b->priority }
  0         0  
447             @filters;
448 1         7 $self->filter_list( \@filters );
449             }
450              
451             =head2 can_filter( I )
452              
453             This is useful for testing to see if a mimetype might be handled by SWISH::Filter
454             wihtout having to pass in a document. Helpful if doing HEAD requests.
455              
456             Returns an array of filters that can handle this type of document
457              
458             =cut
459              
460             my %can_filter = (); # memoize
461              
462             sub can_filter {
463 0     0 1   my ( $self, $content_type ) = @_;
464              
465 0 0         unless ($content_type) {
466 0           carp "Failed to pass in a content type to can_filter() method";
467 0           return;
468             }
469              
470 0 0         if ( exists $can_filter{$content_type} ) {
471 0           return @{ $can_filter{$content_type} };
  0            
472             }
473             else {
474 0           $can_filter{$content_type} = [];
475             }
476              
477 0           for my $filter ( $self->filter_list ) {
478 0 0         if ( $filter->can_filter_mimetype($content_type) ) {
479 0           push @{ $can_filter{$content_type} }, $filter;
  0            
480             }
481             }
482              
483 0           return @{ $can_filter{$content_type} };
  0            
484             }
485              
486             =head2 decode_content_type( I )
487              
488             Returns MIME type for I if known.
489              
490             =cut
491              
492             sub decode_content_type {
493 0     0 1   my ( $self, $file ) = @_;
494              
495 0 0         return unless $file;
496              
497 0           return $self->{mimetypes}->get_mime_type($file);
498             }
499              
500             =head1 WRITING FILTERS
501              
502             Filters are standard perl modules that are installed into the C name space.
503             Filters are not complicated -- see the core SWISH::Filters::* modules for examples.
504              
505             Each filter defines the content-types (or mimetypes) that it can handle. These
506             are specified as a list of regular expressions to match against the document's
507             content-type. If one of the mimetypes of a filter match the incoming
508             document's content-type the filter is called. The filter can then either
509             filter the content or return undefined indicating that it decided not to
510             filter the document for some reason. If the document is converted the filter
511             returns either a reference to a scalar of the content or a file name where the
512             content is stored. The filter also must change the content-type of the document
513             to reflect the new document.
514              
515             Filters typically use external programs or modules to do that actual work of
516             converting a document from one type to another. For example, programs in the
517             Xpdf packages are used for converting PDF files. The filter can (and should)
518             test for those programs in its new() method.
519              
520             Filters also can define a type and priority. These attributes are used
521             to set the order filters are tested for a content-type match. This allows
522             you to have more than one filter that can work on the same content-type. A lower
523             priority value is given preference over a higher priority value.
524              
525             If a filter calls die() then the filter is removed from the chain and will not be
526             called again I. Calling die when running with -S http or
527             -S fs has no effect since the program is run once per document.
528              
529             Once a filter returns something other than undef no more filters will be
530             called. If the filter calls $filter-Eset_continue then processing will
531             continue as if the file was not filtered. For example, a filter can uncompress
532             data and then set $filter-Eset_continue and let other filters process the
533             document.
534              
535              
536             A filter may define the following methods (required methods are indicated):
537              
538             =over 4
539              
540             =item new() B
541              
542             This method returns either an object which provides access to the filter, or undefined
543             if the filter is not to be used.
544              
545             The new() method is a good place to check for required modules or helper programs.
546             Returning undefined prevents the filter from being included in the filter chain.
547              
548             The new method must return a blessed hash reference. The only required attribute
549             is B. This attribute must contain a reference to an array of regular
550             expressions used for matching the content-type of the document passed in.
551              
552             Example:
553              
554             sub new {
555             my ( $class ) = @_;
556              
557             # List of regular expressions
558             my @mimetypes = (
559             qr[application/(x-)?msword],
560             qr[application/worddoc],
561             );
562              
563             my %settings = (
564             mimetypes => \@mimetypes,
565              
566             # Optional settings
567             priority => 20,
568             type => 2,
569             );
570              
571             return bless \%settings, $class;
572             }
573              
574             The attribute "mimetypes" returns an array reference to a list of regular
575             expressions. Those patterns are matched against each document's content type.
576              
577             =item filter() B
578              
579             This is the function that does the work of converting a document from one content type
580             to another. The function is passed the document object. See document object methods
581             listed below for what methods may be called on a document.
582              
583             The function can return undefined (or any false value) to indicate that the
584             filter did not want to process the document. Other filters will then be tested for
585             a content type match.
586              
587             If the document is filtered then the filter must set the new document's content
588             type (if it changed) and return either a file name where the document can be found or
589             a reference to a scalar containing the document.
590              
591             The filter() method may also return a second value for storing metadata. The value
592             is typically a hash ref of name/value pairs. This value can then
593             be accessed via the meta_data() method in the SWISH::Filter::Document class.
594              
595             =item type()
596              
597             Returns a number. Filters are sorted (for processing in a specific order)
598             and this number is simply the primary key used in sorting. If not specified
599             the filter's type used for sorting is 2.
600              
601             This is an optional method. You can also set the type in your new() constructor
602             as shown above.
603              
604              
605             =item priority()
606              
607             Returns a number. Filters are sorted (for processing in a specific order)
608             and this number is simply the secondary key used in sorting. If not specified
609             the filter's priority is 50.
610              
611             This is an optional method. You can also set the priority in your new() constructor
612             as shown above.
613              
614              
615             =back
616              
617             Again, the point of the type() and priority() methods is to allow setting the sort order
618             of the filters. Useful if you have two filters for filtering the same content-type,
619             but prefer to use one over the other. Neither are required.
620              
621              
622             =head1 EXAMPLE FILTER
623              
624             Here's a module to convert MS Word documents using the program "catdoc":
625              
626             package SWISH::Filters::Doc2txt;
627             use vars qw/ $VERSION /;
628              
629             $VERSION = '0.191';
630              
631              
632             sub new {
633             my ( $class ) = @_;
634              
635             my $self = bless {
636             mimetypes => [ qr!application/(x-)?msword! ],
637             priority => 50,
638             }, $class;
639              
640              
641             # check for helpers
642             return $self->set_programs( 'catdoc' );
643              
644             }
645              
646              
647             sub filter {
648             my ( $self, $doc ) = @_;
649              
650             my $content = $self->run_catdoc( $doc->fetch_filename ) || return;
651              
652             # update the document's content type
653             $filter->set_content_type( 'text/plain' );
654              
655             # return the document
656             return \$content;
657             }
658             1;
659              
660             The new() constructor creates a blessed hash which contains an array reference
661             of mimetypes patterns that this filter accepts. The priority sets this
662             filter to run after any other filters that might handle the same type of content.
663             The F function says that we need to call a program called "catdoc".
664             The function either returns $self or undefined if catdoc could not be found.
665             The F function creates a new method for running catdoc.
666              
667             The filter function runs catdoc passing in the name of the file (if the file is in memory
668             a temporary file is created). That F function was created by the
669             F call above.
670              
671              
672             =cut
673              
674             1;
675             __END__