File Coverage

blib/lib/Plucene/SearchEngine/Index.pm
Criterion Covered Total %
statement 30 30 100.0
branch 3 6 50.0
condition n/a
subroutine 10 10 100.0
pod 2 2 100.0
total 45 48 93.7


line stmt bran cond sub pod time code
1             package Plucene::SearchEngine::Index;
2 1     1   848 use Carp;
  1         2  
  1         83  
3 1     1   5 use strict;
  1         2  
  1         32  
4 1     1   21 use warnings;
  1         2  
  1         43  
5 1     1   3264 use Module::Pluggable (require => 1, search_path => [qw/Plucene::SearchEngine::Index/]);
  1         13209  
  1         9  
6 1     1   97 use File::Spec::Functions qw(catfile);
  1         2  
  1         62  
7 1     1   986 use Plucene::Index::Writer;
  1         161879  
  1         36  
8 1     1   1075 use UNIVERSAL::require;
  1         1796  
  1         12  
9             our $VERSION = "1.1";
10              
11             __PACKAGE__->plugins;
12              
13             =head1 NAME
14              
15             Plucene::SearchEngine::Index - A higher level abstraction for Plucene
16              
17             =head1 SYNOPSIS
18              
19             my $indexer = Plucene::SearchEngine::Index->new(
20             dir => "/var/lib/plucene"
21             );
22              
23             my @documents = map { $_->document }
24             Plucene::SearchEngine::Index::File->examine("foo.html");
25              
26             $indexer->index($_) for @documents;
27              
28             =head1 DESCRIPTION
29              
30             This module makes it easy to write to Plucene indexes. It does so by
31             providing an interface to the index writer which, in terms of
32             complexity, sits between C<Plucene::Index::Writer> and
33             C<Plucene::Simple>; it also provides a framework of modules for turning
34             data into C<Plucene::Document> objects, so that you don't necessarily
35             have to parse them yourself. See L<Plucene::SearchEngine::Index::Base> for
36             more on this.
37              
38             Designed to be used with L<Plucene::SearchEngine::Query>, these two
39             modules aim to make it easy for anyone writing search engines based on
40             Plucene.
41              
42             =head1 METHODS
43              
44             =head2 new
45              
46             my $indexer = Plucene::SearchEngine::Index->new(
47             dir => "/var/plucene/foo",
48             analyzer => "Plucene::Analysis::SimpleAnalyzer",
49             );
50              
51             This creates a new indexer; you must specify the directory to contain
52             the index, and you may specify an analyzer to tokenize the data.
53              
54             =cut
55              
56             sub new {
57 1     1 1 409 my ($class, %args) = @_;
58 1 50       8 croak("No directory given!") unless $args{dir};
59 1         6 my $self = bless {
60             analyzer => "Plucene::Analysis::SimpleAnalyzer",
61             %args
62             }, $class;
63 1 50       13 $self->{analyzer}->require
64             or die "Couldn't require analyzer: $self->{analyzer}";
65 1         12141 return $self;
66             }
67              
68             =head2 index
69              
70             This adds a C<Plucene::Document> to the index.
71              
72             =cut
73              
74             sub index {
75 1     1 1 343 my ($self, $doc) = @_;
76 1         56 $self->_writer->add_document($doc);
77             }
78              
79             sub _writer {
80 1     1   2 my $self = shift;
81 1 50       18 return Plucene::Index::Writer->new(
82             $self->{dir},
83             $self->{analyzer}->new,
84             -e catfile($self->{dir}, "segments") ? 0 : 1
85             );
86             }
87              
88             =head1 Document Frontends and Backends
89              
90             So far so good, but how do you create these C<Plucene::Document> objects? You
91             can, of course, do so manually, but the easiest way is to use the
92             supplied C<Plucene::SearchEngine::Index::File> or
93             C<Plucene::SearchEngine::Index::URL> modules.
94              
95             These two modules are frontends which gather metadata about a file or
96             URL and then hand the data off to one of the backend modules - there are
97             backends supplied for PDF, HTML and plain text files. These in turn
98             return a list of documents found in the file or URL. In most cases,
99             there'll only be one document, but, for instance, a Unix mbox should
100             return an object for each email in the box. These objects can be turned
101             into C<Plucene::Document> objects by calling the C<document> method on
102             them. This isn't done by default because you may wish to mess with the
103             hash yourself, or serialize it, or whatever.
104              
105             =head2 Creating your own backend
106              
107             If you want to handle a different type of file, it's relatively easy to
108             do. All you need to do is create a module called
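109             C<Plucene::SearchEngine::Index::Whatever>; this should inherit from
110             C<Plucene::SearchEngine::Index::Base> and supply a
111             C<gather_data_from_file> method. It should also call the
112             C<register_handler> method to state which MIME types and file extensions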
113             it can handle.
114              
115             For instance, suppose we want to create a backend which grabs metadata
116             from an image and indexes that. (Not unlike
117             L...) We'd start off like this:
118              
119             package Plucene::SearchEngine::Index::Image;
120             use strict;
121             use warnings;
122             use base 'Plucene::SearchEngine::Index::Base';
123             use Image::Info;
124              
125             Now we register the mime types and file extensions we can handle:
126              
127             __PACKAGE__->register_handler(qw(
128             image/bmp .bmp
129             image/gif .gif
130             image/jpeg .jpeg .jpg .jpe
131             ...
132             ));
133              
134             And our C<gather_data_from_file> method will call C<add_data> for
135             each bit of metadata it can find:
136              
137             sub gather_data_from_file {
138             my ($self, $filename) = @_;
139             my $info = image_info($filename);
140             return if $info->{error};
141             $self->add_data("size", "UnStored", scalar html_dim($info));
142             $self->add_data("text", "UnStored", $info->{Comment});
143             $self->add_data("subtype", "UnStored", $info->{file_ext});
144             $self->add_data("created", "Date", Time::Piece->new(
145             str2time($info->{LastModificationTime})));
146             }
147              
148             See L for an explanation of C.
149              
150             Because C<Plucene::SearchEngine::Index> uses a plugin architecture,
151             once this module is installed, it will automatically be called upon to
152             handle those image types it can deal with, without any additional action
153             by the user.
154              
155             =head2 Creating your own frontend
156              
157             For certain types of data, such as emails, news articles, or instant
158             messages, you may not want to use the file or URL frontends.
159             Alternatively, if you have a simple piece of data which isn't
160             file-based, you may just want to do everything yourself. Even then,
161             C can help you to create
162             C - just inherit from it, and use C to add
163             fields to the document in your C method. See
164             L for more details.
165              
166             =head1 SEE ALSO
167              
168             L,
169             L,
170             L, L,
171             L.
172              
173             =head1 AUTHOR
174              
175             Simon Cozens C.
176              
177             This library is free software; you can redistribute it and/or modify it
178             under the same terms as Perl itself.
179              
180             =cut
181              
182              
183             1;