File Coverage

blib/lib/Dezi/Lucy/Indexer.pm
Criterion Covered Total %
statement 4 6 66.6
branch n/a
condition n/a
subroutine 2 2 100.0
pod n/a
total 6 8 75.0


line stmt bran cond sub pod time code
1             package Dezi::Lucy::Indexer;
2 1     1   22866 use Moose;
  1         503725  
  1         7  
3             extends 'Dezi::Indexer';
4              
5 1     1   8522 use Dezi::Lucy::InvIndex;
  0            
  0            
6              
7             use Lucy::Index::Indexer;
8             use Lucy::Plan::Schema;
9             use Lucy::Plan::FullTextType;
10             use Lucy::Plan::StringType;
11             use Lucy::Analysis::PolyAnalyzer;
12              
13             use Carp;
14             use SWISH::3 qw( :constants );
15             use Scalar::Util qw( blessed );
16             use Data::Dump qw( dump );
17             use Search::Tools::UTF8;
18             use Path::Class::File::Lockable;
19             use Sys::Hostname qw( hostname );
20             use Digest::MD5 ();
21              
22             our $VERSION = '0.014';
23              
24             has 'highlightable_fields' =>
25             ( is => 'rw', isa => 'Bool', default => sub {0} );
26              
27             my $BUILT_IN_PROPS = SWISH_DOC_PROP_MAP();
28              
29             =head1 NAME
30              
31             Dezi::Lucy::Indexer - Dezi::App Apache Lucy indexer
32              
33             =head1 SYNOPSIS
34              
35             use Dezi::Lucy::Indexer;
36             my $indexer = Dezi::Lucy::Indexer->new(
37             config => Dezi::Indexer::Config->new(),
38             invindex => Dezi::Lucy::InvIndex->new(),
39             highlightable_fields => 0,
40             );
41              
42             =head1 DESCRIPTION
43              
44             Dezi::Lucy::Indexer is an Apache Lucy based indexer
45             class based on L<SWISH::3>.
46              
47             =head1 CONSTANTS
48              
49             All the L<SWISH::3> constants are imported into this namespace,
50             including:
51              
52             =over
53              
54             =item SWISH_DOC_PROP_MAP
55              
56             =item SWISH_INDEX_STEMMER_LANG
57              
58             =item SWISH_INDEX_NAME
59              
60             =item SWISH_INDEX_FORMAT
61              
62             =back
63              
64             =head1 METHODS
65              
66             Only new and overridden methods are documented here. See
67             the L<Dezi::Indexer> documentation.
68              
69             =head2 BUILD
70              
71             Implements basic object set up. Called internally by new().
72              
73             In addition to the attributes documented in Dezi::Indexer,
74             this class implements the following attributes:
75              
76             =over
77              
78             =item highlightable_fields
79              
80             Value should be 0 or 1. Default is 0. Passed directly to the
81             constructor for Lucy::Plan::FullTextType objects as the value
82             for the C<highlightable> option.
83              
84             =back
85              
86             =cut
87              
88             sub BUILD {
89             my $self = shift;
90              
91             # coerce our invindex into our format subclass
92             unless ( $self->invindex->isa('Dezi::Lucy::InvIndex') ) {
93             $self->invindex(
94             Dezi::Lucy::InvIndex->new( path => $self->invindex->path ) );
95             }
96              
97             $self->_build_lucy_delegates();
98             }
99              
100             sub _build_lucy_delegates {
101             my $self = shift;
102             my $s3config = $self->swish3->config;
103             my $lang = $s3config->get_index->get( SWISH_INDEX_STEMMER_LANG() )
104             || 'none';
105             $self->{_lang} = $lang; # cache for finish()
106             my $schema = Lucy::Plan::Schema->new();
107             my $analyzers = {};
108             my $case_folder = Lucy::Analysis::CaseFolder->new;
109             my $tokenizer = Lucy::Analysis::RegexTokenizer->new;
110             my $multival_tokenizer
111             = Lucy::Analysis::RegexTokenizer->new(
112             pattern => '[^' . SWISH_TOKENPOS_BUMPER() . ']+' );
113              
114             # mimic StringType fields that require case and/or multival parsing.
115             $analyzers->{store_lc} = Lucy::Analysis::PolyAnalyzer->new(
116             analyzers => [ $multival_tokenizer, $case_folder ] );
117             $analyzers->{store} = $multival_tokenizer;
118              
119             # stemming means we fold case and tokenize too.
120             if ( $lang and $lang =~ m/^\w\w$/ ) {
121             my $stemmer
122             = Lucy::Analysis::SnowballStemmer->new( language => $lang );
123             $analyzers->{fulltext_lc}
124             = Lucy::Analysis::PolyAnalyzer->new( analyzers =>
125             [ $multival_tokenizer, $case_folder, $tokenizer, $stemmer ] );
126             $analyzers->{fulltext} = Lucy::Analysis::PolyAnalyzer->new(
127             analyzers => [ $multival_tokenizer, $tokenizer, $stemmer ] );
128             }
129             else {
130             $analyzers->{fulltext_lc}
131             = Lucy::Analysis::PolyAnalyzer->new(
132             analyzers => [ $multival_tokenizer, $case_folder, $tokenizer, ],
133             );
134             $analyzers->{fulltext} = Lucy::Analysis::PolyAnalyzer->new(
135             analyzers => [ $multival_tokenizer, $tokenizer ] );
136             }
137              
138             # cache our objects for later
139             $self->{__lucy}->{analyzers} = $analyzers;
140             $self->{__lucy}->{schema} = $schema;
141              
142             # build the Lucy fields, which are a merger of MetaNames+PropertyNames
143             my %fields;
144              
145             my $metanames = $s3config->get_metanames;
146             my $meta_keys = $metanames->keys;
147             my $properties = $s3config->get_properties;
148             my $property_keys = $properties->keys;
149              
150             # merge first by name so we pair correctly in _create_field_def()
151             my %tmpfields;
152             for my $name (@$meta_keys) {
153             my $mn = $metanames->get($name);
154             $tmpfields{$name}->{meta} = $mn;
155             }
156             for my $name (@$property_keys) {
157             if ( exists $BUILT_IN_PROPS->{$name} ) {
158             confess
159             "$name is a built-in PropertyName and should not be defined in config";
160             }
161             my $pr = $properties->get($name);
162             $tmpfields{$name}->{prop} = $pr;
163             }
164              
165             # build out field definitions
166             for my $n ( keys %tmpfields ) {
167             my %fdef = $self->_create_field_def( $tmpfields{$n}->{meta},
168             $tmpfields{$n}->{prop} );
169             $fields{ $fdef{name} } = $fdef{def};
170             }
171              
172             $self->{_fields} = \%fields;
173              
174             for my $name ( keys %fields ) {
175             my $def = $fields{$name};
176             my $key = $name;
177              
178             # if a field is purely an alias, skip it.
179             if ( defined $def->{is_meta_alias}
180             and defined $def->{is_prop_alias} )
181             {
182             $def->{store_as}->{ $def->{is_meta_alias} } = 1;
183             $def->{store_as}->{ $def->{is_prop_alias} } = 1;
184             next;
185             }
186              
187             my $type = $self->_get_lucy_field_type($def) or next;
188              
189             $schema->spec_field( name => $name, type => $type );
190              
191             $def->{store_as}->{$name} = 1;
192             }
193              
194             # build in the built-ins
195             $self->debug and warn dump \%fields;
196              
197             for my $name ( keys %$BUILT_IN_PROPS ) {
198             if ( exists $fields{$name} ) {
199             my $def = $fields{$name};
200              
201             #carp "found $name in built-in props: " . dump($field);
202              
203             # in theory this should never happen.
204             if ( !$def->{is_prop} ) {
205             confess
206             "$name is a built-in PropertyName but not defined as a PropertyName in config";
207             }
208             }
209              
210             # default property
211             else {
212             $schema->spec_field(
213             name => $name,
214             type => Lucy::Plan::StringType->new( sortable => 1, )
215             );
216             }
217             }
218              
219             #dump( \%fields );
220              
221             # TODO can pass lucy in? make 'lucy' attribute public?
222             my $hostname = hostname() or confess "Can't get unique hostname";
223             my $manager = Lucy::Index::IndexManager->new( host => $hostname );
224             $self->{lucy} ||= Lucy::Index::Indexer->new(
225             schema => $schema,
226             index => $self->invindex->path . "",
227             create => 1,
228             manager => $manager,
229             );
230              
231             }
232              
233             sub _get_lucy_field_type {
234             my ( $self, $def ) = @_;
235             my ( $type, $key );
236             my $analyzers = $self->{__lucy}->{analyzers};
237              
238             # MetaName==yes, PropertyName==no
239             if ( $def->{is_meta} and !$def->{is_prop} ) {
240             if ( defined $def->{is_meta_alias} ) {
241             $key = $def->{is_meta_alias};
242             $def->{store_as}->{$key} = 1;
243             return;
244             }
245              
246             #warn "spec meta $name";
247             $type = Lucy::Plan::FullTextType->new(
248             analyzer => $analyzers->{fulltext_lc},
249             stored => 0,
250             boost => $def->{bias} || 1.0,
251             highlightable => $self->highlightable_fields,
252             );
253             }
254              
255             # MetaName==yes, PropertyName==yes
256             # this is the trickiest case, because the field
257             # is both prop+meta and could be an alias for one
258             # and a real for the other.
259             # **NOTE** we must have already eliminated the case where
260             # the field is an alias for both.
261             elsif ( $def->{is_meta} and $def->{is_prop} ) {
262             if ( defined $def->{is_meta_alias} ) {
263             $key = $def->{is_meta_alias};
264             $def->{store_as}->{$key} = 1;
265             }
266             elsif ( defined $def->{is_prop_alias} ) {
267             $key = $def->{is_prop_alias};
268             $def->{store_as}->{$key} = 1;
269             }
270              
271             my $analyzer = $analyzers->{fulltext_lc};
272             if ( !$def->{ignore_case} ) {
273             $analyzer = $analyzers->{fulltext};
274             }
275              
276             #warn "spec meta+prop $name";
277             $type = Lucy::Plan::FullTextType->new(
278             analyzer => $analyzer,
279             highlightable => $self->highlightable_fields,
280             sortable => $def->{sortable},
281             boost => $def->{bias} || 1.0,
282             );
283             }
284              
285             # MetaName==no, PropertyName==yes
286             elsif (!$def->{is_meta}
287             and $def->{is_prop} )
288             {
289              
290             if ( defined $def->{is_prop_alias} ) {
291             $key = $def->{is_prop_alias};
292             $def->{store_as}->{$key} = 1;
293             return;
294             }
295              
296             #warn "spec prop !sort $name";
297             my $analyzer_key = 'store';
298             if ( $def->{ignore_case} ) {
299             $analyzer_key = 'store_lc';
300             }
301              
302             $type = Lucy::Plan::FullTextType->new(
303             analyzer => $analyzers->{$analyzer_key},
304             highlightable => $self->highlightable_fields,
305             sortable => $def->{sortable},
306             boost => $def->{bias} || 1.0,
307             );
308             }
309              
310             $self->debug
311             and warn
312             sprintf( "field def %s => field type %s", dump($def), $type );
313              
314             return $type;
315              
316             }
317              
318             sub _create_field_def {
319             my ( $self, $metaname, $propname ) = @_;
320             if ( !$metaname and !$propname ) {
321             confess "Must have one of metaname or propname objects";
322             }
323             my $name = $metaname ? $metaname->name : $propname->name;
324             my %field_def = ();
325             if ($metaname) {
326             if ( $metaname->name ne $name ) {
327             confess "Mismatched metaname for '$name': " . $metaname->name;
328             }
329             my $alias = $metaname->alias_for;
330             $field_def{is_meta} = 1;
331             $field_def{is_meta_alias} = $alias;
332             $field_def{bias} = $metaname->bias;
333             $field_def{store_as}->{$name} = 1;
334              
335             # allow for aliases to built-ins
336             if ( exists $BUILT_IN_PROPS->{$name} ) {
337             $field_def{is_prop} = 1;
338             $field_def{sortable} = 1;
339             }
340             }
341             if ($propname) {
342             if ( $propname->name ne $name ) {
343             confess "Mismatched propname for '$name'" . $propname->name;
344             }
345             my $prop_alias = $propname->alias_for;
346             $field_def{is_prop} = 1;
347             $field_def{is_prop_alias} = $prop_alias;
348             if ( $propname->sort ) {
349             $field_def{sortable} = 1;
350             }
351             for my $attr (qw( ignore_case verbatim max )) {
352             $field_def{$attr} = $propname->$attr;
353             }
354             }
355             return ( name => $name, def => \%field_def );
356             }
357              
358             sub _add_new_field {
359             my ( $self, $metaname, $propname ) = @_;
360             my $fields = $self->{_fields};
361             my %field_def = $self->_create_field_def( $metaname, $propname );
362             my $name = $field_def{name};
363             my $def = $field_def{def};
364             $fields->{$name} ||= $def;
365             $self->{__lucy}->{schema}->spec_field(
366             name => $name,
367             type => $self->_get_lucy_field_type($def),
368             );
369             return $def;
370             }
371              
372             =head2 swish3_handler( I<swish3_data> )
373              
374             Called by the SWISH::3::handler() function for every document being
375             indexed.
376              
377             =cut
378              
379             sub swish3_handler {
380             my ( $self, $data ) = @_;
381             my $config = $data->config;
382             my $conf_props = $config->get_properties;
383             my $conf_metas = $config->get_metanames;
384              
385             # will hold all the parsed text, keyed by field name
386             my %doc;
387             my $docinfo = $data->doc;
388              
389             # Swish built-in fields first
390             for my $propname ( keys %$BUILT_IN_PROPS ) {
391             my $attr = $BUILT_IN_PROPS->{$propname};
392             $doc{$propname} = [ $docinfo->$attr ];
393             }
394              
395             # fields parsed from document
396             my $props = $data->properties;
397             my $metas = $data->metanames;
398              
399             # field def cache
400             my $fields = $self->{_fields};
401              
402             # may need to add newly-discovered fields from $metas
403             # that were added via UndefinedMetaTags e.g.
404             for my $mname ( keys %$metas ) {
405             if ( !exists $fields->{$mname} ) {
406              
407             #warn "New field: $mname\n";
408             my $prop;
409             if ( exists $props->{$mname} ) {
410             $prop = $conf_props->get($mname);
411             }
412             $self->_add_new_field( $conf_metas->get($mname), $prop );
413             }
414             }
415              
416             #dump $fields;
417             #dump $props;
418             #dump $metas;
419             for my $fname ( sort keys %$fields ) {
420             my $field = $self->{_fields}->{$fname};
421             next if $field->{is_prop_alias};
422             next if $field->{is_meta_alias};
423              
424             my @keys = keys %{ $field->{store_as} };
425              
426             for my $key (@keys) {
427              
428             # prefer properties over metanames because
429             # properties have verbatim flag, which affects
430             # the stored whitespace.
431              
432             if ( $field->{is_prop} and !exists $BUILT_IN_PROPS->{$fname} ) {
433             push( @{ $doc{$key} }, @{ $props->{$fname} } );
434             }
435             elsif ( $field->{is_meta} ) {
436             push( @{ $doc{$key} }, @{ $metas->{$fname} } );
437             }
438             else {
439             croak "field '$fname' is neither a PropertyName nor MetaName";
440             }
441             }
442             }
443              
444             # serialize the doc with our tokenpos_bump char
445             for my $k ( keys %doc ) {
446             $doc{$k} = to_utf8( join( SWISH_TOKENPOS_BUMPER(), @{ $doc{$k} } ) );
447             }
448              
449             $self->debug and carp dump \%doc;
450              
451             # make sure we delete any existing doc with same URI
452             $self->{lucy}->delete_by_term(
453             field => 'swishdocpath',
454             term => $doc{swishdocpath}
455             );
456              
457             $self->{lucy}->add_doc( \%doc );
458             }
459              
460             =head2 finish
461              
462             Calls commit() on the internal Lucy::Indexer object,
463             writes the C<swish.xml> header file and calls the superclass finish()
464             method.
465              
466             =cut
467              
468             my @chars = ( 'a' .. 'z', 'A' .. 'Z', 0 .. 9 );
469              
470             around finish => sub {
471             my $super_method = shift;
472             my $self = shift;
473              
474             return 0 if $self->{_is_finished};
475              
476             my $doc_count = $self->_finish_lucy();
477             $super_method->( $self, @_ );
478             $self->{_is_finished} = 1;
479              
480             return $doc_count;
481             };
482              
483             sub _finish_lucy {
484             my $self = shift;
485              
486             # get a lock on our header file till
487             # this entire transaction is complete.
488             # Note that we trust the Lucy locking feature
489             # to have prevented any other process
490             # from getting a lock on the invindex itself,
491             # but we want to make sure nothing interrupts
492             # us from writing our own header after calling ->commit().
493             my $invindex = $self->invindex;
494             my $header = $invindex->header_file->stringify;
495             my $lock_file = Path::Class::File::Lockable->new($header);
496             if ( $lock_file->locked ) {
497             croak "Lock file found on $header -- cannot commit indexing changes";
498             }
499             $lock_file->lock;
500              
501             # commit our changes
502             $self->{lucy}->commit();
503              
504             # get total doc count
505             my $polyreader = Lucy::Index::PolyReader->open( index => "$invindex", );
506             my $doc_count = $polyreader->doc_count();
507              
508             # write header
509             # the current config should contain any existing header + runtime config
510             my $idx_cfg = $self->swish3->config->get_index;
511              
512             # poor man's uuid
513             my $uuid = Digest::MD5::md5_hex(
514             time() . join( "", @chars[ map { rand @chars } ( 1 .. 24 ) ] ) );
515              
516             $idx_cfg->set( SWISH_INDEX_NAME(), "$invindex" );
517             $idx_cfg->set( SWISH_INDEX_FORMAT(), 'Lucy' );
518             $idx_cfg->set( SWISH_INDEX_STEMMER_LANG(), $self->{_lang} );
519             $idx_cfg->set( 'DeziVersion', $invindex->version );
520             $idx_cfg->set( "DocCount", $doc_count );
521             $idx_cfg->set( "UUID", $uuid );
522              
523             $self->swish3->config->write($header);
524              
525             # transaction complete
526             $lock_file->unlock;
527              
528             $self->debug and carp "wrote $header with uuid $uuid";
529             $self->debug and carp "$doc_count docs indexed";
530             $self->swish3(undef); # invalidate this indexer
531              
532             return $doc_count;
533             }
534              
535             =head2 get_lucy
536              
537             Returns the internal Lucy::Index::Indexer object.
538              
539             =cut
540              
541             sub get_lucy {
542             return shift->{lucy};
543             }
544              
545             =head2 abort
546              
547             Sets the internal Lucy::Index::Indexer to undef,
548             which should release any locks on the index.
549             Also flags the Dezi::Lucy::Indexer object
550             as stale.
551              
552             =cut
553              
554             sub abort {
555             my $self = shift;
556             $self->{lucy} = undef;
557             $self->{_is_finished} = 1;
558             $self->swish3(undef);
559             }
560              
561             __PACKAGE__->meta->make_immutable;
562              
563             1;
564              
565             __END__
566              
567             =head2 MetaNames and PropertyNames
568              
569             Some implementation notes about MetaNames and PropertyNames.
570             See also L<http://dezi.org/2014/07/18/metanames-and-propertynames/>.
571              
572             =over
573              
574             =item
575              
576             A field defined as either a MetaName, PropertyName or both, can be searched.
577              
578             =item
579              
580             Fields are matched against tag names in your XML/HTML documents. See also the TagAlias, UndefinedMetaTags, UndefinedXMLAttributes, and XMLClassAttributes directives.
581              
582             =item
583              
584             You can alias field names with MetaNamesAlias and PropertyNamesAlias.
585              
586             =item
587              
588             MetaNames are tokenized and case-insensitive and (optionally, with FuzzyIndexingMode) stemmed.
589              
590             =item
591              
592             PropertyNames are stored, case-sensitive strings.
593              
594             =item
595              
596             If a field is defined as both a MetaName and PropertyName, then it will be tokenized.
597              
598             =item
599              
600             If a field is defined only as a MetaName, it will be parsed but not stored. That means you can search on the field, but when you try to retrieve the field's value from the results, it will cause a fatal error.
601              
602             =item
603              
604             If a field is defined only as a PropertyName, it will be parsed and stored, but it will not be tokenized. That means the field's contents are stored without being split up into words.
605              
606             =item
607              
608             You can control the parsing and storage of PropertyName-only fields with the following additional directives:
609              
610             =over
611              
612             =item PropertyNamesCompareCase
613              
614             case sensitive search
615              
616             =item PropertyNamesIgnoreCase
617              
618             case insensitive search (default)
619              
620             =item PropertyNamesNoStripChars
621              
622             preserve whitespace
623              
624             =back
625              
626             =item
627              
628             There are two default MetaNames defined: swishdefault and swishtitle.
629              
630             =item
631              
632             There are two default PropertyNames defined: swishtitle and swishdescription.
633              
634             =item
635              
636             The libswish3 XML and HTML parsers will automatically treat a <title> tag as swishtitle. Likewise, they will treat a <body> tag as swishdescription.
637              
638             =item
639              
640             Things get complicated quickly when defining fields. Experiment with small test cases to arrive at the configuration that works best with your application.
641              
642             =back
643              
644             =head1 AUTHOR
645              
646             Peter Karman, E<lt>karpet@dezi.orgE<gt>
647              
648             =head1 BUGS
649              
650             Please report any bugs or feature requests to C<bug-dezi-app at rt.cpan.org>, or through
651             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>.
652             I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
653              
654             =head1 SUPPORT
655              
656             You can find documentation for this module with the perldoc command.
657              
658             perldoc Dezi::App
659              
660             You can also look for information at:
661              
662             =over 4
663              
664             =item * Website
665              
666             L<http://dezi.org/>
667              
668             =item * IRC
669              
670             #dezisearch at freenode
671              
672             =item * Mailing list
673              
674             L<https://groups.google.com/forum/#!forum/dezi-search>
675              
676             =item * RT: CPAN's request tracker
677              
678             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App>
679              
680             =item * AnnoCPAN: Annotated CPAN documentation
681              
682             L<http://annocpan.org/dist/Dezi-App>
683              
684             =item * CPAN Ratings
685              
686             L<http://cpanratings.perl.org/d/Dezi-App>
687              
688             =item * Search CPAN
689              
690             L<https://metacpan.org/dist/Dezi-App/>
691              
692             =back
693              
694             =head1 COPYRIGHT AND LICENSE
695              
696             Copyright 2014 by Peter Karman
697              
698             This library is free software; you can redistribute it and/or modify
699             it under the terms of the GPL v2 or later.
700              
701             =head1 SEE ALSO
702              
703             L<http://dezi.org/>, L<http://swish-e.org/>, L<http://lucy.apache.org/>
704