File Coverage

blib/lib/Wiki/Toolkit/Search/Base.pm
Criterion Covered Total %
statement 9 55 16.3
branch 0 12 0.0
condition 0 3 0.0
subroutine 3 20 15.0
pod 10 10 100.0
total 22 100 22.0


line stmt bran cond sub pod time code
1             package Wiki::Toolkit::Search::Base;
2              
3 1     1   635 use strict;
  1         2  
  1         24  
4 1     1   3 use Carp "croak";
  1         1  
  1         43  
5              
6 1     1   3 use vars qw( @ISA $VERSION );
  1         15  
  1         565  
7              
8             sub _abstract {
9 0     0     my $who = (caller(1))[3];
10 0           croak "$who is an abstract method which the ".(ref shift).
11             " class has not provided";
12             }
13              
14             $VERSION = 0.01;
15              
16             =head1 NAME
17              
18             Wiki::Toolkit::Search::Base - Base class for Wiki::Toolkit search plugins.
19              
20             =head1 SYNOPSIS
21              
22             my $search = Wiki::Toolkit::Search::XXX->new( @args );
23             my %wombat_nodes = $search->search_nodes("wombat");
24              
25             This class details the methods that need to be overridden by search plugins.
26              
27             =cut
28              
29             =head1 METHODS
30              
31             =head2 C<new>
32              
33             my $search = Wiki::Toolkit::Search::XXX->new( @args );
34              
35             Creates a new searcher. By default the arguments are just passed to
36             C<_init>, so you may wish to override that instead.
37              
38             =cut
39              
40             sub new {
41 0     0 1   my ($class, @args) = @_;
42 0           my $self = {};
43 0           bless $self, $class;
44 0           return $self->_init(@args);
45             }
46              
47             sub _init {
48 0     0     my ($self, %args) = @_;
49 0           @{$self}{keys %args} = values %args;
  0            
50 0           return $self;
51             }
52              
53             =head2 C<search_nodes>
54              
55             # Find all the nodes which contain the word 'expert'.
56             my %results = $search->search_nodes('expert');
57              
58             Returns a (possibly empty) hash whose keys are the node names and
59             whose values are the scores in some kind of relevance-scoring system I
60             haven't entirely come up with yet. For OR searches, this could
61             initially be the number of terms that appear in the node, perhaps.
62              
63             Defaults to AND searches (if $and_or is not supplied, or is anything
64             other than C<OR> or C<or>).
65              
66             Searches are case-insensitive.
67              
68             =cut
69              
70             sub search_nodes {
71 0     0 1   my ($self, $termstr, $and_or) = @_;
72              
73 0           $and_or = lc($and_or);
74 0 0 0       unless ( defined $and_or and $and_or eq "or" ) {
75 0           $and_or = "and";
76             }
77              
78             # Extract individual search terms.
79 0           my @terms = $self->analyze($termstr);
80              
81 0           return $self->_do_search($and_or, \@terms);
82             }
83              
84 0     0     sub _do_search { shift->_abstract };
85              
86             =head2 C<analyze>
87              
88             @terms = $self->analyze($string)
89              
90             Splits a string into a set of terms for indexing and searching. Typically
91             this is done case-insensitively, splitting at word boundaries, and extracting
92             terms that are longer than one character and contain at least one word character.
93              
94             =cut
95              
96             sub analyze {
97 0     0 1   my ($self, $string) = @_;
98 0 0         return grep { length > 1 # ignore single characters
  0            
99             and ! /^\W*$/ } # and things composed entirely
100             # of non-word characters
101             split( /\b/, # split at word boundaries
102             lc($string) # be case-insensitive
103             );
104             }
105              
106             =head2 C<fuzzy_title_match>
107              
108             $wiki->write_node( "King's Cross St Pancras", "A station." );
109             my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" );
110              
111             Returns a (possibly empty) hash whose keys are the node names and
112             whose values are the scores in some kind of relevance-scoring system I
113             haven't entirely come up with yet.
114              
115             Note that even if an exact match is found, any other similar enough
116             matches will also be returned. However, any exact match is guaranteed
117             to have the highest relevance score.
118              
119             The matching is done against "canonicalised" forms of the search
120             string and the node titles in the database: stripping vowels, repeated
121             letters and non-word characters, and lowercasing.
122              
123             =cut
124              
125             sub fuzzy_title_match {
126 0     0 1   my ($self, $string) = @_;
127 0           my $canonical = $self->canonicalise_title( $string );
128 0           $self->_fuzzy_match($string, $canonical);
129             }
130              
131 0     0     sub _fuzzy_match { shift->_abstract };
132              
133             =head2 C<index_node>
134              
135             $search->index_node( $node, $content, $metadata );
136              
137             Indexes or reindexes the given node in the search engine indexes.
138             You must supply both the node name and its content, but metadata is
139             optional.
140              
141             If you do supply metadata, it will be used if and only if your chosen
142             search backend supports metadata indexing (see
143             C<supports_metadata_indexing>). It should be a reference to a hash
144             where the keys are the names of the metadata fields and the values are
145             either scalars or references to arrays of scalars. For example:
146              
147             $search->index_node( "Calthorpe Arms", "Nice pub in Bloomsbury.",
148             { category => [ "Pubs", "Bloomsbury" ],
149             postcode => "WC1X 8JR" } );
150              
151             =cut
152              
153             sub index_node {
154 0     0 1   my ($self, $node, $content) = @_;
155 0 0         croak "Must supply a node name" unless $node;
156 0 0         croak "Must supply node content" unless defined $content;
157              
158             # Index the individual words in the node content and title.
159 0           my @keys = $self->analyze("$content $node");
160 0           $self->_index_node($node, $content, \@keys);
161 0           $self->_index_fuzzy($node, $self->canonicalise_title( $node ));
162             }
163              
164 0     0     sub _index_node { shift->_abstract };
165 0     0     sub _index_fuzzy { shift->_abstract };
166              
167             =head2 B<canonicalise_title>
168              
169             $fuzzy = $self->canonicalise_title( $node );
170              
171             Returns the node title as suitable for fuzzy searching: lowercased, with
172             punctuation and spaces removed, vowels (and 'y') removed, and repeated letters squashed.
173              
174             =cut
175              
176             sub canonicalise_title {
177 0     0 1   my ($self, $title) = @_;
178 0 0         return "" unless $title;
179 0           my $canonical = lc($title);
180 0           $canonical =~ s/\W//g; # remove non-word characters
181 0           $canonical =~ s/[aeiouy]//g; # remove vowels and 'y'
182 0           $canonical =~ tr/a-z//s; # collapse doubled (or tripled, etc) letters
183 0           return $canonical;
184             }
185              
186             =head2 C<delete_node>
187              
188             $search->delete_node($node);
189              
190             Removes the given node from the search indexes. NOTE: It's up to you to
191             make sure the node is removed from the backend store. Croaks on error.
192              
193             =cut
194              
195             sub delete_node {
196 0     0 1   my ($self, $node) = @_;
197 0 0         croak "Must supply a node name" unless $node;
198 0           $self->_delete_node($node);
199             }
200              
201 0     0     sub _delete_node { shift->_abstract };
202              
203             =head2 C<supports_phrase_searches>
204              
205             if ( $search->supports_phrase_searches ) {
206             return $search->search_nodes( '"fox in socks"' );
207             }
208              
209             Returns true if this search backend supports phrase searching, and
210             false otherwise.
211              
212             =cut
213              
214 0     0 1   sub supports_phrase_searches { shift->_abstract };
215              
216             =head2 C<supports_fuzzy_searches>
217              
218             if ( $search->supports_fuzzy_searches ) {
219             return $search->fuzzy_title_match("Kings Cross St Pancreas");
220             }
221              
222             Returns true if this search backend supports fuzzy title matching, and
223             false otherwise.
224              
225             =cut
226              
227 0     0 1   sub supports_fuzzy_searches { shift->_abstract };
228              
229             =head2 C<supports_metadata_indexing>
230              
231             if ( $search->supports_metadata_indexing ) {
232             print "This search backend indexes metadata as well as content.";
233             }
234              
235             Returns true if this search backend supports metadata indexing, and
236             false otherwise.
237              
238             =cut
239              
240 0     0 1   sub supports_metadata_indexing { 0; };
241              
242             =head1 SEE ALSO
243              
244             L<Wiki::Toolkit>
245              
246             =cut
247              
248             1;