File Coverage

blib/lib/SVN/Log/Index.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package SVN::Log::Index;
2              
3 5     5   248801 use strict;
  5         13  
  5         150  
4 5     5   27 use warnings;
  5         7  
  5         142  
5              
6 5     5   26 use File::Path;
  5         12  
  5         271  
7              
8 5     5   4052 use KinoSearch::InvIndexer;
  0            
  0            
9             use KinoSearch::Searcher;
10              
11             use Params::Validate qw(:all);
12             use Exception::Class(
13             'SVN::Log::Index::X::Args'
14             => { alias => 'throw_args', },
15             'SVN::Log::Index::X::Fault'
16             => { alias => 'throw_fault', },
17             );
18              
19             Params::Validate::validation_options(  # Route Params::Validate failures through this module's own exception class.
20             on_fail => sub { throw_args error => shift },  # Throws SVN::Log::Index::X::Args with the validator's message as 'error'.
21             );
22              
23             use SVN::Log;
24             use YAML ();
25              
26             our $VERSION = '0.51';
27              
28             =head1 NAME
29              
30             SVN::Log::Index - Index and search over Subversion commit logs.
31              
32             =head1 SYNOPSIS
33              
34             my $index = SVN::Log::Index->new({ index_path => '/path/to/index' });
35              
36             if($creating) { # Create from scratch if necessary
37             $index->create({ repo_url => 'url://for/repo' });
38             }
39              
40             $index->open(); # And then open it
41              
42             # Now add revisions from the repo to the index
43             $index->add({ start_rev => $start_rev,
44             end_rev => $end_rev });
45              
46             # And query the index
47             my $results = $index->search('query');
48              
49             =head1 DESCRIPTION
50              
51             SVN::Log::Index builds a KinoSearch index of commit logs from a
52             Subversion repository and allows you to do arbitrary full text searches
53             over it.
54              
55             =head1 METHODS
56              
57             =head2 new
58              
59             my $index = SVN::Log::Index->new({
60             index_path => '/path/to/index'
61             });
62              
63             Create a new index object.
64              
65             The single argument is a hash ref. Currently only one key is valid.
66              
67             =over 4
68              
69             =item index_path
70              
71             The path that contains (or will contain) the index files.
72              
73             =back
74              
75             This method prepares the object for use, but does not make any changes
76             on disk.
77              
78             =cut
79              
80             sub new {  # Constructor: validates the arguments and returns a blessed hash holding index_path.
81             my $proto = shift;  # A class name, or an existing object when called as $obj->new.
82             my $args = validate(@_, {
83             index_path => 1,  # Mandatory; no type constraint is applied.
84             });
85              
86             my $class = ref($proto) || $proto;  # Works for both instance and class invocants.
87             my $self = {};
88              
89             $self->{index_path} = $args->{index_path};
90              
91             bless $self, $class;  # Last statement, so the blessed object is the return value.
92             }
93              
94             =head2 create
95              
96             $index->create({
97             repo_url => 'url://for/repo',
98             analyzer_class => 'KinoSearch::Analysis::PolyAnalyzer',
99             analyzer_opts => [ language => 'en' ],
100             overwrite => 0, # Optional
101             });
102              
103             This method creates a new index, in the C<index_path> given when the
104             object was created.
105              
106             The single argument is a hash ref, with the following possible keys.
107              
108             =over 4
109              
110             =item repo_url
111              
112             The URL for the Subversion repository that is going to be indexed.
113              
114             =item analyzer_class
115              
116             A string giving the name of the class that will analyse log message
117             text and tokenise it. This should derive from the
118             L<KinoSearch::Analysis::Analyzer> class. SVN::Log::Index will call this
119             class' C<new()> method.
120              
121             Once an analyzer class has been chosen for an index it can not be
122             changed without deleting the index and creating it afresh.
123              
124             The default value is C<KinoSearch::Analysis::PolyAnalyzer>.
125              
126             =item analyzer_opts
127              
128             A list of options to be passed, as is, to the constructor for the
129             C<analyzer_class> object.
130              
131             =item overwrite
132              
133             A boolean indicating whether or not a pre-existing index_path should
134             be overwritten.
135              
136             Given this sequence;
137              
138             my $index = SVN::Log::Index->new({index_path => '/path'});
139             $index->create({repo_url => 'url://for/repo'});
140              
141             The call to C<create()> will fail if C</path> already exists.
142              
143             If C<overwrite> is set to a true value then C</path> will be cleared.
144              
145             The default is false.
146              
147             =back
148              
149             After creation the index directory will exist on disk, and a
150             configuration file containing the create()-time parameters will be
151             created in the index directory.
152              
153             Newly created indexes must still be opened.
154              
155             =cut
156              
157             sub create {  # Create a new index at index_path and persist its settings to config.yaml.
158             my $self = shift;
159             my $args = validate(@_, {
160             repo_url => {
161             type => SCALAR,
162             regex => qr{^[a-z/]},  # Must begin with a lowercase letter (e.g. a URL scheme) or '/' (a filesystem path).
163             },
164             analyzer_class => {
165             type => SCALAR,
166             default => 'KinoSearch::Analysis::PolyAnalyzer',
167             },
168             analyzer_opts => {
169             type => ARRAYREF,
170             default => [ language => 'en' ],  # Flattened into the analyzer constructor's argument list by _create_analyzer().
171             },
172             overwrite => {
173             type => BOOLEAN,
174             default => 0,
175             },
176             });
177              
178             throw_fault("Can't call create() after open()")  # open() populates {config}; its presence means the index is already open.
179             if exists $self->{config};
180              
181             if(-d $self->{index_path} and ! $args->{overwrite}) {  # Refuse to touch an existing directory unless explicitly asked to.
182             throw_fault("create() $self->{index_path} exists and 'overwrite' is false");
183             }
184              
185             if($args->{repo_url} !~ m/^(http|https|svn|file|svn\+ssh):\/\//) {  # No recognised scheme: treat the value as a local path.
186             $args->{repo_url} = 'file://' . $args->{repo_url};
187             }
188              
189             $self->{config} = $args;  # The validated arguments (defaults included) become the saved configuration.
190             $self->{config}{last_indexed_rev} = 0;  # Nothing has been indexed yet.
191              
192             $self->_create_analyzer();
193             $self->_create_writer($args->{overwrite});  # The overwrite flag is passed on as the writer's 'create' option.
194              
195             $self->_save_config();
196              
197             delete $self->{config}; # Gets reloaded in open()
198             }
199              
200             sub _save_config {  # Write $self->{config} to <index_path>/config.yaml.
201             my $self = shift;
202              
203             YAML::DumpFile($self->{index_path} . '/config.yaml', $self->{config})
204             or throw_fault("Saving config failed: $!");  # Throws a Fault when DumpFile returns a false value.
205             }
206              
207             sub _load_config {  # Read <index_path>/config.yaml into $self->{config}.
208             my $self = shift;
209              
210             $self->{config} = YAML::LoadFile($self->{index_path} . '/config.yaml')
211             or throw_fault("Could not load state from $self->{index_path}/config.yaml: $!");  # Throws a Fault when the loaded value is false.
212             }
213              
214             sub _create_writer {  # Lazily build the KinoSearch::InvIndexer kept in {writer}; $create is passed as its 'create' option.
215             my $self = shift;
216             my $create = shift;
217              
218             return if exists $self->{writer} and defined $self->{writer};  # Reuse a live writer; the guard tests the writer itself, not the analyzer.
219              
220             throw_fault("_create_analyzer() must be called first")
221             if ! exists $self->{analyzer};
222             throw_fault("analyzer is empty") if ! defined $self->{analyzer};
223              
224             $self->{writer} = KinoSearch::InvIndexer->new(
225             invindex => $self->{index_path},
226             create => $create,
227             analyzer => $self->{analyzer},
228             ) or throw_fault("error creating writer: $!");
229              
230             foreach my $field (qw(paths revision author date message)) {  # Declare every field that _handle_log() populates.
231             $self->{writer}->spec_field(name => $field);
232             }
233              
234             return;
235             }
236              
237             sub _delete_writer {  # Finish the current writer and remove it from the object.
238             my $self = shift;
239             my $optimize = shift;  # Forwarded unchanged as finish()'s 'optimize' option.
240              
241             $self->{writer}->finish(optimize => $optimize);  # NOTE(review): no guard here; assumes {writer} is set by a prior _create_writer().
242             delete $self->{writer};  # Removing the key lets the next _create_writer() build a fresh writer.
243             return;
244             }
245              
246             sub _create_analyzer {  # Lazily load the configured analyzer class and keep an instance in {analyzer}.
247             my $self = shift;
248              
249             return if exists $self->{analyzer} and defined $self->{analyzer};  # Already built.
250              
251             eval "require $self->{config}{analyzer_class}"  # NOTE(review): string eval of a name taken from the config; safe only while that value is trusted.
252             or throw_fault "require($self->{config}{analyzer_class}) failed: $@";  # A failed eval reports its error in $@, not $!.
253              
254             $self->{analyzer} = $self->{config}{analyzer_class}->new(
255             @{ $self->{config}{analyzer_opts} }
256             ) or throw_fault("error creating $self->{config}{analyzer_class} object: $!");
257             }
258              
259             =head2 open
260              
261             $index->open();
262              
263             Opens the index, in preparation for adding or removing entries.
264              
265             =cut
266              
267             sub open {  # Load config.yaml from index_path and build the analyzer.
268             my $self = shift;
269             my $args = shift;  # Accepted but not used in this method.
270              
271             throw_fault("$self->{index_path} does not exist")  # The index directory must already be on disk.
272             if ! -d $self->{index_path};
273             throw_fault("$self->{index_path}/config.yaml does not exist")  # So must the configuration written by create().
274             if ! -f "$self->{index_path}/config.yaml";
275              
276             $self->_load_config();  # Populates {config}, which other methods test to see whether open() has run.
277             $self->_create_analyzer();
278             }
279              
280             =head2 add
281              
282             $index->add ({
283             start_rev => $start_rev, # number, or 'HEAD'
284             end_rev => $end_rev, # number, or 'HEAD'
285             });
286              
287             Add one or more log messages to the index.
288              
289             The single argument is a hash ref, with the following possible keys.
290              
291             =over
292              
293             =item start_rev
294              
295             The first revision to add to the index. May be given as C<HEAD> to mean
296             the repository's most recent (youngest) revision.
297              
298             This key is mandatory.
299              
300             =item end_rev
301              
302             The last revision to add to the index. May be given as C<HEAD> to mean
303             the repository's most recent (youngest) revision.
304              
305             This key is optional. If not included then only the revision specified
306             by C<start_rev> will be indexed.
307              
308             =back
309              
310             Revisions from C<start_rev> to C<end_rev> are added inclusive.
311             C<start_rev> and C<end_rev> may be given in ascending or descending order.
312             Either:
313              
314             $index->add({ start_rev => 1, end_rev => 10 });
315              
316             or
317              
318             $index->add({ start_rev => 10, end_rev => 1 });
319              
320             In both cases, revisions are indexed in ascending order, so revision 1,
321             followed by revision 2, and so on, up to revision 10.
322              
323             =cut
324              
325             sub add {  # Index the inclusive revision range start_rev..end_rev; returns 1.
326             my $self = shift;
327             my $args = validate(@_, {
328             start_rev => {
329             type => SCALAR
330             },
331             end_rev => {
332             type => SCALAR,
333             optional => 1
334             },
335             });
336              
337             $args->{end_rev} = $args->{start_rev} unless defined $args->{end_rev};  # Single-revision form.
338              
339             foreach (qw(start_rev end_rev)) {
340             throw_args("$_ value '$args->{$_}' is invalid")
341             if $args->{$_} !~ /^(?:\d+|HEAD)$/;  # Each value must be a run of digits or the literal HEAD.
342             }
343              
344             # Get start_rev and end_rev in to ascending order.
345             if($args->{start_rev} ne $args->{end_rev} and $args->{end_rev} ne 'HEAD') {  # When end_rev is HEAD no swap is attempted.
346             if(($args->{start_rev} eq 'HEAD') or ($args->{start_rev} > $args->{end_rev})) {  # The numeric compare only runs when neither value is HEAD.
347             ($args->{start_rev}, $args->{end_rev}) =
348             ($args->{end_rev}, $args->{start_rev});
349             }
350             }
351              
352             $self->_create_writer(0);  # Passes a false 'create' flag to the writer.
353              
354             SVN::Log::retrieve ({ repository => $self->{config}{repo_url},
355             start => $args->{start_rev},
356             end => $args->{end_rev},
357             callback => sub { $self->_handle_log({ rev => \@_ }) }  # Hand the callback's argument list to _handle_log() as an array ref.
358             });
359              
360             $self->_delete_writer(1);  # Finish the writer with optimize => 1. NOTE(review): skipped if retrieve() throws.
361              
362             return 1;
363             }
364              
365             sub _handle_log {  # Callback target for add(): index one revision, then record it as last_indexed_rev.
366             my ($self, $args) = @_;
367              
368             my ($paths, $rev, $author, $date, $msg) = @{$args->{rev}};  # {rev} is the array ref of callback arguments captured in add().
369              
370             my $doc = $self->{writer}->new_doc();
371              
372             $doc->set_value(revision => $rev);
373              
374             # it's certainly possible to get a undefined author, you just need either
375             # mod_dav_svn with no auth, or svnserve with anonymous write access turned
376             # on.
377             $doc->set_value(author => $author) if defined $author;
378              
379             # XXX might want to convert the date to something more easily searchable,
380             # but for now let's settle for just not tokenizing it.
381             $doc->set_value(date => $date);
382              
383             $doc->set_value(paths => join(' ', keys %$paths))
384             if defined $paths; # i'm still not entirely clear how this can happen...
385              
386             $doc->set_value(message => $msg)
387             if defined $msg and $msg !~ m/^\s*$/;  # Skip missing or whitespace-only messages without an "uninitialized" warning.
388              
389             $self->{writer}->add_doc($doc);
390              
391             $self->{config}{last_indexed_rev} = $rev;
392              
393             $self->_save_config();  # config.yaml is rewritten after every revision.
394              
395             return;
396             }
397              
398             =head2 get_last_indexed_rev
399              
400             my $rev = $index->get_last_indexed_rev();
401              
402             Returns the revision number that was most recently added to the index.
403              
404             Most useful in repeated calls to C<add()>.
405              
406             # Loop forever. Every five minutes wake up, and add all newly
407             # committed revisions to the index.
408             while(1) {
409             sleep 300;
410             $index->add({ start_rev => $index->get_last_indexed_rev() + 1,
411             end_rev => 'HEAD' });
412             }
413              
414             The last indexed revision number is saved as a property of the index.
415              
416             =cut
417              
418             sub get_last_indexed_rev {  # Return the last_indexed_rev value held in the loaded configuration.
419             my $self = shift;
420              
421             throw_fault("Can't call get_last_indexed_rev() before open()")  # {config} is only present once open() has loaded it.
422             unless exists $self->{config};
423             throw_fault("Empty configuration") unless defined $self->{config};
424              
425             return $self->{config}{last_indexed_rev};
426             }
427              
428             =head2 search
429              
430             my $hits = $index->search($query);
431              
432             Searches for $query and returns a KinoSearch::Search::Hits object which
433             contains the result.
434              
435             =cut
436              
437             sub search {  # Run $query against the index and return the searcher's result.
438             my ($self, $query) = @_;
439              
440             throw_fault("open() must be called first")  # {config} is only present once open() has loaded it.
441             unless exists $self->{config};
442              
443             my $searcher = KinoSearch::Searcher->new(  # A new searcher is built on every call.
444             invindex => $self->{index_path},
445             analyzer => $self->{analyzer},  # The same analyzer instance that the writer is given.
446             );
447              
448             return $searcher->search(query => $query);
449             }
450              
451             =head1 QUERY SYNTAX
452              
453             This module supports the Lucene query syntax, described in detail at
454             L<http://lucene.apache.org/java/docs/queryparsersyntax.html>. A brief
455             overview follows.
456              
457             =over
458              
459             =item *
460              
461             A query consists of one or more terms, joined with boolean operators.
462              
463             =item *
464              
465             A term is either a single word, or two or more words, enclosed in double
466             quotes. So
467              
468             foo bar baz
469              
470             is a different query from
471              
472             "foo bar" baz
473              
474             The first searches for any of C<foo>, C<bar>, or C<baz>, the second
475             searches for any of C<foo bar>, or C<baz>.
476              
477             =item *
478              
479             By default, multiple terms in a query are OR'd together. You may also
480             use C<AND>, or C<NOT> between terms.
481              
482             foo AND bar
483             foo NOT bar
484              
485             Use C<+> before a term to indicate that it must appear, and C<->
486             before a term to indicate that it must not appear.
487              
488             foo +bar
489             -foo bar
490              
491             =item *
492              
493             Use parentheses to control the ordering.
494              
495             (foo OR bar) AND baz
496              
497             =item *
498              
499             Searches are conducted in I<fields>. The default field to search is
500             the log message. Other fields are indicated by placing the field name
501             before the term, separating them both with a C<:>.
502              
503             Available fields are:
504              
505             =over
506              
507             =item revision
508              
509             =item author
510              
511             =item date
512              
513             =item paths
514              
515             =back
516              
517             For example, to find all commit messages where C<nik> was the committer,
518             that contained the string "foo bar":
519              
520             author:nik AND "foo bar"
521              
522             =back
523              
524             =head1 DIAGNOSTICS
525              
526             Any of these methods may fail. If they do, they throw an
527             L<Exception::Class> subclass representing the error, trappable with
528             C<eval>. Uncaught exceptions will cause the client application to
529             C<die>.
530              
531             =head2 SVN::Log::Index::X::Args
532              
533             Represents an error that occurs if the parameters given to any of the
534             methods are wrong. This might be because there are too few or too many
535             parameters, or that the types of those parameters are wrong.
536              
537             The text of the error can be retrieved with the C<error()> method.
538              
539             =head2 SVN::Log::Index::X::Fault
540              
541             Represents any other error.
542              
543             =head2 Example
544              
545             my $e;
546             eval { $index->search('query string'); };
547              
548             if($e = SVN::Log::Index::X::Fault->caught()) {
549             print "An error occurred: ", $e->string(), "\n";
550             } elsif ($e = Exception::Class->caught()) {
551             # Something else failed, rethrow the error
552             ref $e ? $e->rethrow() : die $e;
553             }
554              
555             =head1 SEE ALSO
556              
557             L<SVN::Log>, L<KinoSearch>
558              
559             =head1 BUGS
560              
561             Please report any bugs or feature requests to
562             C<bug-svn-log-index@rt.cpan.org>, or through the web interface at
563             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=SVN-Log-Index>.
564             I will be notified, and then you'll automatically be notified of progress on
565             your bug as I make changes.
566              
567             =head1 AUTHOR
568              
569             The current maintainer is Nik Clayton, .
570              
571             The original author was Garrett Rooney,
572              
573             =head1 COPYRIGHT AND LICENSE
574              
575             Copyright 2006-2007 Nik Clayton. All Rights Reserved.
576              
577             Copyright 2004 Garrett Rooney. All Rights Reserved.
578              
579             This software is licensed under the same terms as Perl itself.
580              
581             =cut
582              
583             1;