File Coverage

blib/lib/Search/Lemur.pm
Criterion Covered Total %
statement 90 124 72.5
branch 23 32 71.8
condition n/a
subroutine 15 19 78.9
pod 6 6 100.0
total 134 181 74.0


line stmt bran cond sub pod time code
1             package Search::Lemur;
2              
3 5     5   159487 use warnings;
  5         13  
  5         169  
4 5     5   27 use strict;
  5         10  
  5         166  
5 5     5   27 use Carp qw( carp );
  5         14  
  5         263  
6              
7 5     5   2782 use Search::Lemur::Result;
  5         12  
  5         148  
8 5     5   2513 use Search::Lemur::ResultItem;
  5         13  
  5         131  
9 5     5   2645 use Search::Lemur::Database;
  5         15  
  5         132  
10              
11 5     5   4573 use LWP;
  5         308759  
  5         217  
12 5     5   69 use Data::Dumper;
  5         10  
  5         415  
13              
14 5     5   50 use vars qw( $VERSION );
  5         8  
  5         7378  
15              
16             =head1 NAME
17              
18             Lemur - class to query a Lemur server, and parse the results
19              
20             =head1 VERSION
21              
22             Version 1.00
23              
24             =cut
25              
26             our $VERSION = '1.00';
27              
28             =head1 SYNOPSYS
29              
30             use Search::Lemur;
31              
32             my $lem = Search::Lemur->new("http://url/to/lemur.cgi");
33              
34             # run some queries, and get back an array of results
35             # a query with a single term:
36             my @results1 = $lem->query("encryption");
37             # a query with two terms:
38             my @results2 = $lem->query("encryption MD5");
39              
40             # get corpus term frequency of 'MD5':
41             my $md5ctf = $results2[1]->ctf();
42              
43             =head1 DESCRIPTION
44              
45             This module will make it easy to interact with a Lemur
46             Toolkit for Language Modeling and Information Retrieval
47             server for information retreival exercises. For more
48             information on Lemur, see L.
49              
50             This module takes care of all parsing of responses from
51             the server. You can just pass a query as a
52             space-separated list of terms, and the module will give
53             you back an array of C objects.
54              
55             =cut
56              
57              
58             =head2 Main Methods
59              
60             =over 2
61              
62             =item new($url)
63              
64             Create a new Lemur object, connecting to the given Lemur server.
65             The C<$url> should be a full URL, ending in something like 'lemur.cgi'.
66              
67             =cut
68              
69             sub new {
70 4     4 1 949 my $class = shift;
71 4         9 my $url;
72 4 100       65 if (@_) { $url = shift;
  3         10  
73 1         4 } else { return undef; }
74 3         20 my $self = { baseurl => $url,
75             db => 0,
76             n => undef,
77             fullurl => undef };
78 3         12 bless $self, $class;
79 3         14 $self->{fullurl} = $self->_makeurl();
80 3         10 return $self;
81             }
82              
83             =item url()
84              
85             Return the URL of the Lemur server
86              
87             =cut
88              
89             sub url {
90 7     7 1 9 my $self = shift;
91 7         37 return $self->{baseurl};
92             }
93              
94             =item listdb()
95              
96             Get some information about the databases available
97              
98             Returns an array of Lemur::Database objects.
99              
100             =cut
101              
102             sub listdb {
103 0     0 1 0 my $self = shift;
104 0         0 $self->_makeurl();
105 0         0 my $url = $self->{fullurl} . "&d=?";
106 0         0 my $result = $self->_strip($url);
107 0         0 return $self->_makedbs($result);
108             }
109              
110             =item d([num])
111              
112             Set the database number to query. This will specify the
113             database number instead of just using the default databse 0.
114              
115             If the C is not specified, the the current database is returned.
116              
117             =cut
118              
119             sub d {
120 1     1 1 3 my $self = shift;
121 1 50       4 if (@_) { $self->{d} = shift; $self->_makeurl(); }
  1         2  
  1         4  
122 1         3 return $self->{d};
123             }
124              
125            
126              
127             =item v(string)
128              
129             Make a query to the Lemur server. The query should be a space-delimited
130             list of query terms. If the URL is has not been specified, this will die.
131              
132             Be sure there is only one space between words, or something unexpected may
133             happen.
134              
135             Returns an array of results (See L). There will
136             be a result for each query term.
137              
138             =cut
139              
140             # This method really just queries the server, and passes the response on to
141             # &_parse(string). This was done to make testing easier, without having to
142             # query a real server for testing.
143             sub v {
144 0     0 1 0 my $self = shift;
145 0         0 my $query = shift;
146 0         0 $query =~ s/ +/ /g;
147 0 0       0 croak("Something went wrong; I have no URL") unless $self->{baseurl};
148 0         0 my @terms = split(/ +/, $query);
149 0         0 my $url = $self->{fullurl};
150 0         0 foreach my $term (@terms) {
151 0         0 $url = "$url&v=$term";
152             }
153 0         0 return $self->_parse([$query, $self->_strip($url)]);
154             }
155              
156             =item m(string)
157              
158             Returns the lexicalized (stopped & stemmed) version of the given
159             word. This is affected by weather or not the current database
160             is stemmed and/or stopworded. Basically, this is the real word
161             you will end up searching for.
162              
163             Returns a string.
164              
165             =cut
166              
167             sub m {
168 0     0 1 0 my $self = shift;
169 0         0 my $word = shift;
170 0         0 my $url = $self->{fullurl} . "&m=$word";
171 0         0 my $return = $self->_strip($url);
172 0 0       0 if ($return eq "[OOV]") { $return = ""; }
  0         0  
173 0         0 return $return;
174             }
175              
176             # parse information about available databases into an array of
177             # Search::Lemur::Database objects
178             #
179             # string -> arrayref
180             sub _makedbs {
181 1     1   11 my $self = shift;
182 1         2 my $input = shift;
183 1         11 my @input = split(/\n/, $input);
184 1         2 my @return;
185 1         1 my ($num, $title, $stop, $stem, $numdocs,
186             $numterms, $numuniq, $avgdoclen);
187 1         4 while (scalar(@input) >= 1){
188 24         24 my $line = shift(@input);
189 24 100       129 if ($line =~ m/(\d*): ([\w|\d|\s]*) (NOSTOP|STOP) (NOSTEMM|STEMM);/){
    100          
    100          
    100          
    100          
    50          
190 4         7 $num = $1;
191 4         7 $title = $2;
192 4 100       9 $stop = ($3 eq "STOP") ? 1 : 0;
193 4 100       13 $stem = ($4 eq "STEMM") ? 1 : 0;
194             } elsif ($line =~ m/ NUM_DOCS = ?(\d*);/){
195 4         9 $numdocs = $1;
196             } elsif ($line =~ m/ NUM_UNIQUE_TERMS = ?(\d*);/){
197 4         11 $numuniq = $1;
198             } elsif ($line =~ m/ NUM_TERMS = ?(\d*);/){
199 4         16 $numterms = $1;
200             } elsif ($line =~ m/ AVE_DOCLEN = ?(\d*);/){
201 4         12 $avgdoclen = $1;
202             } elsif ($line =~ m/
/){
203 4         16 my $db = Search::Lemur::Database->_new($num, $title, $stop,
204             $stem, $numdocs, $numterms, $numuniq, $avgdoclen);
205 4         10 push @return, $db;
206             }
207             }
208 1         4 return \@return;
209             }
210              
211             # parse the result from the server
212             #
213             # Takes a reference to an array with two items:
214             # - a string containing the query terms, separated by spaces
215             # - a string containing the response
216             #
217             # returns array of results
218             sub _parse {
219 2     2   459 my $self = shift;
220 2         4 my $inputref = shift;
221 2         6 my @input = @$inputref;
222 2         6 my @terms = split(/ /, $input[0]);
223             # print Dumper($input[1]);
224 2         17 my @response = split(/\D+/, $input[1]);
225 2 50       11 shift(@response) if ($response[0] eq ""); #TODO Why am I doing this? this makes tests fail.
226 2         5 my $numterms = scalar(@terms);
227              
228 2         3 my @return;
229            
230             # build a result object for each term
231 2         3 foreach my $term (@terms) {
232             # print Dumper(@response);
233 2         6 my $ctf = shift(@response);
234 2         3 my $df = shift(@response);
235 2         16 my $result = Search::Lemur::Result->_new($term, $ctf, $df);
236             # build a resultItem object for each document
237 2         10 for (my $i = 0; $i < $df; $i++){
238 3         5 my $docid = shift(@response);
239 3         6 my $doclen = shift(@response);
240 3         14 my $tf = shift(@response);
241 3         16 my $resultItem = Search::Lemur::ResultItem->_new($docid, $doclen, $tf);
242 3         11 $result->_add($resultItem);
243             }
244 2         6 push(@return, $result);
245             }
246              
247 2         11 return \@return;
248             }
249              
250             # build the full url to use for all queries
251             # This url consists of the base url (ending in lemur.cgi) plus
252             # d=n (specifies the database) and n=x (the number of results
253             # to return. If either of these are undef, then they are left
254             # off, and the server is free to use its defaults
255             #
256             # the n value seems to only affect the q= query, and not the
257             # inverted list v= query.
258             #
259             # returns a string, and updates the fullurl instance variable
260             sub _makeurl {
261 7     7   21 my $self = shift;
262 7         22 my $return = $self->url() . "?g=p";
263 7 100       35 if ($self->{d}) { $return = $return . "&d=$self->{d}"; }
  3         6  
264 7 100       23 if ($self->{n}) { $return = $return . "&n=$self->{n}"; }
  1         4  
265 7         17 $self->{fullurl} = $return;
266 7         24 return $return;
267             }
268            
269             # strip_: make a request to the server, and strip out anything
270             # useless
271             #
272             # This will get the result from the server, and strip put any
273             # html, etc that is not useful to the parser.
274             #
275             # string -> string
276             #
277             # takes in a url argument to fetch, and returns the stripped
278             # result.
279             sub _strip {
280 0     0     my $self = shift;
281 0           my $url = shift;
282             # print "$url\n\n";
283 0           my $ua = LWP::UserAgent->new;
284 0           $ua->agent("Lemur.pm/$VERSION");
285 0           my $req = HTTP::Request->new(GET => $url);
286 0           $req->content_type('application/x-www-form-urlencoded');
287 0           $req->content('query=libwww-perl&mode=dist');
288             # make request
289 0           my $res = $ua->request($req);
290              
291 0 0         if ($res->is_success) {
292 0           $res->content() =~ m/.*\n\n((\s|\d|\n|\w|\[|\]|:|;|=|<|>)*?)\n
/;
293             # print $1 . "\n\n";
294 0           return $1;
295             }
296             else {
297 0           Carp::carp($res->status_line, "\n");
298 0           return undef;
299             }
300             }
301              
302              
303              
304              
305             =back
306              
307             =head1 AUTHOR
308              
309             Patrick Kaeding, C<< >>
310              
311             =head1 BUGS
312              
313             Please report any bugs or feature requests to
314             C, or through the web interface at
315             L.
316             I will be notified, and then you'll automatically be notified of progress on
317             your bug as I make changes.
318              
319             =head1 SUPPORT
320              
321             You can find documentation for this module with the perldoc command.
322              
323             perldoc Search::Lemur
324              
325             You can also look for information at:
326              
327             =over 4
328              
329             =item * AnnoCPAN: Annotated CPAN documentation
330              
331             L
332              
333             =item * CPAN Ratings
334              
335             L
336              
337             =item * RT: CPAN's request tracker
338              
339             L
340              
341             =item * Search CPAN
342              
343             L
344              
345             =back
346              
347             =head1 ACKNOWLEDGEMENTS
348              
349             =head1 COPYRIGHT & LICENSE
350              
351             Copyright 2007 Patrick Kaeding, all rights reserved.
352              
353             This program is free software; you can redistribute it and/or modify it
354             under the same terms as Perl itself.
355              
356             =cut
357              
358             1; # End of Search::Lemur