File Coverage

blib/lib/Search/Lemur.pm

Criterion	Covered	Total	%
statement	90	124	72.5
branch	23	32	71.8
condition			n/a
subroutine	15	19	78.9
pod	6	6	100.0
total	134	181	74.0

line	stmt	bran	sub	pod	time	code
1						package Search::Lemur;
2
3	5		5		159487	use warnings;
	5				13
	5				169
4	5		5		27	use strict;
	5				10
	5				166
5	5		5		27	use Carp qw( carp );
	5				14
	5				263
6
7	5		5		2782	use Search::Lemur::Result;
	5				12
	5				148
8	5		5		2513	use Search::Lemur::ResultItem;
	5				13
	5				131
9	5		5		2645	use Search::Lemur::Database;
	5				15
	5				132
10
11	5		5		4573	use LWP;
	5				308759
	5				217
12	5		5		69	use Data::Dumper;
	5				10
	5				415
13
14	5		5		50	use vars qw( $VERSION );
	5				8
	5				7378
15
16						=head1 NAME
17
18						Lemur - class to query a Lemur server, and parse the results
19
20						=head1 VERSION
21
22						Version 1.00
23
24						=cut
25
26						our $VERSION = '1.00';
27
28						=head1 SYNOPSYS
29
30						use Search::Lemur;
31
32						my $lem = Search::Lemur->new("http://url/to/lemur.cgi");
33
34						# run some queries, and get back an array of results
35						# a query with a single term:
36						my @results1 = $lem->query("encryption");
37						# a query with two terms:
38						my @results2 = $lem->query("encryption MD5");
39
40						# get corpus term frequency of 'MD5':
41						my $md5ctf = $results2[1]->ctf();
42
43						=head1 DESCRIPTION
44
45						This module will make it easy to interact with a Lemur
46						Toolkit for Language Modeling and Information Retrieval
47						server for information retreival exercises. For more
48						information on Lemur, see L.
49
50						This module takes care of all parsing of responses from
51						the server. You can just pass a query as a
52						space-separated list of terms, and the module will give
53						you back an array of C objects.
54
55						=cut
56
57
58						=head2 Main Methods
59
60						=over 2
61
62						=item new($url)
63
64						Create a new Lemur object, connecting to the given Lemur server.
65						The C<$url> should be a full URL, ending in something like 'lemur.cgi'.
66
67						=cut
68
69						sub new {
70	4		4	1	949	my $class = shift;
71	4				9	my $url;
72	4	100			65	if (@_) { $url = shift;
	3				10
73	1				4	} else { return undef; }
74	3				20	my $self = { baseurl => $url,
75						db => 0,
76						n => undef,
77						fullurl => undef };
78	3				12	bless $self, $class;
79	3				14	$self->{fullurl} = $self->_makeurl();
80	3				10	return $self;
81						}
82
83						=item url()
84
85						Return the URL of the Lemur server
86
87						=cut
88
89						sub url {
90	7		7	1	9	my $self = shift;
91	7				37	return $self->{baseurl};
92						}
93
94						=item listdb()
95
96						Get some information about the databases available
97
98						Returns an array of Lemur::Database objects.
99
100						=cut
101
102						sub listdb {
103	0		0	1	0	my $self = shift;
104	0				0	$self->_makeurl();
105	0				0	my $url = $self->{fullurl} . "&d=?";
106	0				0	my $result = $self->_strip($url);
107	0				0	return $self->_makedbs($result);
108						}
109
110						=item d([num])
111
112						Set the database number to query. This will specify the
113						database number instead of just using the default databse 0.
114
115						If the C is not specified, the the current database is returned.
116
117						=cut
118
119						sub d {
120	1		1	1	3	my $self = shift;
121	1	50			4	if (@_) { $self->{d} = shift; $self->_makeurl(); }
	1				2
	1				4
122	1				3	return $self->{d};
123						}
124
125
126
127						=item v(string)
128
129						Make a query to the Lemur server. The query should be a space-delimited
130						list of query terms. If the URL is has not been specified, this will die.
131
132						Be sure there is only one space between words, or something unexpected may
133						happen.
134
135						Returns an array of results (See L). There will
136						be a result for each query term.
137
138						=cut
139
140						# This method really just queries the server, and passes the response on to
141						# &_parse(string). This was done to make testing easier, without having to
142						# query a real server for testing.
143						sub v {
144	0		0	1	0	my $self = shift;
145	0				0	my $query = shift;
146	0				0	$query =~ s/ +/ /g;
147	0	0			0	croak("Something went wrong; I have no URL") unless $self->{baseurl};
148	0				0	my @terms = split(/ +/, $query);
149	0				0	my $url = $self->{fullurl};
150	0				0	foreach my $term (@terms) {
151	0				0	$url = "$url&v=$term";
152						}
153	0				0	return $self->_parse([$query, $self->_strip($url)]);
154						}
155
156						=item m(string)
157
158						Returns the lexicalized (stopped & stemmed) version of the given
159						word. This is affected by weather or not the current database
160						is stemmed and/or stopworded. Basically, this is the real word
161						you will end up searching for.
162
163						Returns a string.
164
165						=cut
166
167						sub m {
168	0		0	1	0	my $self = shift;
169	0				0	my $word = shift;
170	0				0	my $url = $self->{fullurl} . "&m=$word";
171	0				0	my $return = $self->_strip($url);
172	0	0			0	if ($return eq "[OOV]") { $return = ""; }
	0				0
173	0				0	return $return;
174						}
175
176						# parse information about available databases into an array of
177						# Search::Lemur::Database objects
178						#
179						# string -> arrayref
180						sub _makedbs {
181	1		1		11	my $self = shift;
182	1				2	my $input = shift;
183	1				11	my @input = split(/\n/, $input);
184	1				2	my @return;
185	1				1	my ($num, $title, $stop, $stem, $numdocs,
186						$numterms, $numuniq, $avgdoclen);
187	1				4	while (scalar(@input) >= 1){
188	24				24	my $line = shift(@input);
189	24	100			129	if ($line =~ m/(\d): ([\w\|\d\|\s]) (NOSTOP\|STOP) (NOSTEMM\|STEMM);/){
		100
		100
		100
		100
		50
190	4				7	$num = $1;
191	4				7	$title = $2;
192	4	100			9	$stop = ($3 eq "STOP") ? 1 : 0;
193	4	100			13	$stem = ($4 eq "STEMM") ? 1 : 0;
194						} elsif ($line =~ m/ NUM_DOCS = ?(\d*);/){
195	4				9	$numdocs = $1;
196						} elsif ($line =~ m/ NUM_UNIQUE_TERMS = ?(\d*);/){
197	4				11	$numuniq = $1;
198						} elsif ($line =~ m/ NUM_TERMS = ?(\d*);/){
199	4				16	$numterms = $1;
200						} elsif ($line =~ m/ AVE_DOCLEN = ?(\d*);/){
201	4				12	$avgdoclen = $1;
202						} elsif ($line =~ m/ /){
203	4				16	my $db = Search::Lemur::Database->_new($num, $title, $stop,
204						$stem, $numdocs, $numterms, $numuniq, $avgdoclen);
205	4				10	push @return, $db;
206						}
207						}
208	1				4	return \@return;
209						}
210
211						# parse the result from the server
212						#
213						# Takes a reference to an array with two items:
214						# - a string containing the query terms, separated by spaces
215						# - a string containing the response
216						#
217						# returns array of results
218						sub _parse {
219	2		2		459	my $self = shift;
220	2				4	my $inputref = shift;
221	2				6	my @input = @$inputref;
222	2				6	my @terms = split(/ /, $input[0]);
223						# print Dumper($input[1]);
224	2				17	my @response = split(/\D+/, $input[1]);
225	2	50			11	shift(@response) if ($response[0] eq ""); #TODO Why am I doing this? this makes tests fail.
226	2				5	my $numterms = scalar(@terms);
227
228	2				3	my @return;
229
230						# build a result object for each term
231	2				3	foreach my $term (@terms) {
232						# print Dumper(@response);
233	2				6	my $ctf = shift(@response);
234	2				3	my $df = shift(@response);
235	2				16	my $result = Search::Lemur::Result->_new($term, $ctf, $df);
236						# build a resultItem object for each document
237	2				10	for (my $i = 0; $i < $df; $i++){
238	3				5	my $docid = shift(@response);
239	3				6	my $doclen = shift(@response);
240	3				14	my $tf = shift(@response);
241	3				16	my $resultItem = Search::Lemur::ResultItem->_new($docid, $doclen, $tf);
242	3				11	$result->_add($resultItem);
243						}
244	2				6	push(@return, $result);
245						}
246
247	2				11	return \@return;
248						}
249
250						# build the full url to use for all queries
251						# This url consists of the base url (ending in lemur.cgi) plus
252						# d=n (specifies the database) and n=x (the number of results
253						# to return. If either of these are undef, then they are left
254						# off, and the server is free to use its defaults
255						#
256						# the n value seems to only affect the q= query, and not the
257						# inverted list v= query.
258						#
259						# returns a string, and updates the fullurl instance variable
260						sub _makeurl {
261	7		7		21	my $self = shift;
262	7				22	my $return = $self->url() . "?g=p";
263	7	100			35	if ($self->{d}) { $return = $return . "&d=$self->{d}"; }
	3				6
264	7	100			23	if ($self->{n}) { $return = $return . "&n=$self->{n}"; }
	1				4
265	7				17	$self->{fullurl} = $return;
266	7				24	return $return;
267						}
268
269						# strip_: make a request to the server, and strip out anything
270						# useless
271						#
272						# This will get the result from the server, and strip put any
273						# html, etc that is not useful to the parser.
274						#
275						# string -> string
276						#
277						# takes in a url argument to fetch, and returns the stripped
278						# result.
279						sub _strip {
280	0		0			my $self = shift;
281	0					my $url = shift;
282						# print "$url\n\n";
283	0					my $ua = LWP::UserAgent->new;
284	0					$ua->agent("Lemur.pm/$VERSION");
285	0					my $req = HTTP::Request->new(GET => $url);
286	0					$req->content_type('application/x-www-form-urlencoded');
287	0					$req->content('query=libwww-perl&mode=dist');
288						# make request
289	0					my $res = $ua->request($req);
290
291	0	0				if ($res->is_success) {
292	0					$res->content() =~ m/.\n\n((\s\|\d\|\n\|\w\|\[\|\]\|:\|;\|=\|<\|>)?)\n /;
293						# print $1 . "\n\n";
294	0					return $1;
295						}
296						else {
297	0					Carp::carp($res->status_line, "\n");
298	0					return undef;
299						}
300						}
301
302
303
304
305						=back
306
307						=head1 AUTHOR
308
309						Patrick Kaeding, C<< >>
310
311						=head1 BUGS
312
313						Please report any bugs or feature requests to
314						C, or through the web interface at
315						L.
316						I will be notified, and then you'll automatically be notified of progress on
317						your bug as I make changes.
318
319						=head1 SUPPORT
320
321						You can find documentation for this module with the perldoc command.
322
323						perldoc Search::Lemur
324
325						You can also look for information at:
326
327						=over 4
328
329						=item * AnnoCPAN: Annotated CPAN documentation
330
331						L
332
333						=item * CPAN Ratings
334
335						L
336
337						=item * RT: CPAN's request tracker
338
339						L
340
341						=item * Search CPAN
342
343						L
344
345						=back
346
347						=head1 ACKNOWLEDGEMENTS
348
349						=head1 COPYRIGHT & LICENSE
350
351						Copyright 2007 Patrick Kaeding, all rights reserved.
352
353						This program is free software; you can redistribute it and/or modify it
354						under the same terms as Perl itself.
355
356						=cut
357
358						1; # End of Search::Lemur