| blib/lib/Search/Lemur.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 90 | 124 | 72.5 |
| branch | 23 | 32 | 71.8 |
| condition | n/a | ||
| subroutine | 15 | 19 | 78.9 |
| pod | 6 | 6 | 100.0 |
| total | 134 | 181 | 74.0 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package Search::Lemur; | ||||||
| 2 | |||||||
| 3 | 5 | 5 | 159487 | use warnings; | |||
| 5 | 13 | ||||||
| 5 | 169 | ||||||
| 4 | 5 | 5 | 27 | use strict; | |||
| 5 | 10 | ||||||
| 5 | 166 | ||||||
| 5 | 5 | 5 | 27 | use Carp qw( carp ); | |||
| 5 | 14 | ||||||
| 5 | 263 | ||||||
| 6 | |||||||
| 7 | 5 | 5 | 2782 | use Search::Lemur::Result; | |||
| 5 | 12 | ||||||
| 5 | 148 | ||||||
| 8 | 5 | 5 | 2513 | use Search::Lemur::ResultItem; | |||
| 5 | 13 | ||||||
| 5 | 131 | ||||||
| 9 | 5 | 5 | 2645 | use Search::Lemur::Database; | |||
| 5 | 15 | ||||||
| 5 | 132 | ||||||
| 10 | |||||||
| 11 | 5 | 5 | 4573 | use LWP; | |||
| 5 | 308759 | ||||||
| 5 | 217 | ||||||
| 12 | 5 | 5 | 69 | use Data::Dumper; | |||
| 5 | 10 | ||||||
| 5 | 415 | ||||||
| 13 | |||||||
| 14 | 5 | 5 | 50 | use vars qw( $VERSION ); | |||
| 5 | 8 | ||||||
| 5 | 7378 | ||||||
| 15 | |||||||
| 16 | =head1 NAME | ||||||
| 17 | |||||||
| 18 | Lemur - class to query a Lemur server, and parse the results | ||||||
| 19 | |||||||
| 20 | =head1 VERSION | ||||||
| 21 | |||||||
| 22 | Version 1.00 | ||||||
| 23 | |||||||
| 24 | =cut | ||||||
| 25 | |||||||
| 26 | our $VERSION = '1.00'; | ||||||
| 27 | |||||||
| 28 | =head1 SYNOPSYS | ||||||
| 29 | |||||||
| 30 | use Search::Lemur; | ||||||
| 31 | |||||||
| 32 | my $lem = Search::Lemur->new("http://url/to/lemur.cgi"); | ||||||
| 33 | |||||||
| 34 | # run some queries, and get back an array of results | ||||||
| 35 | # a query with a single term: | ||||||
| 36 | my @results1 = $lem->query("encryption"); | ||||||
| 37 | # a query with two terms: | ||||||
| 38 | my @results2 = $lem->query("encryption MD5"); | ||||||
| 39 | |||||||
| 40 | # get corpus term frequency of 'MD5': | ||||||
| 41 | my $md5ctf = $results2[1]->ctf(); | ||||||
| 42 | |||||||
| 43 | =head1 DESCRIPTION | ||||||
| 44 | |||||||
| 45 | This module will make it easy to interact with a Lemur | ||||||
| 46 | Toolkit for Language Modeling and Information Retrieval | ||||||
| 47 | server for information retreival exercises. For more | ||||||
| 48 | information on Lemur, see L |
||||||
| 49 | |||||||
| 50 | This module takes care of all parsing of responses from | ||||||
| 51 | the server. You can just pass a query as a | ||||||
| 52 | space-separated list of terms, and the module will give | ||||||
| 53 | you back an array of C |
||||||
| 54 | |||||||
| 55 | =cut | ||||||
| 56 | |||||||
| 57 | |||||||
| 58 | =head2 Main Methods | ||||||
| 59 | |||||||
| 60 | =over 2 | ||||||
| 61 | |||||||
| 62 | =item new($url) | ||||||
| 63 | |||||||
| 64 | Create a new Lemur object, connecting to the given Lemur server. | ||||||
| 65 | The C<$url> should be a full URL, ending in something like 'lemur.cgi'. | ||||||
| 66 | |||||||
| 67 | =cut | ||||||
| 68 | |||||||
| 69 | sub new { | ||||||
| 70 | 4 | 4 | 1 | 949 | my $class = shift; | ||
| 71 | 4 | 9 | my $url; | ||||
| 72 | 4 | 100 | 65 | if (@_) { $url = shift; | |||
| 3 | 10 | ||||||
| 73 | 1 | 4 | } else { return undef; } | ||||
| 74 | 3 | 20 | my $self = { baseurl => $url, | ||||
| 75 | db => 0, | ||||||
| 76 | n => undef, | ||||||
| 77 | fullurl => undef }; | ||||||
| 78 | 3 | 12 | bless $self, $class; | ||||
| 79 | 3 | 14 | $self->{fullurl} = $self->_makeurl(); | ||||
| 80 | 3 | 10 | return $self; | ||||
| 81 | } | ||||||
| 82 | |||||||
| 83 | =item url() | ||||||
| 84 | |||||||
| 85 | Return the URL of the Lemur server | ||||||
| 86 | |||||||
| 87 | =cut | ||||||
| 88 | |||||||
| 89 | sub url { | ||||||
| 90 | 7 | 7 | 1 | 9 | my $self = shift; | ||
| 91 | 7 | 37 | return $self->{baseurl}; | ||||
| 92 | } | ||||||
| 93 | |||||||
| 94 | =item listdb() | ||||||
| 95 | |||||||
| 96 | Get some information about the databases available | ||||||
| 97 | |||||||
| 98 | Returns an array of Lemur::Database objects. | ||||||
| 99 | |||||||
| 100 | =cut | ||||||
| 101 | |||||||
| 102 | sub listdb { | ||||||
| 103 | 0 | 0 | 1 | 0 | my $self = shift; | ||
| 104 | 0 | 0 | $self->_makeurl(); | ||||
| 105 | 0 | 0 | my $url = $self->{fullurl} . "&d=?"; | ||||
| 106 | 0 | 0 | my $result = $self->_strip($url); | ||||
| 107 | 0 | 0 | return $self->_makedbs($result); | ||||
| 108 | } | ||||||
| 109 | |||||||
| 110 | =item d([num]) | ||||||
| 111 | |||||||
| 112 | Set the database number to query. This will specify the | ||||||
| 113 | database number instead of just using the default databse 0. | ||||||
| 114 | |||||||
| 115 | If the C |
||||||
| 116 | |||||||
| 117 | =cut | ||||||
| 118 | |||||||
| 119 | sub d { | ||||||
| 120 | 1 | 1 | 1 | 3 | my $self = shift; | ||
| 121 | 1 | 50 | 4 | if (@_) { $self->{d} = shift; $self->_makeurl(); } | |||
| 1 | 2 | ||||||
| 1 | 4 | ||||||
| 122 | 1 | 3 | return $self->{d}; | ||||
| 123 | } | ||||||
| 124 | |||||||
| 125 | |||||||
| 126 | |||||||
| 127 | =item v(string) | ||||||
| 128 | |||||||
| 129 | Make a query to the Lemur server. The query should be a space-delimited | ||||||
| 130 | list of query terms. If the URL is has not been specified, this will die. | ||||||
| 131 | |||||||
| 132 | Be sure there is only one space between words, or something unexpected may | ||||||
| 133 | happen. | ||||||
| 134 | |||||||
| 135 | Returns an array of results (See L |
||||||
| 136 | be a result for each query term. | ||||||
| 137 | |||||||
| 138 | =cut | ||||||
| 139 | |||||||
| 140 | # This method really just queries the server, and passes the response on to | ||||||
| 141 | # &_parse(string). This was done to make testing easier, without having to | ||||||
| 142 | # query a real server for testing. | ||||||
| 143 | sub v { | ||||||
| 144 | 0 | 0 | 1 | 0 | my $self = shift; | ||
| 145 | 0 | 0 | my $query = shift; | ||||
| 146 | 0 | 0 | $query =~ s/ +/ /g; | ||||
| 147 | 0 | 0 | 0 | croak("Something went wrong; I have no URL") unless $self->{baseurl}; | |||
| 148 | 0 | 0 | my @terms = split(/ +/, $query); | ||||
| 149 | 0 | 0 | my $url = $self->{fullurl}; | ||||
| 150 | 0 | 0 | foreach my $term (@terms) { | ||||
| 151 | 0 | 0 | $url = "$url&v=$term"; | ||||
| 152 | } | ||||||
| 153 | 0 | 0 | return $self->_parse([$query, $self->_strip($url)]); | ||||
| 154 | } | ||||||
| 155 | |||||||
| 156 | =item m(string) | ||||||
| 157 | |||||||
| 158 | Returns the lexicalized (stopped & stemmed) version of the given | ||||||
| 159 | word. This is affected by weather or not the current database | ||||||
| 160 | is stemmed and/or stopworded. Basically, this is the real word | ||||||
| 161 | you will end up searching for. | ||||||
| 162 | |||||||
| 163 | Returns a string. | ||||||
| 164 | |||||||
| 165 | =cut | ||||||
| 166 | |||||||
| 167 | sub m { | ||||||
| 168 | 0 | 0 | 1 | 0 | my $self = shift; | ||
| 169 | 0 | 0 | my $word = shift; | ||||
| 170 | 0 | 0 | my $url = $self->{fullurl} . "&m=$word"; | ||||
| 171 | 0 | 0 | my $return = $self->_strip($url); | ||||
| 172 | 0 | 0 | 0 | if ($return eq "[OOV]") { $return = ""; } | |||
| 0 | 0 | ||||||
| 173 | 0 | 0 | return $return; | ||||
| 174 | } | ||||||
| 175 | |||||||
| 176 | # parse information about available databases into an array of | ||||||
| 177 | # Search::Lemur::Database objects | ||||||
| 178 | # | ||||||
| 179 | # string -> arrayref | ||||||
| 180 | sub _makedbs { | ||||||
| 181 | 1 | 1 | 11 | my $self = shift; | |||
| 182 | 1 | 2 | my $input = shift; | ||||
| 183 | 1 | 11 | my @input = split(/\n/, $input); | ||||
| 184 | 1 | 2 | my @return; | ||||
| 185 | 1 | 1 | my ($num, $title, $stop, $stem, $numdocs, | ||||
| 186 | $numterms, $numuniq, $avgdoclen); | ||||||
| 187 | 1 | 4 | while (scalar(@input) >= 1){ | ||||
| 188 | 24 | 24 | my $line = shift(@input); | ||||
| 189 | 24 | 100 | 129 | if ($line =~ m/(\d*): ([\w|\d|\s]*) (NOSTOP|STOP) (NOSTEMM|STEMM);/){ | |||
| 100 | |||||||
| 100 | |||||||
| 100 | |||||||
| 100 | |||||||
| 50 | |||||||
| 190 | 4 | 7 | $num = $1; | ||||
| 191 | 4 | 7 | $title = $2; | ||||
| 192 | 4 | 100 | 9 | $stop = ($3 eq "STOP") ? 1 : 0; | |||
| 193 | 4 | 100 | 13 | $stem = ($4 eq "STEMM") ? 1 : 0; | |||
| 194 | } elsif ($line =~ m/ NUM_DOCS = ?(\d*);/){ | ||||||
| 195 | 4 | 9 | $numdocs = $1; | ||||
| 196 | } elsif ($line =~ m/ NUM_UNIQUE_TERMS = ?(\d*);/){ | ||||||
| 197 | 4 | 11 | $numuniq = $1; | ||||
| 198 | } elsif ($line =~ m/ NUM_TERMS = ?(\d*);/){ | ||||||
| 199 | 4 | 16 | $numterms = $1; | ||||
| 200 | } elsif ($line =~ m/ AVE_DOCLEN = ?(\d*);/){ | ||||||
| 201 | 4 | 12 | $avgdoclen = $1; | ||||
| 202 | } elsif ($line =~ m/ /){ |
||||||
| 203 | 4 | 16 | my $db = Search::Lemur::Database->_new($num, $title, $stop, | ||||
| 204 | $stem, $numdocs, $numterms, $numuniq, $avgdoclen); | ||||||
| 205 | 4 | 10 | push @return, $db; | ||||
| 206 | } | ||||||
| 207 | } | ||||||
| 208 | 1 | 4 | return \@return; | ||||
| 209 | } | ||||||
| 210 | |||||||
| 211 | # parse the result from the server | ||||||
| 212 | # | ||||||
| 213 | # Takes a reference to an array with two items: | ||||||
| 214 | # - a string containing the query terms, separated by spaces | ||||||
| 215 | # - a string containing the response | ||||||
| 216 | # | ||||||
| 217 | # returns array of results | ||||||
| 218 | sub _parse { | ||||||
| 219 | 2 | 2 | 459 | my $self = shift; | |||
| 220 | 2 | 4 | my $inputref = shift; | ||||
| 221 | 2 | 6 | my @input = @$inputref; | ||||
| 222 | 2 | 6 | my @terms = split(/ /, $input[0]); | ||||
| 223 | # print Dumper($input[1]); | ||||||
| 224 | 2 | 17 | my @response = split(/\D+/, $input[1]); | ||||
| 225 | 2 | 50 | 11 | shift(@response) if ($response[0] eq ""); #TODO Why am I doing this? this makes tests fail. | |||
| 226 | 2 | 5 | my $numterms = scalar(@terms); | ||||
| 227 | |||||||
| 228 | 2 | 3 | my @return; | ||||
| 229 | |||||||
| 230 | # build a result object for each term | ||||||
| 231 | 2 | 3 | foreach my $term (@terms) { | ||||
| 232 | # print Dumper(@response); | ||||||
| 233 | 2 | 6 | my $ctf = shift(@response); | ||||
| 234 | 2 | 3 | my $df = shift(@response); | ||||
| 235 | 2 | 16 | my $result = Search::Lemur::Result->_new($term, $ctf, $df); | ||||
| 236 | # build a resultItem object for each document | ||||||
| 237 | 2 | 10 | for (my $i = 0; $i < $df; $i++){ | ||||
| 238 | 3 | 5 | my $docid = shift(@response); | ||||
| 239 | 3 | 6 | my $doclen = shift(@response); | ||||
| 240 | 3 | 14 | my $tf = shift(@response); | ||||
| 241 | 3 | 16 | my $resultItem = Search::Lemur::ResultItem->_new($docid, $doclen, $tf); | ||||
| 242 | 3 | 11 | $result->_add($resultItem); | ||||
| 243 | } | ||||||
| 244 | 2 | 6 | push(@return, $result); | ||||
| 245 | } | ||||||
| 246 | |||||||
| 247 | 2 | 11 | return \@return; | ||||
| 248 | } | ||||||
| 249 | |||||||
| 250 | # build the full url to use for all queries | ||||||
| 251 | # This url consists of the base url (ending in lemur.cgi) plus | ||||||
| 252 | # d=n (specifies the database) and n=x (the number of results | ||||||
| 253 | # to return. If either of these are undef, then they are left | ||||||
| 254 | # off, and the server is free to use its defaults | ||||||
| 255 | # | ||||||
| 256 | # the n value seems to only affect the q= query, and not the | ||||||
| 257 | # inverted list v= query. | ||||||
| 258 | # | ||||||
| 259 | # returns a string, and updates the fullurl instance variable | ||||||
| 260 | sub _makeurl { | ||||||
| 261 | 7 | 7 | 21 | my $self = shift; | |||
| 262 | 7 | 22 | my $return = $self->url() . "?g=p"; | ||||
| 263 | 7 | 100 | 35 | if ($self->{d}) { $return = $return . "&d=$self->{d}"; } | |||
| 3 | 6 | ||||||
| 264 | 7 | 100 | 23 | if ($self->{n}) { $return = $return . "&n=$self->{n}"; } | |||
| 1 | 4 | ||||||
| 265 | 7 | 17 | $self->{fullurl} = $return; | ||||
| 266 | 7 | 24 | return $return; | ||||
| 267 | } | ||||||
| 268 | |||||||
| 269 | # strip_: make a request to the server, and strip out anything | ||||||
| 270 | # useless | ||||||
| 271 | # | ||||||
| 272 | # This will get the result from the server, and strip put any | ||||||
| 273 | # html, etc that is not useful to the parser. | ||||||
| 274 | # | ||||||
| 275 | # string -> string | ||||||
| 276 | # | ||||||
| 277 | # takes in a url argument to fetch, and returns the stripped | ||||||
| 278 | # result. | ||||||
| 279 | sub _strip { | ||||||
| 280 | 0 | 0 | my $self = shift; | ||||
| 281 | 0 | my $url = shift; | |||||
| 282 | # print "$url\n\n"; | ||||||
| 283 | 0 | my $ua = LWP::UserAgent->new; | |||||
| 284 | 0 | $ua->agent("Lemur.pm/$VERSION"); | |||||
| 285 | 0 | my $req = HTTP::Request->new(GET => $url); | |||||
| 286 | 0 | $req->content_type('application/x-www-form-urlencoded'); | |||||
| 287 | 0 | $req->content('query=libwww-perl&mode=dist'); | |||||
| 288 | # make request | ||||||
| 289 | 0 | my $res = $ua->request($req); | |||||
| 290 | |||||||
| 291 | 0 | 0 | if ($res->is_success) { | ||||
| 292 | 0 | $res->content() =~ m/.*\n\n((\s|\d|\n|\w|\[|\]|:|;|=|<|>)*?)\n /; |
|||||
| 293 | # print $1 . "\n\n"; | ||||||
| 294 | 0 | return $1; | |||||
| 295 | } | ||||||
| 296 | else { | ||||||
| 297 | 0 | Carp::carp($res->status_line, "\n"); | |||||
| 298 | 0 | return undef; | |||||
| 299 | } | ||||||
| 300 | } | ||||||
| 301 | |||||||
| 302 | |||||||
| 303 | |||||||
| 304 | |||||||
| 305 | =back | ||||||
| 306 | |||||||
| 307 | =head1 AUTHOR | ||||||
| 308 | |||||||
| 309 | Patrick Kaeding, C<< |
||||||
| 310 | |||||||
| 311 | =head1 BUGS | ||||||
| 312 | |||||||
| 313 | Please report any bugs or feature requests to | ||||||
| 314 | C |
||||||
| 315 | L |
||||||
| 316 | I will be notified, and then you'll automatically be notified of progress on | ||||||
| 317 | your bug as I make changes. | ||||||
| 318 | |||||||
| 319 | =head1 SUPPORT | ||||||
| 320 | |||||||
| 321 | You can find documentation for this module with the perldoc command. | ||||||
| 322 | |||||||
| 323 | perldoc Search::Lemur | ||||||
| 324 | |||||||
| 325 | You can also look for information at: | ||||||
| 326 | |||||||
| 327 | =over 4 | ||||||
| 328 | |||||||
| 329 | =item * AnnoCPAN: Annotated CPAN documentation | ||||||
| 330 | |||||||
| 331 | L |
||||||
| 332 | |||||||
| 333 | =item * CPAN Ratings | ||||||
| 334 | |||||||
| 335 | L |
||||||
| 336 | |||||||
| 337 | =item * RT: CPAN's request tracker | ||||||
| 338 | |||||||
| 339 | L |
||||||
| 340 | |||||||
| 341 | =item * Search CPAN | ||||||
| 342 | |||||||
| 343 | L |
||||||
| 344 | |||||||
| 345 | =back | ||||||
| 346 | |||||||
| 347 | =head1 ACKNOWLEDGEMENTS | ||||||
| 348 | |||||||
| 349 | =head1 COPYRIGHT & LICENSE | ||||||
| 350 | |||||||
| 351 | Copyright 2007 Patrick Kaeding, all rights reserved. | ||||||
| 352 | |||||||
| 353 | This program is free software; you can redistribute it and/or modify it | ||||||
| 354 | under the same terms as Perl itself. | ||||||
| 355 | |||||||
| 356 | =cut | ||||||
| 357 | |||||||
| 358 | 1; # End of Search::Lemur |