File Coverage

blib/lib/WWW/Search.pm
Criterion Covered Total %
statement 371 528 70.2
branch 90 202 44.5
condition 47 76 61.8
subroutine 81 89 91.0
pod 50 51 98.0
total 639 946 67.5


line stmt bran cond sub pod time code
1             # Search.pm
2             # by John Heidemann
3             # Copyright (C) 1996 by USC/ISI
4             # $Id: Search.pm,v 2.565 2008-11-29 05:52:08 Martin Exp $
5             #
6             # A complete copyright notice appears at the end of this file.
7              
8             =head1 NAME
9              
10             WWW::Search - Virtual base class for WWW searches
11              
12             =head1 SYNOPSIS
13              
14             use WWW::Search;
15             my $sEngine = 'AltaVista';
16             my $oSearch = new WWW::Search($sEngine);
17              
18             =head1 DESCRIPTION
19              
20             This class is the parent for all access methods supported by the
21             C library. This library implements a Perl API
22             to web-based search engines.
23              
24             See README for a list of search engines currently supported, and for a
25             lot of interesting high-level information about this distribution.
26              
27             Search results can be limited, and there is a pause between each
28             request to avoid overloading either the client or the server.
29              
30             =head2 Sample program
31              
32             Here is a sample program:
33              
34             my $sQuery = 'Columbus Ohio sushi restaurant';
35             my $oSearch = new WWW::Search('AltaVista');
36             $oSearch->native_query(WWW::Search::escape_query($sQuery));
37             $oSearch->login($sUser, $sPassword);
38             while (my $oResult = $oSearch->next_result())
39             {
40             print $oResult->url, "\n";
41             } # while
42             $oSearch->logout;
43              
44             Results are objects of type C
45             (see L for details).
46             Note that different backends support different result fields.
47             All backends are required to support title and url.
48              
49             =head1 SEE ALSO
50              
51             For specific search engines, see L
52             (replacing TheEngineName with a particular search engine).
53              
54             For details about the results of a search,
55             see L.
56              
57             =head1 METHODS AND FUNCTIONS FOR SEARCHERS
58              
59             =over
60              
61             =cut
62              
63             #####################################################################
64              
65             package WWW::Search;
66              
67 7     7   219735 use strict qw( vars );
  6         14  
  6         198  
68 7     7   540 use warnings;
  7         19  
  7         193  
69              
70 7     7   39 use Carp ();
  7         15  
  7         128  
71 7     7   3729 use CGI;
  7         119378  
  7         94  
72             # use Data::Dumper; # for debugging only
73 7     7   479 use Exporter;
  7         18  
  7         302  
74 7     7   308 use File::Find;
  6         12  
  6         315  
75 7     7   2772 use File::Spec::Functions;
  7         3827  
  7         462  
76 6     6   3488 use HTML::TreeBuilder;
  6         148185  
  6         69  
77 6     6   3456 use HTTP::Cookies;
  6         50193  
  6         223  
78 6     6   1918 use HTTP::Request;
  6         92716  
  6         180  
79 6     6   2323 use HTTP::Response;
  6         29058  
  6         164  
80 6     6   42 use HTTP::Status;
  6         13  
  6         1148  
81 6     6   2129 use LWP::MemberMixin;
  6         90  
  6         157  
82 6     6   2060 use LWP::RobotUA;
  6         72437  
  6         202  
83 6     6   58 use LWP::UserAgent;
  6         13  
  6         103  
84             # use Net::Domain qw( hostfqdn );
85 6     6   27 use URI;
  6         13  
  6         113  
86 6     6   27 use URI::Escape;
  6         17  
  6         326  
87             # use User;
88              
89             # Internal states:
90 6     6   41 use constant SEARCH_BEFORE => 1;
  6         13  
  6         368  
91 6     6   33 use constant SEARCH_UNDERWAY => 2;
  6         11  
  6         211  
92 6     6   29 use constant SEARCH_DONE => 3;
  6         12  
  6         203  
93 6     6   29 use constant SEARCH_RETRIEVING => 4;
  6         12  
  6         221  
94              
95 6     6   30 use vars qw( @ISA @EXPORT @EXPORT_OK );
  6         12  
  6         1886  
96             @EXPORT = qw();
97             @EXPORT_OK = qw( escape_query unescape_query generic_option strip_tags );
98             @ISA = qw(Exporter LWP::MemberMixin);
99             our
100             $MAINTAINER = 'Martin Thurn ';
101             our
102             $VERSION = do { my @r = (q$Revision: 2.565 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r };
103              
104             =item new
105              
106             To create a new WWW::Search, call
107              
108             $oSearch = new WWW::Search('SearchEngineName');
109              
110             where SearchEngineName is replaced with a particular search engine.
111             For example:
112              
113             $oSearch = new WWW::Search('Yahoo');
114              
115             If no search engine is specified, a default (currently 'Null::Empty')
116             will be chosen for you.
117              
118             =cut
119              
120             sub new
121             {
122 11     11 1 5454 my $class = shift;
123 11         26 my $engine = shift;
124             # Remaining arguments will become hash args
125              
126             # The default backend (not currently more configurable :-< )
127 11         47 my $default_engine = 'Null::Empty';
128 11         38 my $default_agent_name = "$class/$VERSION";
129 11         22 my $default_agent_email = '';
130 11 100       42 $engine = $default_engine if (!defined($engine));
131             # Load the engine, if necessary.
132 11         26 my $subclass = "${class}::$engine";
133 11         31 my $sEval = "use $subclass;";
134 4     4   611 eval $sEval;
  4     2   11  
  4         167  
  2         15  
  2         5  
  2         58  
  11         775  
135 11 100       276 Carp::croak("can not load backend $engine ($@)") if ($@);
136 10         125 my $self = bless {
137             engine => $engine,
138             maximum_to_retrieve => 500, # both pages and hits
139             interrequest_delay => 0.25, # in seconds
140             agent_name => $default_agent_name,
141             agent_email => $default_agent_email,
142             env_proxy => 0,
143             http_method => 'GET',
144             http_proxy => undef,
145             http_proxy_user => undef,
146             http_proxy_pwd => undef,
147             timeout => 60,
148             _debug => 0,
149             _parse_debug => 0,
150             search_from_file => undef,
151             search_to_file => undef,
152             search_to_file_index => 0,
153             @_,
154             # variable initialization goes here
155             }, $subclass;
156 10         66 $self->reset_search();
157 10         38 return $self;
158             } # new
159              
160             =item version
161              
162             Returns the value of the $VERSION variable of the backend engine, or
163             $WWW::Search::VERSION if the backend does not contain $VERSION.
164              
165             =cut
166              
167             sub version
168             {
169 2     2 1 21 my $self = shift;
170 2         78 my $iVersion = eval '$'.ref($self).'::VERSION';
171             # print STDERR " + iVersion = >>>$iVersion<<<\n";
172 2   66     13 $iVersion ||= $VERSION;
173 2         10 return $iVersion;
174             } # version
175              
176             =item maintainer
177              
178             Returns the value of the $MAINTAINER variable of the backend engine,
179             or $WWW::Search::MAINTAINER if the backend does not contain
180             $MAINTAINER.
181              
182             =cut
183              
184             sub maintainer
185             {
186 2     2 1 5 my $self = shift;
187 2         81 my $sMaintainer = eval '$'.ref($self).'::MAINTAINER';
188             # print STDERR " + sMaintainer = >>>$sMaintainer<<<\n";
189 2   33     9 $sMaintainer ||= $MAINTAINER;
190 2         10 return $sMaintainer;
191             } # maintainer
192              
193              
194             =item installed_engines
195              
196             Returns a list of the names of all installed backends.
197             We can not tell if they are up-to-date or working, though.
198              
199             use WWW::Search;
200             my @asEngines = sort &WWW::Search::installed_engines();
201             local $" = ', ';
202             print (" + These WWW::Search backends are installed: @asEngines\n");
203             # Choose a backend at random (yes, this is rather silly):
204             my $oSearch = WWW::Search->new($asEngines[rand(scalar(@asEngines))]);
205              
206             =cut
207              
208 6     6   39 use constant DEBUG_ARC => 0;
  6         12  
  6         276  
209 6     6   32 use constant DEBUG_COOKIES => 0;
  6         12  
  6         214  
210 6     6   28 use constant DEBUG_FIND => 0;
  6         12  
  6         235  
211 6     6   30 use constant DEBUG_FUNC => 0;
  6         10  
  6         199  
212 6     6   29 use constant DEBUG_RETR => 0;
  6         11  
  6         24499  
213              
214             sub _wanted
215             {
216             # Code adapted from the following netnews post (Thank you, Tom!):
217             # From: Tom Christiansen (tchrist@mox.perl.com)
218             # Subject: SRC: pminst - find modules whose names match this pattern
219             # Newsgroups: comp.lang.perl.misc
220             # Date: 1999/02/15
221 20     20   30 my $startdir = shift;
222 20         26 my $sFullPath = $File::Find::name;
223 20         30 print STDERR " + _wanted($startdir, $sFullPath)\n" if DEBUG_FIND;
224 20 50 66     144 if (-d && /^[a-z]/)
225             {
226             # This is so we don't go down site_perl etc too early (whatever
227             # that means):
228 0         0 $File::Find::prune = 1;
229 0         0 DEBUG_FIND && print STDERR " + prune\n";
230 0         0 return;
231             } # if
232 20 100       72 unless ($sFullPath =~ s!\.pm\Z!!)
233             {
234 4         7 DEBUG_FIND && print STDERR " + not .pm\n";
235 4         218 return;
236             } # unless
237             # Delete absolute path off front of file path:
238 16         92 $sFullPath =~ s{^\Q$startdir\E[\\/]}{};
239 16         28 unless (1 || ($sFullPath =~ s!\AWWW/Search!!))
240             {
241             print STDERR " + not WWW/Search\n" if DEBUG_FIND;
242             return;
243             } # unless
244 16         21 print STDERR " + found $sFullPath\n" if DEBUG_FIND;
245 16         27 $sFullPath =~ s{/}{::}g;
246 16         24 $sFullPath =~ s!\A::!!;
247 16         201 return $sFullPath;
248             } # _wanted
249              
250             sub installed_engines
251             {
252             # Does NOT need a WWW::Search object to operate
253 1     1 1 878 my %hsi;
254 1         3 local $" = '|';
255 1         2 DEBUG_FIND && print STDERR " + installed_engines() start, INC is @INC...\n";
256             INC_DIR:
257 1         22 foreach my $sDir (map catdir($_, 'WWW', 'Search'), @INC)
258             {
259 10         14 DEBUG_FIND && print STDERR " + foreach ==$sDir==\n";
260 10 100       124 next INC_DIR unless -d $sDir;
261             File::Find::find(sub {
262 20   100 20   51 $hsi{&_wanted($sDir) || 'JUNKJUNK'}++;
263 2         157 }, $sDir);
264             } # foreach INC_DIR
265 1         4 delete $hsi{'JUNKJUNK'};
266 1         2 delete $hsi{'Test'};
267 1         2 delete $hsi{'Result'};
268 1         8 return keys %hsi;
269             } # installed_engines
270              
271              
272             =item native_query
273              
274             Specify a query (and optional options) to the current search object.
275             Previous query (if any) and its cached results (if any) will be thrown away.
276             The option values and the query must be escaped;
277             call L to escape a string.
278             The search process is not actually begun until C or
279             C is called (lazy!), so native_query does not return anything.
280              
281             Example:
282              
283             $oSearch->native_query('search-engine-specific+escaped+query+string',
284             { option1 => 'able', option2 => 'baker' } );
285              
286             The hash of options following the query string is optional.
287             The query string is backend-specific.
288             There are two kinds of options:
289             options specific to the backend,
290             and generic options applicable to multiple backends.
291              
292             Generic options all begin with 'search_'.
293             Currently a few are supported:
294              
295             =over 4
296              
297             =item search_url
298              
299             Specifies the base URL for the search engine.
300              
301             =item search_debug
302              
303             Enables backend debugging. The default is 0 (no debugging).
304              
305             =item search_parse_debug
306              
307             Enables backend parser debugging. The default is 0 (no debugging).
308              
309             =item search_to_file FILE
310              
311             Causes the search results to be saved in a set of files
312             prefixed by FILE.
313             (Used internally by the test-suite, not intended for general use.)
314              
315             =item search_from_file FILE
316              
317             Reads a search from a set of files prefixed by FILE.
318             (Used internally by the test-suite, not intended for general use.)
319              
320             =back
321              
322             Some backends may not implement these generic options,
323             but any which do implement them must provide these semantics.
324              
325             Backend-specific options are described
326             in the documentation for each backend.
327             In most cases the options and their values are packed together to create the query portion of
328             the final URL.
329              
330             Details about how the search string and option hash are interpreted
331             might be found in the search-engine-specific manual pages
332             (WWW::Search::SearchEngineName).
333              
334             =cut
335              
336             sub native_query
337             {
338 7     7 1 374 my $self = shift;
339 7 100       30 print STDERR " FFF native_query($_[0])\n" if (DEBUG_FUNC || $self->{_debug});
340 7         30 $self->reset_search();
341 7         18 $self->{'native_query'} = $_[0];
342 7         15 $self->{'native_options'} = $_[1];
343             # promote generic options
344 7         15 my $opts_ref = $_[1];
345 7         26 foreach my $sKey (keys %$opts_ref)
346             {
347 4 100       9 if (generic_option($sKey))
348             {
349 2 100       10 print STDERR " + promoting $sKey to $self\n" if $self->{_debug};
350 2         10 $self->{$sKey} = $opts_ref->{$sKey};
351             # delete $opts_ref->{$sKey};
352             } # if
353             } # foreach
354 7 50       45 $self->setup_search() if ($self->{state} == SEARCH_BEFORE);
355             } # native_query
356              
357              
358             =item gui_query
359              
360             Specify a query to the current search object;
361             the query will be performed with the engine's default options,
362             as if it were typed by a user in a browser window.
363              
364             Same arguments as C above.
365              
366             Currently, this feature is supported by only a few backends;
367             consult the documentation for each backend to see if it is implemented.
368              
369             =cut
370              
371             sub gui_query
372             {
373             # This function is a stub to prevent runtime errors. This function
374             # should be defined in each backend as appropriate. See Yahoo.pm in
375             # the WWW-Search-Yahoo distribution for an example of how to
376             # implement it.
377 2     2 1 111 my $self = shift;
378 2         10 return $self->native_query(@_);
379             } # gui_query
380              
381              
382             =item cookie_jar
383              
384             Call this method (anytime before asking for results) if you want to
385             communicate cookie data with the search engine. Takes one argument,
386             either a filename or an HTTP::Cookies object. If you give a filename,
387             WWW::Search will attempt to read/store cookies there (by passing the
388             filename to HTTP::Cookies::new).
389              
390             $oSearch->cookie_jar('/tmp/my_cookies');
391              
392             If you give an HTTP::Cookies object, it is up to you to save the
393             cookies if/when you wish.
394              
395             use HTTP::Cookies;
396             my $oJar = HTTP::Cookies->new(...);
397             $oSearch->cookie_jar($oJar);
398              
399             If you pass in no arguments, the cookie jar (if any) is returned.
400              
401             my $oJar = $oSearch->cookie_jar;
402             unless (ref $oJar) { print "No jar" };
403              
404             =cut
405              
406             sub cookie_jar
407             {
408 4     4 1 254 my $self = shift;
409 4 100       11 if (! @_)
410             {
411 1         4 return $self->{'_cookie_jar'};
412             } # if no arguments
413 3         5 my $arg = shift;
414 3         7 my $sRef = ref($arg);
415 3 100       12 if ($sRef =~ m!\AHTTP::Cookies!)
    100          
416             {
417 1         3 print STDERR " + WWW::Search using caller's $sRef object\n" if DEBUG_COOKIES;
418 1         2 $self->{'_cookie_jar'} = $arg;
419 1         3 $self->{'_cookie_jar_we_save'} = 0;
420             } # if
421             elsif (! ref($arg))
422             {
423             # Assume that $arg is a file name:
424 1         2 print STDERR " + WWW::Search using Cookies from file $arg\n" if DEBUG_COOKIES;
425 1         11 $self->{'_cookie_jar'} = HTTP::Cookies->new(
426             'file' => $arg,
427             'autosave' => 1,
428             'ignore_discard' => 1,
429             );
430 1         84 $self->{'_cookie_jar'}->load;
431 1         19 $self->{'_cookie_jar_we_save'} = 1;
432             # print STDERR " + WWW::Search just loaded cookies from $arg\n";
433             }
434             else
435             {
436 1         175 Carp::carp "argument to WWW::Search::cookie_jar() must be a scalar or a flavor of HTTP::Cookies";
437             }
438             } # cookie_jar
439              
440              
441             =item date_from
442              
443             Set/get the start date for limiting the query by a date range. See
444             the documentation for each backend to find out if date ranges are
445             supported.
446              
447             =item date_to
448              
449             Set/get the end date for limiting the query by a date range. See the
450             documentation for each backend to find out if date ranges are
451             supported.
452              
453             =cut
454              
455             sub date_from
456             {
457 3   100 3 1 438 return shift->_elem('date_from', @_) || '';
458             } # date_from
459              
460             sub date_to
461             {
462 3   100 3 1 21 return shift->_elem('date_to', @_) || '';
463             } # date_to
464              
465              
466             =item env_proxy
467              
468             Enable loading proxy settings from environment variables.
469             The proxy URL will be read from $ENV{http_proxy}.
470             The username for authentication will be read from $ENV{http_proxy_user}.
471             The password for authentication will be read from $ENV{http_proxy_pwd}.
472              
473             If you don't want to put passwords in the environment, one solution
474             would be to subclass LWP::UserAgent and use $ENV{WWW_SEARCH_USERAGENT}
475             instead (see user_agent below).
476              
477             env_proxy() must be called before the first retrieval is attempted.
478              
479             Example:
480              
481             $ENV{http_proxy } = 'http://my.proxy.com:80';
482             $ENV{http_proxy_user} = 'bugsbun';
483             $ENV{http_proxy_pwd } = 'c4rr0t5';
484             $oSearch->env_proxy('yes'); # Turn on with any true value
485             ...
486             $oSearch->env_proxy(0); # Turn off with zero
487             ...
488             if ($oSearch->env_proxy) # Test
489              
490             =cut
491              
492             # contributed by Klaus Johannes Rusch
493             sub env_proxy
494             {
495 7     7 1 35 return shift->_elem('env_proxy', @_);
496             } # env_proxy
497              
498              
499             =item http_proxy
500              
501             Set up an HTTP proxy (for connections from behind a firewall).
502              
503             Takes the same arguments as LWP::UserAgent::proxy().
504              
505             This routine should be called before calling any of the result
506             functions (any method with "result" in its name).
507              
508             Example:
509              
510             # Turn on and set address:
511             $oSearch->http_proxy(['http','ftp'] => 'http://proxy:8080');
512             # Turn off:
513             $oSearch->http_proxy('');
514              
515             =cut
516              
517             sub http_proxy
518             {
519 17     17 1 1198 my $self = shift;
520             # Make a copy of our arguments:
521 17 100       40 if (@_)
522             {
523 3         7 my @a = @_;
524 3         12 return $self->_elem('http_proxy', \@a);
525             }
526             else
527             {
528 14         46 return $self->_elem('http_proxy');
529             }
530             } # http_proxy
531              
532              
533             =item http_proxy_user, http_proxy_pwd
534              
535             Set/get HTTP proxy authentication data.
536              
537             These routines set/get username and password used in proxy
538             authentication.
539             Authentication is attempted only if all three items (proxy URL, username
540             and password) have been set.
541              
542             Example:
543              
544             $oSearch->http_proxy_user("myuser");
545             $oSearch->http_proxy_pwd("mypassword");
546             $oSearch->http_proxy_user(undef); # Example for no authentication
547              
548             $username = $oSearch->http_proxy_user();
549              
550             =cut
551              
552             sub http_proxy_user
553             {
554 9     9 1 1104 return shift->_elem('http_proxy_user', @_);
555             }
556              
557             sub http_proxy_pwd
558             {
559 7     7 1 1073 return shift->_elem('http_proxy_pwd', @_);
560             }
561              
562              
563             =item maximum_to_retrieve
564              
565             Set the maximum number of hits to return.
566             Queries resulting in more than this many hits will return
567             the first hits, up to this limit.
568             Although this specifies a maximum limit,
569             search engines might return less than this number.
570              
571             Defaults to 500.
572              
573             Example:
574             $max = $oSearch->maximum_to_retrieve(100);
575              
576             You can also spell this method "maximum_to_return".
577              
578             =cut
579              
580             sub maximum_to_retrieve
581             {
582 1     1 1 5 return shift->_elem('maximum_to_retrieve', @_);
583             }
584              
585              
586             =item maximum_to_return
587              
588             Synonym for maximum_to_retrieve
589              
590             =cut
591              
592             sub maximum_to_return
593             {
594 1     1 1 5 return shift->_elem('maximum_to_retrieve', @_);
595             }
596              
597              
598             =item timeout
599              
600             The maximum length of time any portion of the query should take,
601             in seconds.
602              
603             Defaults to 60.
604              
605             Example:
606             $oSearch->timeout(120);
607              
608             =cut
609              
610             sub timeout
611             {
612 3     3 1 611 return shift->_elem('timeout', @_);
613             }
614              
615              
616             =item login
617              
618             Backends which need to login to the search engine should implement
619             this function. Takes two arguments, user and password. Return
620             nonzero if login was successful. Return undef or 0 if login failed.
621              
622             =cut
623              
624             sub login
625             {
626 1     1 1 3 my $self = shift;
627             # Here is just a stub.
628 1         6 return 1;
629             # These are the arguments:
630 0         0 my ($sUser, $sPassword) = @_;
631             } # login
632              
633             =item logout
634              
635             Backends which need to logout from the search engine should implement
636             this function.
637              
638             =cut
639              
640             sub logout
641             {
642 1     1 1 6 my $self = shift; # no other args
643             # Here is just a stub.
644             } # logout
645              
646              
647             =item approximate_result_count
648              
649             Some backends indicate how many results they have found.
650             Typically this is an approximate value.
651              
652             =cut
653              
654             sub approximate_result_count
655             {
656 11     11 1 2030 my $self = shift;
657             # Optional arg1 = new value for this option.
658 11         20 my $iArg = shift;
659 11 100 100     54 $iArg = undef if (defined($iArg) && ($iArg eq ''));
660 11         15 DEBUG_FUNC && print STDERR " FFF a_r_c(state=$self->{state},iArg=$iArg)\n";
661 11         18 DEBUG_ARC && print STDERR " + a_r_c(state=$self->{state},iArg=$iArg)\n";
662 11 100 100     38 if (defined($iArg) && (0 <= $iArg))
663             {
664             # Caller is trying to SET the value:
665 3         5 DEBUG_ARC && print STDERR " + a_r_cSET(state=$self->{state},iArg=$iArg)\n";
666 3         6 $self->{'approx_count'} = $iArg;
667 3         14 return $iArg;
668             } # if
669 8 100 66     85 if (
670             # This prevents infinite recursion, for when retrieve_some()
671             # calls this function in order to SET the value:
672             ($self->{state} ne SEARCH_RETRIEVING)
673             &&
674             # This prevents useless repeat calls to retrieve_some() after
675             # the search has been completed:
676             ($self->{state} ne SEARCH_DONE))
677             {
678 1         2 DEBUG_ARC && print STDERR " DDD a_r_c prime the pump\n";
679             # Prime the pump, if necessary:
680 1         6 $self->retrieve_some();
681             } # if
682 8   100     32 $iArg = $self->{'approx_count'} || 0;
683             # print STDERR " + a_r_cGET(state=$self->{state},answer=$iArg)\n";
684 8         40 return $iArg;
685             } # approximate_result_count
686              
687             =item approximate_hit_count
688              
689             This is an alias for approximate_result_count().
690              
691             =cut
692              
693             sub approximate_hit_count
694             {
695 2     2 1 8 shift->approximate_result_count(@_);
696             } # approximate_hit_count
697              
698              
699             =item results
700              
701             Return all the results of a query as an array of WWW::SearchResult
702             objects.
703              
704             Note: This might take a while, because a web backend will keep asking
705             the search engine for "next page of results" over and over until there
706             are no more next pages, and THEN return from this function.
707              
708             If an error occurs at any time during query processing, it will be
709             indicated in the response().
710              
711             Example:
712              
713             @results = $oSearch->results();
714             # Go have a cup of coffee while the previous line executes...
715             foreach $oResult (@results)
716             {
717             print $oResult->url(), "\n";
718             } # foreach
719              
720             =cut
721              
722             sub results
723             {
724 6     6 1 388 my $self = shift;
725 6 50       23 print STDERR " FFF results(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug});
726             # Put all the search results into the cache:
727 6         31 1 while ($self->retrieve_some());
728 6   50     25 $self->{cache} ||= [];
729 6         16 my $iMax = scalar(@{$self->{cache}});
  6         14  
730             # print STDERR " + mtr is ", $self->{maximum_to_retrieve}, "\n" if $self->{_debug};
731             # print STDERR " + cache contains $iMax results\n" if $self->{_debug};
732 6 50       23 $iMax = $self->{maximum_to_retrieve} if ($self->{maximum_to_retrieve} < $iMax);
733             # print STDERR " + returning $iMax results\n" if $self->{_debug};
734 6         22 return @{$self->{cache}}[0..$iMax-1];
  6         149  
735             } # results
736              
737             =item next_result
738              
739             Call this method repeatedly to return each result of a query as a
740             WWW::SearchResult object. Example:
741              
742             while ($oResult = $oSearch->next_result())
743             {
744             print $oResult->url(), "\n";
745             } # while
746              
747             When there are no more results, or if an error occurs, next_result()
748             will return undef.
749              
750             If an error occurs at any time during query processing, it will be
751             indicated in the response().
752              
753             =cut
754              
755             sub next_result
756             {
757 46     46 1 69 my $self = shift;
758             # Carp::croak "search not yet specified" if (!defined($self->{'native_query'}));
759 46 100       87 return undef if ($self->{next_to_return} >= $self->{maximum_to_retrieve});
760 45         57 while (1)
761             {
762 45 100       61 if ($self->{next_to_return} <= $#{$self->{cache}})
  45         92  
763             {
764             # The cache already contains the desired element; return it:
765 44         58 my $i = ($self->{next_to_return})++;
766 44         82 return $self->{cache}->[$i];
767             } # if
768             # If we get here, then the desired element is beyond the end of
769             # the cache.
770 1 50       4 if ($self->{state} == SEARCH_DONE)
771             {
772             # There are no more results to be gotten; fail & bail:
773 1         2 return undef;
774             } # if
775             # Get some more results into the cache:
776 0         0 $self->retrieve_some();
777             # Go back and try again:
778             } # while infinite
779             } # next_result
780              
781              
782             =item seek_result($offset)
783              
784             Set which result should be returned next time
785             C is called. Results are zero-indexed.
786              
787             The only guaranteed valid offset is 0,
788             which will replay the results from the beginning.
789             In particular, seeking past the end of the current cached
790             results probably will not do what you might think it should.
791              
792             Results are cached, so this does not re-issue the query
793             or cause IO (unless you go off the end of the results).
794             To re-do the query, create a new search object.
795              
796             Example:
797              
798             $oSearch->seek_result(0);
799              
800             =cut
801              
802             sub seek_result
803             {
804 3     3 1 582 my ($self, $desired) = @_;
805 3         7 my $old = $self->{next_to_return};
806 3 100 100     13 $self->{next_to_return} = $desired if (defined($desired) and (0 <= $desired));
807 3         7 return $old;
808             } # seek_result
809              
810              
811             =item response
812              
813             Returns an L object which resulted from the
814             most-recently-sent query. Errors can be detected like this:
815              
816             if (! $oSearch->response->is_success)
817             {
818             print STDERR "Error: " . $oSearch->response->as_string() . "\n";
819             } # if
820              
821             Note to backend authors: even if the backend does not involve the web,
822             it should return an L object.
823              
824             =cut
825              
826             sub response
827             {
828 7     7 1 22 my $self = shift;
829 7   33     24 $self->{response} ||= new HTTP::Response(RC_OK);
830 7         27 return $self->{response};
831             } # response
832              
833              
834             =item submit
835              
836             This method can be used to submit URLs to the search engines for indexing.
837             Consult the documentation for each backend to find out if it is implemented there,
838             and if so what the arguments are.
839              
840             Returns an HTTP::Response object describing the result of the submission request.
841             Consult the documentation for each backend to find out the meaning of the response.
842              
843             =cut
844              
845             sub submit
846             {
847 1     1 1 9 return new HTTP::Response(788, 'Sorry, this backend does not support the submit() method.');
848             } # submit
849              
850              
851             =item opaque
852              
853             This function provides an application a place to store
854             one opaque data element (or many, via a Perl reference).
855             This facility is useful to (for example),
856             maintain client-specific information in each active query
857             when you have multiple concurrent queries.
858              
859             =cut
860              
861 3     3 1 896 sub opaque { return shift->_elem('opaque', @_); }
862              
863              
864             =item escape_query
865              
866             Escape a query.
867             Before queries are sent to the internet, special characters must be escaped
868             so that a proper URL can be formed.
869             This is like escaping a URL,
870             but all non-alphanumeric characters are escaped and
871             and spaces are converted to "+"s.
872              
873             Example:
874              
875             $escaped = WWW::Search::escape_query('+hi +mom');
876             # $escaped is now '%2Bhi+%2Bmom'
877              
878             See also C.
879             NOTE that this is not a method, it is a plain function.
880              
881             =cut
882              
883             sub escape_query
884             {
885 2     2 1 14 my $text = join(' ', @_);
886 2   100     12 $text ||= '';
887             # print STDERR " + escape_query($text)\n";
888 2         15 $text =~ s/([^ A-Za-z0-9])/$URI::Escape::escapes{$1}/g; #"
889             # print STDERR " + escape_query($text)\n";
890 2         7 $text =~ s/ /+/g;
891             # print STDERR " + escape_query($text)\n";
892 2         12 return $text;
893             } # escape_query
894              
895             =item unescape_query
896              
897             Unescape a query.
898             See C for details.
899              
900             Example:
901              
902             $unescaped = WWW::Search::unescape_query('%22hi+mom%22');
903             # $unescaped eq q{"hi mom"}
904              
905             NOTE that this is not a method, it is a plain function.
906              
907             =cut
908              
909             sub unescape_query
910             {
911             # code stolen from URI::Escape.pm.
912 2     2 1 10 my @copy = @_;
913 2         7 for (@copy)
914             {
915 4         11 s!\+! !g;
916 4         12 s!\%([\dA-Fa-f]{2})!chr(hex($1))!eg;
  2         9  
917             } # for
918 2 100       12 return wantarray ? @copy : $copy[0];
919             } # unescape_query
920              
921             =item strip_tags
922              
923             Given a string, returns a copy of that string with HTML tags removed.
924             This should be used by each backend as they insert the title and
925             description values into the search results objects.
926              
927             NOTE that this is not a method, it is a plain function.
928              
929             =cut
930              
931             sub strip_tags
932             {
933             # Prevent undef warnings if we get passed any undefined values:
934 2     2 1 577 my @args = @_;
935 2   100     7 @args = map { $_ ||= '' } @args;
  5         23  
936 2         8 my $s = join('', @args);
937             # Special case: change BR to space:
938 2         8 $s =~ s!
! !gi;
939             # We assume for now that we will not be encountering tags with
940             # embedded '>' characters!
941 2         10 $s =~ s/\074.+?\076//g;
942 2         7 $s =~ s/ / /g;
943 2         4 $s =~ s/"/\042/g;
944 2         5 $s =~ s/&/\046/g;
945 2         6 $s =~ s/</\074/g;
946 2         4 $s =~ s/>/\076/g;
947             # Coalesce multiple spaces:
948 2         6 $s =~ tr!\040\t\r\n! !s;
949             # Delete leading & trailing spaces:
950 2         6 $s =~ s!\A\s+!!;
951 2         7 $s =~ s!\s+\Z!!;
952 2         7 return $s;
953             } # strip_tags
954              
955             =item is_http_proxy
956              
957             Returns true if proxy information is available.
958              
959             =cut
960              
961             sub is_http_proxy
962             {
963 12     12 1 24 my $self = shift;
964 12         32 my $ra = $self->http_proxy;
965 12   100     150 my $ret = (
966             ('ARRAY' eq ref($ra))
967             &&
968             defined($ra->[0])
969             &&
970             ($ra->[0] ne '')
971             );
972             # print STDERR " DDD is_http_proxy() return =$ret=\n";
973 12         52 return $ret;
974             } # is_http_proxy
975              
976             =back
977              
978             =head1 METHODS AND FUNCTIONS FOR BACKEND PROGRAMMERS
979              
980             =over
981              
982             =item reset_search
983              
984             Resets internal data structures to start over with a new search (on
985             the same engine).
986              
987             =cut
988              
989             sub reset_search
990             {
991 17     17 1 27 my $self = shift;
992 17 100       99 print STDERR " FFF reset_search(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug});
993 17         48 $self->{'cache'} = [];
994 17         401 $self->{'native_query'} = '';
995 17         35 $self->{'next_to_retrieve'} = 1;
996 17         30 $self->{'next_to_return'} = 0;
997 17         36 $self->{'number_retrieved'} = 0;
998 17         30 $self->{'requests_made'} = 0;
999 17         33 $self->{'state'} = SEARCH_BEFORE;
1000 17         34 $self->{'_next_url'} = '';
1001 17         35 $self->{'approx_count'} = 0;
1002             # This method is called by native_query(). native_query() is called
1003             # either by gui_query() or by the user. In the case that
1004             # gui_query() was called, we do NOT want to clear out the _options
1005             # hash. For now, I implement a pretty ugly hack to make this work:
1006 17 100       48 if (caller(2))
1007             {
1008 2         16 my @as = caller(2);
1009 2 50       8 if (1 < scalar(@as))
1010             {
1011             # print STDERR " in reset_search(), as is (", join(',', @as), ")\n";
1012 2 50       90 return if $as[3] =~ m/gui_query/;
1013             } # if
1014             } # if
1015 15         33 $self->{_options} = ();
1016             } # reset_search
1017              
1018              
1019             =item is_http_proxy_auth_data
1020              
1021             Returns true if all authentication data
1022             (proxy URL, username, and password) are available.
1023              
1024             =cut
1025              
1026             sub is_http_proxy_auth_data
1027             {
1028 5     5 1 11 my $self = shift;
1029             # print STDERR (" DDD http_proxy is ", Dumper(\$self));
1030 5   66     12 my $ret = (
1031             $self->is_http_proxy
1032             &&
1033             defined($self->http_proxy_user)
1034             &&
1035             ($self->http_proxy_user ne '')
1036             &&
1037             defined($self->http_proxy_pwd)
1038             &&
1039             ($self->http_proxy_pwd ne '')
1040             );
1041             # print STDERR " DDD is_http_proxy_auth_data() return =$ret=\n";
1042 5         47 return $ret;
1043             } # is_http_proxy_auth_data
1044              
1045              
1046             =item agent_name($sName)
1047              
1048             If your search engine rejects certain browser,
1049             you can trick it into thinking you're any browser type you want.
1050             See below under user_agent().
1051              
1052             =cut
1053              
1054             sub agent_name
1055             {
1056 7     7 1 962 return shift->_elem('agent_name', @_);
1057             }
1058              
1059             =item agent_email($sName)
1060              
1061             =cut
1062              
1063             sub agent_email
1064             {
1065 7     7 1 1513 return shift->_elem('agent_email', @_);
1066             }
1067              
1068             =item user_agent($NON_ROBOT)
1069              
1070             This internal routine creates a user-agent for derived classes that
1071             query the web. If any non-false argument is given, a normal
1072             LWP::UserAgent (rather than a LWP::RobotUA) is used.
1073              
1074             Returns the user-agent object.
1075              
1076             If a backend needs the low-level LWP::UserAgent or LWP::RobotUA to
1077             have a particular name, $oSearch->agent_name() and possibly
1078             $oSearch->agent_email() should be called to set the desired values
1079             *before* calling $oSearch->user_agent().
1080              
1081             If the environment variable WWW_SEARCH_USERAGENT has a value, it will
1082             be used as the class for a new user agent object. This class should
1083             be a subclass of LWP::UserAgent. For example,
1084              
1085             $ENV{WWW_SEARCH_USERAGENT} = 'My::Own::UserAgent';
1086             # If this env.var. has no value,
1087             # LWP::UserAgent or LWP::RobotUA will be used.
1088             $oSearch = new WWW::Search('MyBackend');
1089             $oSearch->agent_name('MySpider');
1090             if ($iBackendWebsiteRequiresNonRobot)
1091             {
1092             $oSearch->user_agent('non-robot');
1093             }
1094             else
1095             {
1096             $oSearch->agent_email('me@here.com');
1097             $oSearch->user_agent();
1098             }
1099              
1100             Backends should use robot-style user-agents whenever possible.
1101              
1102             =cut
1103              
1104             sub _load_env_useragent
1105             {
1106 4     4   8 my $self = shift;
1107 4   100     19 my $sUA = $ENV{'WWW_SEARCH_USERAGENT'} || '';
1108 4         8 my $ua;
1109 4 100       18 if ($sUA ne '')
1110             {
1111 2         121 eval "use $sUA";
1112             # print STDERR " DDD this is after eval use $sUA\n";
1113 2 100       10 if (! $@)
1114             {
1115             # Successfully loaded module.
1116 1         3 eval { $ua = $sUA->new };
  1         15  
1117             # print STDERR " DDD this is after eval new $sUA\n";
1118 1 50 33     6 if (ref($ua) && ! $@)
1119             {
1120             # Successfully created object.
1121 0         0 return $ua;
1122             }
1123             else
1124             {
1125 1         10 warn " --- WWW::Search::user_agent can not create $sUA object: $@\n";
1126             # Fall through and try the other methods:
1127             }
1128             }
1129             else
1130             {
1131 1         14 warn " --- WWW::Search::user_agent can not load $sUA: $@\n";
1132             # Fall through and try the other methods:
1133             }
1134             } # if found WWW_SEARCH_USERAGENT in environment
1135             } # _load_env_useragent
1136              
1137             sub user_agent
1138             {
1139 8     8 1 1657 my $self = shift;
1140 8 100       25 unless (@_)
1141             {
1142             # If NO ARGS, return the previously-created agent (if any):
1143 4 50       21 return $self->{'user_agent'} if ref($self->{'user_agent'});
1144             } # unless
1145 4   50     14 my $non_robot = shift || 0;
1146 4         13 my $ua = _load_env_useragent();
1147             # If we loaded a UserAgent, don't do any of this stuff:
1148 4 50       32 if (! ref $ua)
1149             {
1150 4 50       11 if ($non_robot)
1151             {
1152             eval
1153 4         9 {
1154 4         30 $ua = new LWP::UserAgent;
1155 4         2847 $ua->agent($self->agent_name);
1156 4         267 $ua->from($self->agent_email);
1157             }; # end of eval block
1158             }
1159             else
1160             {
1161 0         0 $ua = LWP::RobotUA->new($self->agent_name, $self->agent_email);
1162 0         0 $ua->delay($self->{'interrequest_delay'});
1163             }
1164 4         227 $ua->timeout($self->{'timeout'});
1165 4 100       50 eval { $ua->proxy(@{$self->{'http_proxy'}}) } if $self->is_http_proxy;
  1         2  
  1         7  
1166 4 50       276 if ($self->env_proxy)
1167             {
1168 0         0 $ua->env_proxy($self->env_proxy);
1169             # Read password from ENV:
1170 0         0 $self->http_proxy_user($ENV{http_proxy_user});
1171 0         0 $self->http_proxy_pwd ($ENV{http_proxy_pwd});
1172             } # if
1173             } # if ! ref $ua
1174 4         42 $self->{'user_agent'} = $ua;
1175 4         43 return $ua;
1176             } # user_agent
1177              
1178              
1179             =item http_referer
1180              
1181             Get / set the value of the HTTP_REFERER variable for this search object.
1182             Some search engines might only accept requests that originated at some specific previous page.
1183             This method lets backend authors "fake" the previous page.
1184             Call this method before calling http_request.
1185              
1186             $oSearch->http_referer('http://prev.engine.com/wherever/setup.html');
1187             $oResponse = $oSearch->http_request('GET', $url);
1188              
1189             =cut
1190              
1191             sub http_referer
1192             {
1193 3     3 1 929 return shift->_elem('_http_referer', @_);
1194             }
1195              
1196              
1197             =item http_method
1198              
1199             Get / set the method to be used for the HTTP request.
1200             Must be either 'GET' or 'POST'.
1201             Call this method before calling http_request.
1202             (Normally you would set this during _native_setup_search().)
1203             The default is 'GET'.
1204              
1205             $oSearch->http_method('POST');
1206              
1207             =cut
1208              
1209             sub http_method
1210             {
1211 3     3 1 1515 shift->_elem('http_method', @_);
1212             }
1213              
1214              
1215             =item http_request($method, $url)
1216              
1217             Submit the HTTP request to the world, and return the response.
1218             Similar to LWP::UserAgent::request.
1219             Handles cookies, follows redirects, etc.
1220             Requires that http_referer already be set up, if needed.
1221              
1222             =cut
1223              
1224             sub http_request
1225             {
1226 0     0 1 0 my $self = shift;
1227 0         0 my ($method, $url) = @_;
1228 0         0 my $response;
1229 0 0       0 if (50 <= $self->{_debug})
1230             {
1231 0         0 eval q{ use LWP::Debug qw(+) }; # } # emacs bug
1232             } # if
1233 0 0       0 if ($self->{search_from_file})
1234             {
1235 0         0 $response = $self->_http_request_from_file($url);
1236             }
1237             else
1238             {
1239             # fetch it
1240 0         0 my $request;
1241 0 0       0 if ($method eq 'POST')
1242             {
1243 0         0 my $uri_url = URI->new($url);
1244 0         0 my $equery = $uri_url->query;
1245 0         0 $uri_url->query(undef); # we will handle the query ourselves
1246 0         0 $request = new HTTP::Request($method, $uri_url);
1247 0         0 $request->header('Content-Type', 'application/x-www-form-urlencoded');
1248 0         0 $request->header('Content-Length', length $equery);
1249 0         0 $request->content($equery);
1250             }
1251             else
1252             {
1253 0         0 $request = new HTTP::Request($method, $url);
1254             }
1255 0         0 $request->header('Accept-Charset' => 'iso-8859-1,*,utf-8');
1256 0 0       0 if ($self->is_http_proxy_auth_data)
1257             {
1258 0         0 $request->proxy_authorization_basic($self->http_proxy_user,
1259             $self->http_proxy_pwd);
1260             } # if
1261 0 0       0 $self->{'_cookie_jar'}->add_cookie_header($request) if ref($self->{'_cookie_jar'});
1262              
1263 0 0 0     0 if ($self->{'_http_referer'} && ($self->{'_http_referer'} ne ''))
1264             {
1265 0         0 my $s = uri_escape($self->{'_http_referer'});
1266             # print STDERR " + referer($s), ref(s) = ", ref($s), "\n";
1267 0 0       0 $s = $s->as_string if ref($s) =~ m!URI!;
1268 0         0 $request->referer($s);
1269             } # if referer
1270 0 0       0 print STDERR " DDD raw HTTP::Request is:\n", $request->as_string if (3 <= $self->{_debug});
1271 0         0 my $ua = $self->user_agent();
1272              
1273             TRY_GET:
1274 0         0 while (1)
1275             {
1276 0         0 $response = $ua->request($request);
1277             printf(STDERR " + got HTTP::Response (code=%d):\n%s",
1278             $response->code,
1279 0 0       0 $response->headers->as_string) if (3 <= $self->{_debug});
1280 0 0       0 if (ref($self->{'_cookie_jar'}))
1281             {
1282 0         0 $self->{'_cookie_jar'}->extract_cookies($response);
1283 0 0       0 $self->{'_cookie_jar'}->save if $self->{'_cookie_jar_we_save'};
1284 0         0 print STDERR " + WWW::Search just extracted cookies\n" if DEBUG_COOKIES;
1285 0         0 print STDERR $self->{'_cookie_jar'}->as_string if DEBUG_COOKIES;
1286             # print STDERR Dumper($self->{'_cookie_jar'}) if DEBUG_COOKIES;
1287             } # if
1288 0 0 0     0 if ($self->{'search_to_file'} && $response->is_success)
1289             {
1290 0         0 $self->_http_request_to_file($url, $response);
1291             } # if
1292 0 0       0 last TRY_GET if ($response->is_success);
1293 0 0       0 last TRY_GET if ($response->is_error);
1294 0 0       0 last TRY_GET if ($response->headers->header('Client-Warning') =~ m!redirect loop detected!i);
1295 0 0 0     0 if ($response->is_redirect
1296             ||
1297             # Some engines spoof us with a false 302 code, so look at
1298             # the message rather than the code:
1299             ($response->message =~ m!Object moved!i)
1300             )
1301             {
1302 0         0 my $sURL = $response->request->uri->as_string;
1303 0         0 my $sURLredir = $response->headers->header('Location');
1304             # Low-level loop detection:
1305 0 0       0 last TRY_GET if ($sURLredir eq $sURL);
1306 0 0       0 print STDERR " + 'Object moved' from $sURL to $sURLredir\n" if (2 <= $self->{_debug});
1307             # Follow the redirect:
1308 0         0 $request = new HTTP::Request('GET',
1309             URI->new_abs($sURLredir, $sURL),
1310             );
1311 0         0 $request->referer($sURL);
1312 0 0       0 $self->{'_cookie_jar'}->add_cookie_header($request) if ref($self->{'_cookie_jar'});
1313 0 0       0 print STDERR " + 'Object moved', new HTTP::Request is:\n", $request->as_string if (3 <= $self->{_debug});
1314             # Go back and try again
1315             } # if
1316             } # while infinite
1317             } # if not from_file
1318 0         0 return $response;
1319             } # http_request
1320              
1321             sub _http_request_get_filename {
1322 0     0   0 my $self = shift;
1323 0         0 my $fn;
1324             # filename?
1325 0 0       0 if (!defined($self->{search_filename})) {
1326 0         0 $fn = $self->{search_from_file};
1327 0 0       0 $fn = $self->{search_to_file} if (!defined($fn));
1328 0         0 $self->{search_filename} = WWW::Search::unescape_query($fn);
1329             }
1330 0         0 $fn = $self->{search_filename};
1331 0 0       0 die "$0: bogus filename.\n" if (!defined($fn));
1332 0         0 return $fn;
1333             } # _http_request_get_filename
1334              
1335             sub _http_request_from_file {
1336 0     0   0 my $self = shift;
1337 0         0 my ($url) = @_;
1338              
1339 0         0 my $fn = $self->_http_request_get_filename();
1340              
1341             # read index?
1342 0 0       0 if (!defined($self->{search_from_file_hash})) {
1343 0 0       0 open(TABLE, "<$fn") || die "$0: open $fn failed.\n";
1344 0         0 my $i = 0;
1345 0         0 while () {
1346 0         0 chomp;
1347 0         0 $self->{search_from_file_hash}{$_} = $i;
1348             # print STDERR "$0: file index: $i <$_>\n";
1349 0         0 $i++;
1350             };
1351 0         0 close TABLE;
1352             };
1353             # read file
1354 0         0 my $i = $self->{search_from_file_hash}{$url};
1355 0 0       0 if (defined($i)) {
1356             # print STDERR "$0: saved request <$url> found in $fn.$i\n";
1357             # read the data
1358 0 0       0 open(FILE, "<$fn.$i") || die "$0: open $fn.$i\n";
1359 0         0 my $d = '';
1360 0         0 while () {
1361 0         0 $d .= $_;
1362             };
1363 0         0 close FILE;
1364             # make up the response
1365 0         0 my $r = new HTTP::Response(RC_OK);
1366 0         0 $r->content($d);
1367 0         0 return $r;
1368             } else {
1369 0         0 print STDERR "$0: saved request <$url> not found.\n";
1370 0         0 my $r = new HTTP::Response(RC_NOT_FOUND);
1371 0         0 return $r;
1372             };
1373             } # _http_request_from_file
1374              
1375             sub _http_request_to_file {
1376 0     0   0 my $self = shift;
1377             # The LAST arg is a LWP::Response object:
1378 0         0 my $response = pop;
1379             # The only other arg we care about is the FIRST arg, a url:
1380 0         0 my ($url, ) = @_;
1381              
1382 0         0 my $fn = $self->_http_request_get_filename();
1383              
1384             unlink($fn)
1385 0 0       0 if ($self->{search_to_file_index} == 0);
1386 0 0       0 open(TABLE, ">>$fn") || die "$0: open $fn\n";
1387 0         0 print TABLE "$url\n";
1388 0         0 close TABLE;
1389 0         0 my $i = ($self->{search_to_file_index})++;
1390 0 0       0 open (FILE, ">$fn.$i") || die "$0: open $fn.$i\n";
1391 0         0 print FILE $response->content();
1392 0         0 close FILE;
1393             } # _http_request_to_file
1394              
1395              
1396             =item next_url
1397              
1398             Get or set the URL for the next backend request. This can be used to
1399             save the WWW::Search state between sessions (e.g. if you are showing
1400             pages of results to the user in a web browser). Before closing down a
1401             session, save the value of next_url:
1402              
1403             ...
1404             $oSearch->maximum_to_return(10);
1405             while ($oSearch->next_result) { ... }
1406             my $urlSave = $oSearch->next_url;
1407              
1408             Then, when you start up the next session (e.g. after the user clicks
1409             your "next" button), restore this value before calling for the results:
1410              
1411             $oSearch->native_query(...);
1412             $oSearch->next_url($urlSave);
1413             $oSearch->maximum_to_return(20);
1414             while ($oSearch->next_result) { ... }
1415              
1416             WARNING: It is entirely up to you to keep your interface in sync with
1417             the number of hits per page being returned from the backend. And, we
1418             make no guarantees whether this method will work for any given
1419             backend. (Their caching scheme might not enable you to jump into the
1420             middle of a list of search results, for example.)
1421              
1422             =cut
1423              
1424             sub next_url
1425             {
1426 0     0 1 0 return shift->_elem('_next_url', @_);
1427             }
1428              
1429              
1430             =item split_lines
1431              
1432             This internal routine splits data (typically the result of the web
1433             page retrieval) into lines in a way that is OS independent. If the
1434             first argument is a reference to an array, that array is taken to be a
1435             list of possible delimiters for this split. For example, Yahoo.pm
1436             uses

and

  • as "line" delimiters for convenience.
  • 1437              
    1438             =cut
    1439              
    1440             sub split_lines
    1441             {
    1442             # This probably fails on an EBCDIC box where input is in text mode.
    1443             # Too bad Macs do not just use binmode like Windows boxen.
    1444 2     2 1 722 my $self = shift;
    1445 2         5 my $s = shift;
    1446 2         4 my $patt = '\015?\012';
    1447 2 100       9 if (ref($s))
    1448             {
    1449 1         4 $patt = '('. $patt;
    1450 1         3 foreach (@$s)
    1451             {
    1452 1         3 $patt .= "|$_";
    1453             } # foreach
    1454 1         4 $patt .= ')';
    1455             # print STDERR " + patt is >>>$patt<<<\n";
    1456 1         3 $s = shift;
    1457             } # if
    1458 2         41 return split(/$patt/i, $s);
    1459             # If we require perl 5.005, this can be done by:
    1460             # use Socket qw(:crlf :DEFAULT);
    1461             # split(/$CR?$LF/,$_[0])
    1462             } # split_lines
    1463              
    1464              
    1465             =item generic_option
    1466              
    1467             This internal routine checks if an option
    1468             is generic or backend specific.
    1469             Currently all generic options begin with 'search_'.
    1470             This routine is not a method.
    1471              
    1472             =cut
    1473              
    1474             sub generic_option
    1475             {
    1476 9   50 9 1 34 my $option = shift || '';
    1477 9         31 return ($option =~ /^search_/);
    1478             } # generic_option
    1479              
    1480              
    1481             =item _native_setup_search
    1482              
    1483             Do some backend-specific initialization.
    1484             It will be called with the same arguments as native_query().
    1485              
    1486             =cut
    1487              
    1488             sub _native_setup_search
    1489             {
    1490 0     0   0 my $self = shift;
    1491 0 0       0 print STDERR " FFF _n_s_s\n" if (DEBUG_FUNC || $self->{_debug});
    1492             # Backward-compatibility for backends that define the old
    1493             # native_setup_search(), but not the new _native_setup_search()
    1494 0 0       0 if ($self->can('native_setup_search'))
    1495             {
    1496 0         0 return $self->native_setup_search(@_);
    1497             } # if
    1498             } # _native_setup_search
    1499              
    1500              
    1501             =item setup_search
    1502              
    1503             This internal routine does generic Search setup.
    1504             It calls C<_native_setup_search()> to do backend-specific setup.
    1505              
    1506             =cut
    1507              
    1508             sub setup_search
    1509             {
    1510 9     9 1 22 my ($self) = @_;
    1511 9 100       77 print STDERR " FFF setup_search(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug});
    1512 9         24 $self->{cache} = [];
    1513 9         19 $self->{next_to_retrieve} = 1;
    1514 9         16 $self->{number_retrieved} = 0;
    1515 9         17 $self->{state} = SEARCH_UNDERWAY;
    1516             # $self->{_options} = ();
    1517 9         59 $self->_native_setup_search($self->{'native_query'}, $self->{'native_options'});
    1518             } # setup_search
    1519              
    1520              
    1521             =item need_to_delay
    1522              
    1523             A backend should override this method in order to dictate whether
    1524             user_agent_delay() needs to be called before the next HTTP request is
    1525             sent. Return any perlish true or zero value.
    1526              
    1527             =cut
    1528              
    1529             sub need_to_delay
    1530             {
    1531 1     1 1 11 my $self = shift;
    1532             # This is a NOP stub. Unless the subclass overrides this method,
    1533             # there is no reason to delay.
    1534 1         2 return 0;
    1535             } # need_to_delay
    1536              
    1537              
    1538             =item user_agent_delay
    1539              
    1540             According to what need_to_delay() returns,
    1541             user_agent_delay() will be called between requests to remote
    1542             servers to avoid overloading them with many back-to-back requests.
    1543              
    1544             =cut
    1545              
    1546             sub user_agent_delay
    1547             {
    1548 2     2 1 33 my $self = shift;
    1549             # Sleep for some number of seconds:
    1550 2         500659 select(undef, undef, undef, $self->{interrequest_delay});
    1551             } # user_agent_delay
    1552              
    1553              
    1554             =item absurl
    1555              
    1556             An internal routine to convert a relative URL into a absolute URL. It
    1557             takes two arguments, the 'base' url (usually the search engine CGI
    1558             URL) and the URL to be converted. Returns a URI object.
    1559              
    1560             =cut
    1561              
    1562             sub absurl
    1563             {
    1564 3     3 1 61 my ($self, $base, $url) = @_;
    1565 3   100     26 $base ||= '';
    1566 3   100     13 $url ||= '';
    1567             # print STDERR " + this is WWW::Search::absurl($base,$url)\n" if 1 < $self->{_debug};
    1568 3 100       13 $base = $self->{_prev_url} if ($base eq '');
    1569 3 100 66     18 return '' unless (($url ne '') && ($base ne ''));
    1570 1         13 my $link = URI->new_abs($url, $base);
    1571 1         6015 return $link;
    1572             } # absurl
    1573              
    1574              
    1575             =item retrieve_some
    1576              
    1577             An internal routine to interface with C<_native_retrieve_some()>.
    1578             Checks for overflow.
    1579              
    1580             =cut
    1581              
    1582             sub retrieve_some
    1583             {
    1584 7     7 1 15 my $self = shift;
    1585 7 50       23 print STDERR " FFF retrieve_some(", $self->{'native_query'}, ")\n" if (DEBUG_FUNC || $self->{_debug});
    1586 7 50       23 return undef if ($self->{state} == SEARCH_DONE);
    1587 7 100       30 $self->setup_search() if ($self->{state} == SEARCH_BEFORE);
    1588 7         19 $self->{state} = SEARCH_RETRIEVING;
    1589 7 100       28 if (! $self->{'_allow_empty_query'})
    1590             {
    1591 4 50       14 if (! defined($self->{'native_query'}))
    1592             {
    1593 0         0 $self->{response} = new HTTP::Response(500, "query string is not defined");
    1594 0         0 $self->{state} = SEARCH_DONE;
    1595 0         0 return;
    1596             } # if
    1597 4 100       14 if ($self->{'native_query'} eq '')
    1598             {
    1599 2         13 $self->{response} = new HTTP::Response(500, "query string is empty");
    1600 2         116 $self->{state} = SEARCH_DONE;
    1601 2         7 return;
    1602             } # if
    1603             } # if
    1604             # Got enough already?
    1605 5 50       18 if ($self->{number_retrieved} >= $self->{'maximum_to_retrieve'})
    1606             {
    1607 0 0       0 print STDERR " DDD retrieve_some() got enough already\n" if (DEBUG_RETR || $self->{_debug});
    1608 0         0 $self->{state} = SEARCH_DONE;
    1609 0         0 return;
    1610             } # if
    1611             # Spinning our wheels?
    1612 5 50       20 if ($self->{requests_made} > $self->{'maximum_to_retrieve'})
    1613             {
    1614 0 0       0 print STDERR " DDD retrieve_some() too many requests\n" if (DEBUG_RETR || $self->{_debug});
    1615 0         0 $self->{state} = SEARCH_DONE;
    1616 0         0 return;
    1617             } # if
    1618             # Need more results:
    1619 5   50     25 my $res = $self->_native_retrieve_some() || 0;
    1620 5 50       30 print STDERR " + _native_retrieve_some() returned $res\n" if (DEBUG_RETR || $self->{_debug});
    1621 5         13 $self->{requests_made}++;
    1622 5         14 $self->{number_retrieved} += $res;
    1623 5 50       28 $self->{state} = SEARCH_DONE if ($res == 0);
    1624 5         21 return $res;
    1625             } # retrieve_some
    1626              
    1627              
    1628             sub HTML::TreeBuilder::www_search_reset
    1629             {
    1630             # If a reset() method becomes available in HTML::TreeBuilder, we
    1631             # won't need this any more.
    1632 0     0 0 0 my $self = shift;
    1633 0         0 $self->delete;
    1634             # These 4 lines copied from HTML::TreeBuilder::new
    1635 0         0 $self->{'_head'} = $self->insert_element('head',1);
    1636 0         0 $self->{'_pos'} = undef; # pull it back up
    1637 0         0 $self->{'_body'} = $self->insert_element('body',1);
    1638 0         0 $self->{'_pos'} = undef; # pull it back up again
    1639             } # HTML::TreeBuilder::www_search_reset
    1640              
    1641              
    1642             =item _native_retrieve_some
    1643              
    1644             Fetch the next page of results from the web engine, parse the results,
    1645             and prepare for the next page of results.
    1646              
    1647             If a backend defines this method, it is in total control of the WWW
    1648             fetch, parsing, and preparing for the next page of results. See the
    1649             WWW::Search::AltaVista module for example usage of the
    1650             _native_retrieve_some method.
    1651              
    1652             An easier way to achieve this in a backend is to inherit
    1653             _native_retrieve_some from WWW::Search, and do only the HTML parsing.
    1654             Simply define a method _parse_tree which takes one argument, an
    1655             HTML::TreeBuilder object, and returns an integer, the number of
    1656             results found on this page. See the WWW::Search::Yahoo module for
    1657             example usage of the _parse_tree method.
    1658              
    1659             A backend should, in general, define either _parse_tree() or
    1660             _native_retrieve_some(), but not both.
    1661              
    1662             Additional features of the default _native_retrieve_some method:
    1663              
    1664             Sets $self->{_prev_url} to the URL of the page just retrieved.
    1665              
    1666             Calls $self->preprocess_results_page() on the raw HTML of the page.
    1667              
    1668             Then, parses the page with an HTML::TreeBuilder object and passes that
    1669             populated object to $self->_parse_tree().
    1670              
    1671             Additional notes on using the _parse_tree method:
    1672              
    1673             The built-in HTML::TreeBuilder object used to parse the page has
    1674             store_comments turned ON. If a backend needs to use a subclassed or
    1675             modified HTML::TreeBuilder object, the backend should set
    1676             $self->{'_treebuilder'} to that object before any results are
    1677             retrieved. The best place to do this is at the end of
    1678             _native_setup_search.
    1679              
    1680             my $oTree = new myTreeBuilder;
    1681             $oTree->store_pis(1); # for example
    1682             $self->{'_treebuilder'} = $oTree;
    1683              
    1684             When _parse_tree() is called, the $self->next_url is cleared.
    1685             During parsing, the backend should set $self->next_url to the appropriate URL for the next page of results.
    1686             (If _parse_tree() does not set the value, the search will end after parsing this page of results.)
    1687              
    1688             When _parse_tree() is called, the URL for the page being parsed can be
    1689             found in $self->{_prev_url}.
    1690              
    1691             =cut
    1692              
    1693             sub _parse_tree
    1694             {
    1695 1     1   9 my $self = shift;
    1696             # This is a NOP stub. Backend MUST define their own parse function!
    1697 1 50       6 print STDERR " FFF stub _parse_tree\n" if (DEBUG_FUNC || $self->{_debug});
    1698             # This is for backward-compatibility, for backends that define the
    1699             # old parse_tree(), but not the new _parse_tree():
    1700 1 50       17 return $self->parse_tree(@_) if $self->can('parse_tree');
    1701 1         4 return 0;
    1702             } # _parse_tree
    1703              
    1704              
    1705             sub _native_retrieve_some
    1706             {
    1707 0     0   0 my $self = shift;
    1708 0 0       0 if ($self->can('native_retrieve_some'))
    1709             {
    1710             # This is for backward-compatibility, for backends that define the
    1711             # old native_retrieve_some(), but not the new
    1712             # _native_retrieve_some():
    1713 0         0 return $self->native_retrieve_some(@_);
    1714             } # if
    1715 0 0       0 print STDERR " FFF _n_r_s\n" if (DEBUG_FUNC || $self->{_debug});
    1716             # Fast exit if already done:
    1717 0 0       0 return if (!defined($self->{_next_url}));
    1718 0 0       0 return if ($self->{_next_url} eq q{});
    1719             # If this is not the first page of results, sleep so as to not
    1720             # overload the server:
    1721 0   0     0 $self->{_next_to_retrieve} ||= 1;
    1722             $self->user_agent_delay if (
    1723             (1 < $self->{_next_to_retrieve})
    1724 0 0 0     0 ||
    1725             $self->need_to_delay
    1726             );
    1727             # Get one page of results:
    1728 0 0       0 print STDERR " + submitting URL (", $self->{'_next_url'}, ")\n" if $self->{_debug};
    1729 0         0 my $response = $self->http_request($self->http_method, $self->{'_next_url'});
    1730 0 0       0 print STDERR " + got response\n", $response->headers->as_string, "\n" if 2 <= $self->{_debug};
    1731 0         0 $self->{_prev_url} = $self->{_next_url};
    1732             # Assume there are no more results, unless we find out otherwise
    1733             # when we parse the html:
    1734 0         0 $self->{_next_url} = undef;
    1735 0         0 $self->{response} = $response;
    1736             # print STDERR " --- HTTP response is:\n", $response->as_string if 4 < $self->{_debug};
    1737 0 0       0 if (! $response->is_success)
    1738             {
    1739 0 0       0 if ($self->{_debug})
    1740             {
    1741 0         0 print STDERR " --- HTTP request failed, response is:\n", $response->as_string;
    1742             } # if
    1743 0         0 return;
    1744             } # if
    1745             # Pre-process the output:
    1746 0         0 my $sPage = $self->preprocess_results_page($response->content);
    1747             # Parse the output:
    1748 0         0 my $tree;
    1749 0 0       0 if (ref $self->{'_treebuilder'})
    1750             {
    1751             # print STDERR " + using existing _treebuilder\n" if 1 < $self->{_debug};
    1752             # Assume that the backend has installed their own TreeBuilder
    1753 0         0 $tree = $self->{'_treebuilder'};
    1754             }
    1755             else
    1756             {
    1757             # print STDERR " + creating new _treebuilder\n" if 1 < $self->{_debug};
    1758 0         0 $tree = HTML::TreeBuilder->new(
    1759             # use all default options
    1760             );
    1761 0         0 $tree->store_comments('yes');
    1762 0         0 $self->{'_treebuilder'} = $tree;
    1763             }
    1764             # If a reset() method becomes available in HTML::TreeBuilder, we can
    1765             # change this:
    1766 0         0 $tree->www_search_reset;
    1767             # print STDERR " + parsing content, tree is ", Dumper(\$tree) if 1 < $self->{_debug};
    1768             # use Encode;
    1769             # my $sPageOctets = Encode::encode_utf8($sPage);
    1770 0         0 $tree->utf8_mode('true');
    1771 0         0 $tree->parse($sPage);
    1772             # print STDERR " + done parsing content.\n" if 1 < $self->{_debug};
    1773 0         0 $tree->eof();
    1774 0 0       0 print STDERR " + calling _parse_tree...\n" if (1 < $self->{_debug});
    1775 0         0 return $self->_parse_tree($tree);
    1776             } # _native_retrieve_some
    1777              
    1778              
    1779             =item result_as_HTML
    1780              
    1781             Given a WWW::SearchResult object, formats it human-readable with HTML.
    1782              
    1783             =cut
    1784              
    1785             sub result_as_HTML
    1786             {
    1787             # print STDERR " DDD r_as_H(@_)\n";
    1788 6     6 1 422 my $self = shift;
    1789 6 100       27 my $oSR = shift or return '';
    1790 3 100       21 return '' unless (ref($oSR) =~ m'WWW::Search(::)?Result');
    1791 1         11 my $o = new CGI;
    1792 1         330 return join('',
    1793             $o->a(
    1794             { href => $oSR->url, },
    1795             $oSR->title,
    1796             ),
    1797             $o->br,
    1798             $oSR->description,
    1799             );
    1800             } # result_as_HTML
    1801              
    1802              
    1803             =item preprocess_results_page
    1804              
    1805             A filter on the raw HTML of the results page.
    1806             This allows the backend to alter the HTML before it is parsed,
    1807             such as to correct for known problems, HTML that can not be parsed correctly, etc.
    1808              
    1809             Takes one argument, a string (the HTML webpage);
    1810             returns one string (the same HTML, modified).
    1811              
    1812             This method is called from within _native_retrieve_some (above)
    1813             before the HTML of the page is parsed.
    1814              
    1815             See the WWW::Search::Ebay distribution 2.07 or higher for example
    1816             usage.
    1817              
    1818             =cut
    1819              
    1820             sub preprocess_results_page
    1821             {
    1822             # Here is just a stub. Return our argument without modification.
    1823 2     2 1 12 my $self = shift;
    1824 2         4 return shift;
    1825             } # preprocess_results_page
    1826              
    1827              
    1828             =item test_cases (DEPRECATED)
    1829              
    1830             Deprecated.
    1831              
    1832             Returns the value of the $TEST_CASES variable of the backend engine.
    1833              
    1834             =cut
    1835              
    1836             sub test_cases
    1837             {
    1838 1     1 1 6 my $self = shift;
    1839 1         46 return eval '$'.ref($self).'::TEST_CASES';
    1840             } # test_cases
    1841              
    1842             =item hash_to_cgi_string (DEPRECATED)
    1843              
    1844             Given a reference to a hash of string => string, constructs a CGI
    1845             parameter string that looks like 'key1=value1&key2=value2'.
    1846              
    1847             If the value is undef, the key will not be added to the string.
    1848              
    1849             At one time, for testing purposes, we asked backends to use this
    1850             function rather than piecing the URL together by hand, to ensure that
    1851             URLs are identical across platforms and software versions. But this
    1852             is no longer necessary.
    1853              
    1854             Example:
    1855              
    1856             $self->{_options} = {
    1857             'opt3' => 'val3',
    1858             'search_url' => 'http://www.deja.com/dnquery.xp',
    1859             'opt1' => 'val1',
    1860             'QRY' => $native_query,
    1861             'opt2' => 'val2',
    1862             };
    1863             $self->{_next_url} = $self->{_options}{'search_url'} .'?'.
    1864             $self->hash_to_cgi_string($self->{_options});
    1865              
    1866             =cut
    1867              
    1868             sub hash_to_cgi_string
    1869             {
    1870 2     2 1 20 my $self = shift;
    1871             # Because of the design of our test suite, we need our generated
    1872             # URLs to be identical on all systems, all versions of perl. Ergo
    1873             # we must explicitly control the order in which our CGI parameter
    1874             # strings are cobbled together. For now, I assume sorting the hash
    1875             # keys will suffice.
    1876 2         3 my $rh = shift;
    1877 2         4 my $ret = '';
    1878 2         13 foreach my $key (sort keys %$rh)
    1879             {
    1880             # printf STDERR "option: $key is " . $rh->{$key} . "\n";
    1881 4 50       11 next if generic_option($key);
    1882             # Throw out keys with undef values.
    1883 4 100       11 next unless defined($rh->{$key});
    1884             # If we want to let the user delete options, uncomment the next
    1885             # line. (They can still blank them out, which may or may not have
    1886             # the same effect):
    1887             # next unless $rh->{$key} ne '';
    1888 3         9 $ret .= $key .'='. $rh->{$key} .'&';
    1889             } # foreach $key
    1890             # Remove the trailing '&':
    1891 2         6 chop $ret;
    1892 2         4 return $ret;
    1893             } # hash_to_cgi_string
    1894              
    1895              
    1896             =back
    1897              
    1898             =head1 IMPLEMENTING NEW BACKENDS
    1899              
    1900             C supports backends to separate search engines. Each
    1901             backend is implemented as a subclass of C.
    1902             L provides a good sample backend.
    1903              
    1904             A backend must have the routine C<_native_setup_search()>. A backend
    1905             must have the routine C<_native_retrieve_some()> or C<_parse_tree()>.
    1906              
    1907             C<_native_setup_search()> is invoked before the search. It is passed
    1908             a single argument: the escaped, native version of the query.
    1909              
    1910             C<_native_retrieve_some()> is the core of a backend. It will be called
    1911             periodically to fetch URLs. It should retrieve several hits from the
    1912             search service and add them to the cache. It should return the number
    1913             of hits found, or undef when there are no more hits.
    1914              
    1915             Internally, C<_native_retrieve_some()> typically sends an HTTP request to
    1916             the search service, parses the HTML, extracts the links and
    1917             descriptions, then saves the URL for the next page of results. See
    1918             the code for the C module for an example.
    1919              
    1920             Alternatively, a backend can define the method C<_parse_tree()> instead
    1921             of C<_native_retrieve_some()>. See the C module for a
    1922             good example.
    1923              
    1924             If you implement a new backend, please let the authors know.
    1925              
    1926              
    1927             =head1 BUGS AND DESIRED FEATURES
    1928              
    1929             The bugs are there for you to find (some people call them Easter Eggs).
    1930              
    1931             Desired features:
    1932              
    1933             =over 4
    1934              
    1935             =item A portable query language.
    1936              
    1937             A portable language would easily allow you to move queries easily
    1938             between different search engines. A query abstraction is non-trivial
    1939             and unfortunately will not be done any time soon by the current
    1940             maintainer. If you want to take a shot at it, please let me know.
    1941              
    1942             =back
    1943              
    1944             =head1 AUTHOR
    1945              
    1946             John Heidemann
    1947             Maintained by Martin Thurn, C, L.
    1948              
    1949             =head1 COPYRIGHT
    1950              
    1951             Copyright (c) 1996 University of Southern California.
    1952             All rights reserved.
    1953              
    1954             Redistribution and use in source and binary forms are permitted
    1955             provided that the above copyright notice and this paragraph are
    1956             duplicated in all such forms and that any documentation, advertising
    1957             materials, and other materials related to such distribution and use
    1958             acknowledge that the software was developed by the University of
    1959             Southern California, Information Sciences Institute. The name of the
    1960             University may not be used to endorse or promote products derived from
    1961             this software without specific prior written permission.
    1962              
    1963             THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
    1964             WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
    1965             MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
    1966              
    1967             =cut
    1968              
    1969              
    1970             1;
    1971              
    1972             __END__