File Coverage

blib/lib/WWW/Search.pm
Criterion Covered Total %
statement 368 525 70.1
branch 90 202 44.5
condition 45 74 60.8
subroutine 80 88 90.9
pod 50 51 98.0
total 633 940 67.3


line stmt bran cond sub pod time code
1             # Search.pm
2             # by John Heidemann
3             # Copyright (C) 1996 by USC/ISI
4             #
5             # A complete copyright notice appears at the end of this file.
6              
7             =head1 NAME
8              
9             WWW::Search - Virtual base class for WWW searches
10              
11             =head1 SYNOPSIS
12              
13             use WWW::Search;
14             my $sEngine = 'AltaVista';
15             my $oSearch = new WWW::Search($sEngine);
16              
17             =head1 DESCRIPTION
18              
19             This class is the parent for all access methods supported by the
20             C library. This library implements a Perl API
21             to web-based search engines.
22              
23             See README for a list of search engines currently supported, and for a
24             lot of interesting high-level information about this distribution.
25              
26             Search results can be limited, and there is a pause between each
27             request to avoid overloading either the client or the server.
28              
29             =head2 Sample program
30              
31             Here is a sample program:
32              
33             my $sQuery = 'Columbus Ohio sushi restaurant';
34             my $oSearch = new WWW::Search('AltaVista');
35             $oSearch->native_query(WWW::Search::escape_query($sQuery));
36             $oSearch->login($sUser, $sPassword);
37             while (my $oResult = $oSearch->next_result())
38             {
39             print $oResult->url, "\n";
40             } # while
41             $oSearch->logout;
42              
43             Results are objects of type C
44             (see L for details).
45             Note that different backends support different result fields.
46             All backends are required to support title and url.
47              
48             =head1 SEE ALSO
49              
50             For specific search engines, see L
51             (replacing TheEngineName with a particular search engine).
52              
53             For details about the results of a search,
54             see L.
55              
56             =head1 METHODS AND FUNCTIONS FOR SEARCHERS
57              
58             =over
59              
60             =cut
61              
62             #####################################################################
63              
64             package WWW::Search;
65              
66 10     10   379208 use strict qw( vars );
  9         67  
  9         313  
67 8     8   694 use warnings;
  8         17  
  8         211  
68              
69 8     8   54 use Carp ();
  8         15  
  8         170  
70 8     8   7133 use CGI;
  8         225300  
  8         87  
71             # use Data::Dumper; # for debugging only
72 8     8   547 use Exporter;
  8         17  
  8         351  
73 8     8   281 use File::Find;
  7         14  
  7         511  
74 8     8   3839 use File::Spec::Functions;
  8         5360  
  8         573  
75 7     7   5307 use HTML::TreeBuilder;
  7         227697  
  7         77  
76 7     7   4549 use HTTP::Cookies;
  7         87290  
  7         228  
77 7     7   3215 use HTTP::Request;
  7         155092  
  7         254  
78 7     7   3721 use HTTP::Response;
  7         49999  
  7         246  
79 7     7   51 use HTTP::Status;
  7         15  
  7         1691  
80 7     7   3744 use LWP::MemberMixin;
  7         111  
  7         240  
81 7     7   3339 use LWP::RobotUA;
  7         121485  
  7         285  
82 7     7   54 use LWP::UserAgent;
  7         16  
  7         150  
83             # use Net::Domain qw( hostfqdn );
84 7     7   37 use URI;
  7         14  
  7         250  
85 7     7   47 use URI::Escape;
  7         15  
  7         485  
86             # use User;
87              
88             # Internal states:
89 7     7   45 use constant SEARCH_BEFORE => 1;
  7         15  
  7         407  
90 7     7   42 use constant SEARCH_UNDERWAY => 2;
  7         14  
  7         342  
91 7     7   40 use constant SEARCH_DONE => 3;
  7         15  
  7         289  
92 7     7   38 use constant SEARCH_RETRIEVING => 4;
  7         14  
  7         366  
93              
94 7     7   46 use vars qw( @ISA @EXPORT @EXPORT_OK );
  7         70  
  7         2970  
95             @EXPORT = qw();
96             @EXPORT_OK = qw( escape_query unescape_query generic_option strip_tags );
97             @ISA = qw(Exporter LWP::MemberMixin);
98             our
99             $MAINTAINER = 'Martin Thurn ';
100             our
101             $VERSION = 2.566;
102              
103             =item new
104              
105             To create a new WWW::Search, call
106              
107             $oSearch = new WWW::Search('SearchEngineName');
108              
109             where SearchEngineName is replaced with a particular search engine.
110             For example:
111              
112             $oSearch = new WWW::Search('Yahoo');
113              
114             If no search engine is specified, a default (currently 'Null::Empty')
115             will be chosen for you.
116              
117             =cut
118              
119             sub new
120             {
121 12     12 1 4943 my $class = shift;
122 12         28 my $engine = shift;
123             # Remaining arguments will become hash args
124              
125             # The default backend (not currently more configurable :-< )
126 12         23 my $default_engine = 'Null::Empty';
127 12         89 my $default_agent_name = "$class/$VERSION";
128 12         27 my $default_agent_email = '';
129 12 100       41 $engine = $default_engine if (!defined($engine));
130             # Load the engine, if necessary.
131 12         28 my $subclass = "${class}::$engine";
132 12         33 my $sEval = "use $subclass;";
133 5     5   1099 eval $sEval;
  5         13  
  5         173  
  12         851  
134 12 100       286 Carp::croak("can not load backend $engine ($@)") if ($@);
135 11         139 my $self = bless {
136             engine => $engine,
137             maximum_to_retrieve => 500, # both pages and hits
138             interrequest_delay => 0.25, # in seconds
139             agent_name => $default_agent_name,
140             agent_email => $default_agent_email,
141             env_proxy => 0,
142             http_method => 'GET',
143             http_proxy => undef,
144             http_proxy_user => undef,
145             http_proxy_pwd => undef,
146             timeout => 60,
147             _debug => 0,
148             _parse_debug => 0,
149             search_from_file => undef,
150             search_to_file => undef,
151             search_to_file_index => 0,
152             @_,
153             # variable initialization goes here
154             }, $subclass;
155 11         106 $self->reset_search();
156 11         55 return $self;
157             } # new
158              
159             =item version
160              
161             Returns the value of the $VERSION variable of the backend engine, or
162             $WWW::Search::VERSION if the backend does not contain $VERSION.
163              
164             =cut
165              
166             sub version
167             {
168 2     2 1 15 my $self = shift;
169 2         147 my $iVersion = eval '$'.ref($self).'::VERSION';
170             # print STDERR " + iVersion = >>>$iVersion<<<\n";
171 2   66     14 $iVersion ||= $VERSION;
172 2         12 return $iVersion;
173             } # version
174              
175             =item maintainer
176              
177             Returns the value of the $MAINTAINER variable of the backend engine,
178             or $WWW::Search::MAINTAINER if the backend does not contain
179             $MAINTAINER.
180              
181             =cut
182              
183             sub maintainer
184             {
185 2     2 1 8 my $self = shift;
186 2         95 my $sMaintainer = eval '$'.ref($self).'::MAINTAINER';
187             # print STDERR " + sMaintainer = >>>$sMaintainer<<<\n";
188 2   33     10 $sMaintainer ||= $MAINTAINER;
189 2         19 return $sMaintainer;
190             } # maintainer
191              
192              
193             =item installed_engines
194              
195             Returns a list of the names of all installed backends.
196             We can not tell if they are up-to-date or working, though.
197              
198             use WWW::Search;
199             my @asEngines = sort &WWW::Search::installed_engines();
200             local $" = ', ';
201             print (" + These WWW::Search backends are installed: @asEngines\n");
202             # Choose a backend at random (yes, this is rather silly):
203             my $oSearch = WWW::Search->new($asEngines[rand(scalar(@asEngines))]);
204              
205             =cut
206              
207 7     7   57 use constant DEBUG_ARC => 0;
  7         20  
  7         411  
208 7     7   43 use constant DEBUG_COOKIES => 0;
  7         14  
  7         365  
209 7     7   41 use constant DEBUG_FIND => 0;
  7         12  
  7         466  
210 7     7   45 use constant DEBUG_FUNC => 0;
  7         22  
  7         367  
211 7     7   47 use constant DEBUG_RETR => 0;
  7         10  
  7         42083  
212              
213             sub _wanted
214             {
215             # Code adapted from the following netnews post (Thank you, Tom!):
216             # From: Tom Christiansen (tchrist@mox.perl.com)
217             # Subject: SRC: pminst - find modules whose names match this pattern
218             # Newsgroups: comp.lang.perl.misc
219             # Date: 1999/02/15
220 20     20   29 my $startdir = shift;
221 20         28 my $sFullPath = $File::Find::name;
222 20         26 print STDERR " + _wanted($startdir, $sFullPath)\n" if DEBUG_FIND;
223 20 50 66     255 if (-d && /^[a-z]/)
224             {
225             # This is so we don't go down site_perl etc too early (whatever
226             # that means):
227 0         0 $File::Find::prune = 1;
228 0         0 DEBUG_FIND && print STDERR " + prune\n";
229 0         0 return;
230             } # if
231 20 100       114 unless ($sFullPath =~ s!\.pm\Z!!)
232             {
233 4         5 DEBUG_FIND && print STDERR " + not .pm\n";
234 4         354 return;
235             } # unless
236             # Delete absolute path off front of file path:
237 16         108 $sFullPath =~ s{^\Q$startdir\E[\\/]}{};
238 16         28 unless (1 || ($sFullPath =~ s!\AWWW/Search!!))
239             {
240             print STDERR " + not WWW/Search\n" if DEBUG_FIND;
241             return;
242             } # unless
243 16         21 print STDERR " + found $sFullPath\n" if DEBUG_FIND;
244 16         30 $sFullPath =~ s{/}{::}g;
245 16         22 $sFullPath =~ s!\A::!!;
246 16         304 return $sFullPath;
247             } # _wanted
248              
249             sub installed_engines
250             {
251             # Does NOT need a WWW::Search object to operate
252 1     1 1 676 my %hsi;
253 1         3 local $" = '|';
254 1         2 DEBUG_FIND && print STDERR " + installed_engines() start, INC is @INC...\n";
255             INC_DIR:
256 1         23 foreach my $sDir (map catdir($_, 'WWW', 'Search'), @INC)
257             {
258 10         17 DEBUG_FIND && print STDERR " + foreach ==$sDir==\n";
259 10 100       121 next INC_DIR unless -d $sDir;
260             File::Find::find(sub {
261 20   100 20   60 $hsi{&_wanted($sDir) || 'JUNKJUNK'}++;
262 2         164 }, $sDir);
263             } # foreach INC_DIR
264 1         6 delete $hsi{'JUNKJUNK'};
265 1         3 delete $hsi{'Test'};
266 1         1 delete $hsi{'Result'};
267 1         9 return keys %hsi;
268             } # installed_engines
269              
270              
271             =item native_query
272              
273             Specify a query (and optional options) to the current search object.
274             Previous query (if any) and its cached results (if any) will be thrown away.
275             The option values and the query must be escaped;
276             call L to escape a string.
277             The search process is not actually begun until C or
278             C is called (lazy!), so native_query does not return anything.
279              
280             Example:
281              
282             $oSearch->native_query('search-engine-specific+escaped+query+string',
283             { option1 => 'able', option2 => 'baker' } );
284              
285             The hash of options following the query string is optional.
286             The query string is backend-specific.
287             There are two kinds of options:
288             options specific to the backend,
289             and generic options applicable to multiple backends.
290              
291             Generic options all begin with 'search_'.
292             Currently a few are supported:
293              
294             =over 4
295              
296             =item search_url
297              
298             Specifies the base URL for the search engine.
299              
300             =item search_debug
301              
302             Enables backend debugging. The default is 0 (no debugging).
303              
304             =item search_parse_debug
305              
306             Enables backend parser debugging. The default is 0 (no debugging).
307              
308             =item search_to_file FILE
309              
310             Causes the search results to be saved in a set of files
311             prefixed by FILE.
312             (Used internally by the test-suite, not intended for general use.)
313              
314             =item search_from_file FILE
315              
316             Reads a search from a set of files prefixed by FILE.
317             (Used internally by the test-suite, not intended for general use.)
318              
319             =back
320              
321             Some backends may not implement these generic options,
322             but any which do implement them must provide these semantics.
323              
324             Backend-specific options are described
325             in the documentation for each backend.
326             In most cases the options and their values are packed together to create the query portion of
327             the final URL.
328              
329             Details about how the search string and option hash are interpreted
330             might be found in the search-engine-specific manual pages
331             (WWW::Search::SearchEngineName).
332              
333             =cut
334              
335             sub native_query
336             {
337 7     7 1 317 my $self = shift;
338 7 100       29 print STDERR " FFF native_query($_[0])\n" if (DEBUG_FUNC || $self->{_debug});
339 7         29 $self->reset_search();
340 7         19 $self->{'native_query'} = $_[0];
341 7         17 $self->{'native_options'} = $_[1];
342             # promote generic options
343 7         12 my $opts_ref = $_[1];
344 7         30 foreach my $sKey (keys %$opts_ref)
345             {
346 4 100       10 if (generic_option($sKey))
347             {
348 2 100       12 print STDERR " + promoting $sKey to $self\n" if $self->{_debug};
349 2         11 $self->{$sKey} = $opts_ref->{$sKey};
350             # delete $opts_ref->{$sKey};
351             } # if
352             } # foreach
353 7 50       44 $self->setup_search() if ($self->{state} == SEARCH_BEFORE);
354             } # native_query
355              
356              
357             =item gui_query
358              
359             Specify a query to the current search object;
360             the query will be performed with the engine's default options,
361             as if it were typed by a user in a browser window.
362              
363             Same arguments as C above.
364              
365             Currently, this feature is supported by only a few backends;
366             consult the documentation for each backend to see if it is implemented.
367              
368             =cut
369              
370             sub gui_query
371             {
372             # This function is a stub to prevent runtime errors. This function
373             # should be defined in each backend as appropriate. See Yahoo.pm in
374             # the WWW-Search-Yahoo distribution for an example of how to
375             # implement it.
376 2     2 1 160 my $self = shift;
377 2         13 return $self->native_query(@_);
378             } # gui_query
379              
380              
381             =item cookie_jar
382              
383             Call this method (anytime before asking for results) if you want to
384             communicate cookie data with the search engine. Takes one argument,
385             either a filename or an HTTP::Cookies object. If you give a filename,
386             WWW::Search will attempt to read/store cookies there (by passing the
387             filename to HTTP::Cookies::new).
388              
389             $oSearch->cookie_jar('/tmp/my_cookies');
390              
391             If you give an HTTP::Cookies object, it is up to you to save the
392             cookies if/when you wish.
393              
394             use HTTP::Cookies;
395             my $oJar = HTTP::Cookies->new(...);
396             $oSearch->cookie_jar($oJar);
397              
398             If you pass in no arguments, the cookie jar (if any) is returned.
399              
400             my $oJar = $oSearch->cookie_jar;
401             unless (ref $oJar) { print "No jar" };
402              
403             =cut
404              
405             sub cookie_jar
406             {
407 4     4 1 333 my $self = shift;
408 4 100       14 if (! @_)
409             {
410 1         4 return $self->{'_cookie_jar'};
411             } # if no arguments
412 3         6 my $arg = shift;
413 3         7 my $sRef = ref($arg);
414 3 100       14 if ($sRef =~ m!\AHTTP::Cookies!)
    100          
415             {
416 1         3 print STDERR " + WWW::Search using caller's $sRef object\n" if DEBUG_COOKIES;
417 1         2 $self->{'_cookie_jar'} = $arg;
418 1         3 $self->{'_cookie_jar_we_save'} = 0;
419             } # if
420             elsif (! ref($arg))
421             {
422             # Assume that $arg is a file name:
423 1         3 print STDERR " + WWW::Search using Cookies from file $arg\n" if DEBUG_COOKIES;
424 1         11 $self->{'_cookie_jar'} = HTTP::Cookies->new(
425             'file' => $arg,
426             'autosave' => 1,
427             'ignore_discard' => 1,
428             );
429 1         107 $self->{'_cookie_jar'}->load;
430 1         31 $self->{'_cookie_jar_we_save'} = 1;
431             # print STDERR " + WWW::Search just loaded cookies from $arg\n";
432             }
433             else
434             {
435 1         207 Carp::carp "argument to WWW::Search::cookie_jar() must be a scalar or a flavor of HTTP::Cookies";
436             }
437             } # cookie_jar
438              
439              
440             =item date_from
441              
442             Set/get the start date for limiting the query by a date range. See
443             the documentation for each backend to find out if date ranges are
444             supported.
445              
446             =item date_to
447              
448             Set/get the end date for limiting the query by a date range. See the
449             documentation for each backend to find out if date ranges are
450             supported.
451              
452             =cut
453              
454             sub date_from
455             {
456 3   100 3 1 802 return shift->_elem('date_from', @_) || '';
457             } # date_from
458              
459             sub date_to
460             {
461 3   100 3 1 24 return shift->_elem('date_to', @_) || '';
462             } # date_to
463              
464              
465             =item env_proxy
466              
467             Enable loading proxy settings from environment variables.
468             The proxy URL will be read from $ENV{http_proxy}.
469             The username for authentication will be read from $ENV{http_proxy_user}.
470             The password for authentication will be read from $ENV{http_proxy_pwd}.
471              
472             If you don't want to put passwords in the environment, one solution
473             would be to subclass LWP::UserAgent and use $ENV{WWW_SEARCH_USERAGENT}
474             instead (see user_agent below).
475              
476             env_proxy() must be called before the first retrieval is attempted.
477              
478             Example:
479              
480             $ENV{http_proxy } = 'http://my.proxy.com:80';
481             $ENV{http_proxy_user} = 'bugsbun';
482             $ENV{http_proxy_pwd } = 'c4rr0t5';
483             $oSearch->env_proxy('yes'); # Turn on with any true value
484             ...
485             $oSearch->env_proxy(0); # Turn off with zero
486             ...
487             if ($oSearch->env_proxy) # Test
488              
489             =cut
490              
491             # contributed by Klaus Johannes Rusch
492             sub env_proxy
493             {
494 7     7 1 32 return shift->_elem('env_proxy', @_);
495             } # env_proxy
496              
497              
498             =item http_proxy
499              
500             Set up an HTTP proxy (for connections from behind a firewall).
501              
502             Takes the same arguments as LWP::UserAgent::proxy().
503              
504             This routine should be called before calling any of the result
505             functions (any method with "result" in its name).
506              
507             Example:
508              
509             # Turn on and set address:
510             $oSearch->http_proxy(['http','ftp'] => 'http://proxy:8080');
511             # Turn off:
512             $oSearch->http_proxy('');
513              
514             =cut
515              
516             sub http_proxy
517             {
518 17     17 1 1166 my $self = shift;
519             # Make a copy of our arguments:
520 17 100       39 if (@_)
521             {
522 3         10 my @a = @_;
523 3         10 return $self->_elem('http_proxy', \@a);
524             }
525             else
526             {
527 14         38 return $self->_elem('http_proxy');
528             }
529             } # http_proxy
530              
531              
532             =item http_proxy_user, http_proxy_pwd
533              
534             Set/get HTTP proxy authentication data.
535              
536             These routines set/get username and password used in proxy
537             authentication.
538             Authentication is attempted only if all three items (proxy URL, username
539             and password) have been set.
540              
541             Example:
542              
543             $oSearch->http_proxy_user("myuser");
544             $oSearch->http_proxy_pwd("mypassword");
545             $oSearch->http_proxy_user(undef); # Example for no authentication
546              
547             $username = $oSearch->http_proxy_user();
548              
549             =cut
550              
551             sub http_proxy_user
552             {
553 9     9 1 1191 return shift->_elem('http_proxy_user', @_);
554             }
555              
556             sub http_proxy_pwd
557             {
558 7     7 1 1264 return shift->_elem('http_proxy_pwd', @_);
559             }
560              
561              
562             =item maximum_to_retrieve
563              
564             Set the maximum number of hits to return.
565             Queries resulting in more than this many hits will return
566             the first hits, up to this limit.
567             Although this specifies a maximum limit,
568             search engines might return less than this number.
569              
570             Defaults to 500.
571              
572             Example:
573             $max = $oSearch->maximum_to_retrieve(100);
574              
575             You can also spell this method "maximum_to_return".
576              
577             =cut
578              
579             sub maximum_to_retrieve
580             {
581 1     1 1 6 return shift->_elem('maximum_to_retrieve', @_);
582             }
583              
584              
585             =item maximum_to_return
586              
587             Synonym for maximum_to_retrieve
588              
589             =cut
590              
591             sub maximum_to_return
592             {
593 1     1 1 7 return shift->_elem('maximum_to_retrieve', @_);
594             }
595              
596              
597             =item timeout
598              
599             The maximum length of time any portion of the query should take,
600             in seconds.
601              
602             Defaults to 60.
603              
604             Example:
605             $oSearch->timeout(120);
606              
607             =cut
608              
609             sub timeout
610             {
611 3     3 1 548 return shift->_elem('timeout', @_);
612             }
613              
614              
615             =item login
616              
617             Backends which need to login to the search engine should implement
618             this function. Takes two arguments, user and password. Return
619             nonzero if login was successful. Return undef or 0 if login failed.
620              
621             =cut
622              
623             sub login
624             {
625 1     1 1 3 my $self = shift;
626             # Here is just a stub.
627 1         7 return 1;
628             # These are the arguments:
629 0         0 my ($sUser, $sPassword) = @_;
630             } # login
631              
632             =item logout
633              
634             Backends which need to logout from the search engine should implement
635             this function.
636              
637             =cut
638              
639             sub logout
640             {
641 1     1 1 6 my $self = shift; # no other args
642             # Here is just a stub.
643             } # logout
644              
645              
646             =item approximate_result_count
647              
648             Some backends indicate how many results they have found,
649             e.g. with a number shown at the top of the search results page.
650             Note that there is no corresponding method that returns the actual count of results;
651             that's because results are normally retrieved in batches (i.e. pages)
652             and at any given time there's no way to know how big the final list of results will be.
653             NEW: if request has not been made to the search provider,
654             this method will return undef (used to return zero).
655             NEW: if the results page does not explicitly indicate the result count,
656             this method will return undef (used to return zero).
657              
658             =cut
659              
660             sub approximate_result_count
661             {
662 14     14 1 1944 my $self = shift;
663             # Optional arg1 = new value for this option.
664 14         25 my $iArg = shift;
665 14 100 100     110 $iArg = undef if (defined($iArg) && ($iArg eq ''));
666 14         27 DEBUG_FUNC && print STDERR " FFF a_r_c(state=$self->{state},iArg=$iArg)\n";
667 14         19 DEBUG_ARC && print STDERR " + a_r_c(state=$self->{state},iArg=$iArg)\n";
668 14 100 100     53 if (defined($iArg) && (0 <= $iArg))
669             {
670             # Caller is trying to SET the value:
671 4         7 DEBUG_ARC && print STDERR " + a_r_cSET(state=$self->{state},iArg=$iArg)\n";
672 4         10 $self->{'approx_count'} = $iArg;
673 4         18 return $iArg;
674             } # if
675 10 100 66     73 if (
676             # This prevents infinite recursion, for when retrieve_some()
677             # calls this function in order to SET the value:
678             ($self->{state} ne SEARCH_RETRIEVING)
679             &&
680             # This prevents useless repeat calls to retrieve_some() after
681             # the search has been completed:
682             ($self->{state} ne SEARCH_DONE))
683             {
684 2         4 DEBUG_ARC && print STDERR " DDD a_r_c prime the pump\n";
685             # Prime the pump, if necessary:
686 2         12 $self->retrieve_some();
687             } # if
688 10         24 $iArg = $self->{'approx_count'};
689 10         12 DEBUG_ARC && print STDERR " + a_r_cGET(state=$self->{state},answer=$iArg)\n";
690 10         49 return $iArg;
691             } # approximate_result_count
692              
693             =item approximate_hit_count
694              
695             This is an alias for approximate_result_count().
696              
697             =cut
698              
699             sub approximate_hit_count
700             {
701 2     2 1 7 shift->approximate_result_count(@_);
702             } # approximate_hit_count
703              
704              
705             =item results
706              
707             Return all the results of a query as an array of WWW::SearchResult
708             objects.
709              
710             Note: This might take a while, because a web backend will keep asking
711             the search engine for "next page of results" over and over until there
712             are no more next pages, and THEN return from this function.
713              
714             If an error occurs at any time during query processing, it will be
715             indicated in the response().
716              
717             Example:
718              
719             @results = $oSearch->results();
720             # Go have a cup of coffee while the previous line executes...
721             foreach $oResult (@results)
722             {
723             print $oResult->url(), "\n";
724             } # foreach
725              
726             =cut
727              
728             sub results
729             {
730 6     6 1 337 my $self = shift;
731 6 50       21 print STDERR " FFF results(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug});
732             # Put all the search results into the cache:
733 6         29 1 while ($self->retrieve_some());
734 6   50     32 $self->{cache} ||= [];
735 6         13 my $iMax = scalar(@{$self->{cache}});
  6         15  
736             # print STDERR " + mtr is ", $self->{maximum_to_retrieve}, "\n" if $self->{_debug};
737             # print STDERR " + cache contains $iMax results\n" if $self->{_debug};
738 6 50       23 $iMax = $self->{maximum_to_retrieve} if ($self->{maximum_to_retrieve} < $iMax);
739             # print STDERR " + returning $iMax results\n" if $self->{_debug};
740 6         16 return @{$self->{cache}}[0..$iMax-1];
  6         230  
741             } # results
742              
743             =item next_result
744              
745             Call this method repeatedly to return each result of a query as a
746             WWW::SearchResult object. Example:
747              
748             while ($oResult = $oSearch->next_result())
749             {
750             print $oResult->url(), "\n";
751             } # while
752              
753             When there are no more results, or if an error occurs, next_result()
754             will return undef.
755              
756             If an error occurs at any time during query processing, it will be
757             indicated in the response().
758              
759             =cut
760              
761             sub next_result
762             {
763 46     46 1 59 my $self = shift;
764             # Carp::croak "search not yet specified" if (!defined($self->{'native_query'}));
765 46 100       79 return undef if ($self->{next_to_return} >= $self->{maximum_to_retrieve});
766 45         51 while (1)
767             {
768 45 100       53 if ($self->{next_to_return} <= $#{$self->{cache}})
  45         78  
769             {
770             # The cache already contains the desired element; return it:
771 44         55 my $i = ($self->{next_to_return})++;
772 44         75 return $self->{cache}->[$i];
773             } # if
774             # If we get here, then the desired element is beyond the end of
775             # the cache.
776 1 50       6 if ($self->{state} == SEARCH_DONE)
777             {
778             # There are no more results to be gotten; fail & bail:
779 1         3 return undef;
780             } # if
781             # Get some more results into the cache:
782 0         0 $self->retrieve_some();
783             # Go back and try again:
784             } # while infinite
785             } # next_result
786              
787              
788             =item seek_result($offset)
789              
790             Set which result should be returned next time
791             C is called. Results are zero-indexed.
792              
793             The only guaranteed valid offset is 0,
794             which will replay the results from the beginning.
795             In particular, seeking past the end of the current cached
796             results probably will not do what you might think it should.
797              
798             Results are cached, so this does not re-issue the query
799             or cause IO (unless you go off the end of the results).
800             To re-do the query, create a new search object.
801              
802             Example:
803              
804             $oSearch->seek_result(0);
805              
806             =cut
807              
808             sub seek_result
809             {
810 3     3 1 549 my ($self, $desired) = @_;
811 3         5 my $old = $self->{next_to_return};
812 3 100 100     17 $self->{next_to_return} = $desired if (defined($desired) and (0 <= $desired));
813 3         7 return $old;
814             } # seek_result
815              
816              
817             =item response
818              
819             Returns an L object which resulted from the
820             most-recently-sent query. Errors can be detected like this:
821              
822             if (! $oSearch->response->is_success)
823             {
824             print STDERR "Error: " . $oSearch->response->as_string() . "\n";
825             } # if
826              
827             Note to backend authors: even if the backend does not involve the web,
828             it should return an L object.
829              
830             =cut
831              
832             sub response
833             {
834 7     7 1 24 my $self = shift;
835 7   33     24 $self->{response} ||= new HTTP::Response(RC_OK);
836 7         28 return $self->{response};
837             } # response
838              
839              
840             =item submit
841              
842             This method can be used to submit URLs to the search engines for indexing.
843             Consult the documentation for each backend to find out if it is implemented there,
844             and if so what the arguments are.
845              
846             Returns an HTTP::Response object describing the result of the submission request.
847             Consult the documentation for each backend to find out the meaning of the response.
848              
849             =cut
850              
851             sub submit
852             {
853 1     1 1 10 return new HTTP::Response(788, 'Sorry, this backend does not support the submit() method.');
854             } # submit
855              
856              
857             =item opaque
858              
859             This function provides an application a place to store
860             one opaque data element (or many, via a Perl reference).
861             This facility is useful to (for example),
862             maintain client-specific information in each active query
863             when you have multiple concurrent queries.
864              
865             =cut
866              
867 3     3 1 766 sub opaque { return shift->_elem('opaque', @_); }
868              
869              
870             =item escape_query
871              
872             Escape a query.
873             Before queries are sent to the internet, special characters must be escaped
874             so that a proper URL can be formed.
875             This is like escaping a URL,
876             but all non-alphanumeric characters are escaped and
877             and spaces are converted to "+"s.
878              
879             Example:
880              
881             $escaped = WWW::Search::escape_query('+hi +mom');
882             # $escaped is now '%2Bhi+%2Bmom'
883              
884             See also C.
885             NOTE that this is not a method, it is a plain function.
886              
887             =cut
888              
889             sub escape_query
890             {
891 2     2 1 13 my $text = join(' ', @_);
892 2   100     10 $text ||= '';
893             # print STDERR " + escape_query($text)\n";
894 2         17 $text =~ s/([^ A-Za-z0-9])/$URI::Escape::escapes{$1}/g; #"
895             # print STDERR " + escape_query($text)\n";
896 2         7 $text =~ s/ /+/g;
897             # print STDERR " + escape_query($text)\n";
898 2         9 return $text;
899             } # escape_query
900              
901             =item unescape_query
902              
903             Unescape a query.
904             See C for details.
905              
906             Example:
907              
908             $unescaped = WWW::Search::unescape_query('%22hi+mom%22');
909             # $unescaped eq q{"hi mom"}
910              
911             NOTE that this is not a method, it is a plain function.
912              
913             =cut
914              
915             sub unescape_query
916             {
917             # code stolen from URI::Escape.pm.
918 2     2 1 8 my @copy = @_;
919 2         6 for (@copy)
920             {
921 4         12 s!\+! !g;
922 4         9 s!\%([\dA-Fa-f]{2})!chr(hex($1))!eg;
  2         13  
923             } # for
924 2 100       13 return wantarray ? @copy : $copy[0];
925             } # unescape_query
926              
927             =item strip_tags
928              
929             Given a string, returns a copy of that string with HTML tags removed.
930             This should be used by each backend as they insert the title and
931             description values into the search results objects.
932              
933             NOTE that this is not a method, it is a plain function.
934              
935             =cut
936              
937             sub strip_tags
938             {
939             # Prevent undef warnings if we get passed any undefined values:
940 2     2 1 545 my @args = @_;
941 2   100     7 @args = map { $_ ||= '' } @args;
  5         22  
942 2         9 my $s = join('', @args);
943             # Special case: change BR to space:
944 2         7 $s =~ s!
! !gi;
945             # We assume for now that we will not be encountering tags with
946             # embedded '>' characters!
947 2         12 $s =~ s/\074.+?\076//g;
948 2         5 $s =~ s/ / /g;
949 2         5 $s =~ s/"/\042/g;
950 2         5 $s =~ s/&/\046/g;
951 2         3 $s =~ s/</\074/g;
952 2         5 $s =~ s/>/\076/g;
953             # Coalesce multiple spaces:
954 2         9 $s =~ tr!\040\t\r\n! !s;
955             # Delete leading & trailing spaces:
956 2         7 $s =~ s!\A\s+!!;
957 2         8 $s =~ s!\s+\Z!!;
958 2         8 return $s;
959             } # strip_tags
960              
961             =item is_http_proxy
962              
963             Returns true if proxy information is available.
964              
965             =cut
966              
967             sub is_http_proxy
968             {
969 12     12 1 28 my $self = shift;
970 12         38 my $ra = $self->http_proxy;
971 12   100     167 my $ret = (
972             ('ARRAY' eq ref($ra))
973             &&
974             defined($ra->[0])
975             &&
976             ($ra->[0] ne '')
977             );
978             # print STDERR " DDD is_http_proxy() return =$ret=\n";
979 12         48 return $ret;
980             } # is_http_proxy
981              
982             =back
983              
984             =head1 METHODS AND FUNCTIONS FOR BACKEND PROGRAMMERS
985              
986             =over
987              
988             =item reset_search
989              
990             Resets internal data structures to start over with a new search (on
991             the same engine).
992              
993             =cut
994              
995             sub reset_search
996             {
997 18     18 1 37 my $self = shift;
998 18 100       95 print STDERR " FFF reset_search(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug});
999 18         341 $self->{'cache'} = [];
1000 18         41 $self->{'native_query'} = '';
1001 18         37 $self->{'next_to_retrieve'} = 1;
1002 18         36 $self->{'next_to_return'} = 0;
1003 18         31 $self->{'number_retrieved'} = 0;
1004 18         34 $self->{'requests_made'} = 0;
1005 18         32 $self->{'state'} = SEARCH_BEFORE;
1006 18         28 $self->{'_next_url'} = '';
1007             # This method is called by native_query(). native_query() is called
1008             # either by gui_query() or by the user. In the case that
1009             # gui_query() was called, we do NOT want to clear out the _options
1010             # hash. For now, I implement a pretty ugly hack to make this work:
1011 18 100       69 if (caller(2))
1012             {
1013 2         16 my @as = caller(2);
1014 2 50       10 if (1 < scalar(@as))
1015             {
1016             # print STDERR " in reset_search(), as is (", join(',', @as), ")\n";
1017 2 50       12 return if $as[3] =~ m/gui_query/;
1018             } # if
1019             } # if
1020 16         50 $self->{_options} = ();
1021             } # reset_search
1022              
1023              
1024             =item is_http_proxy_auth_data
1025              
1026             Returns true if all authentication data
1027             (proxy URL, username, and password) are available.
1028              
1029             =cut
1030              
1031             sub is_http_proxy_auth_data
1032             {
1033 5     5 1 10 my $self = shift;
1034             # print STDERR (" DDD http_proxy is ", Dumper(\$self));
1035 5   66     11 my $ret = (
1036             $self->is_http_proxy
1037             &&
1038             defined($self->http_proxy_user)
1039             &&
1040             ($self->http_proxy_user ne '')
1041             &&
1042             defined($self->http_proxy_pwd)
1043             &&
1044             ($self->http_proxy_pwd ne '')
1045             );
1046             # print STDERR " DDD is_http_proxy_auth_data() return =$ret=\n";
1047 5         49 return $ret;
1048             } # is_http_proxy_auth_data
1049              
1050              
1051             =item agent_name($sName)
1052              
1053             If your search engine rejects certain browser,
1054             you can trick it into thinking you're any browser type you want.
1055             See below under user_agent().
1056              
1057             =cut
1058              
1059             sub agent_name
1060             {
1061 7     7 1 868 return shift->_elem('agent_name', @_);
1062             }
1063              
1064             =item agent_email($sName)
1065              
1066             =cut
1067              
1068             sub agent_email
1069             {
1070 7     7 1 1352 return shift->_elem('agent_email', @_);
1071             }
1072              
1073             =item user_agent($NON_ROBOT)
1074              
1075             This internal routine creates a user-agent for derived classes that
1076             query the web. If any non-false argument is given, a normal
1077             LWP::UserAgent (rather than a LWP::RobotUA) is used.
1078              
1079             Returns the user-agent object.
1080              
1081             If a backend needs the low-level LWP::UserAgent or LWP::RobotUA to
1082             have a particular name, $oSearch->agent_name() and possibly
1083             $oSearch->agent_email() should be called to set the desired values
1084             *before* calling $oSearch->user_agent().
1085              
1086             If the environment variable WWW_SEARCH_USERAGENT has a value, it will
1087             be used as the class for a new user agent object. This class should
1088             be a subclass of LWP::UserAgent. For example,
1089              
1090             $ENV{WWW_SEARCH_USERAGENT} = 'My::Own::UserAgent';
1091             # If this env.var. has no value,
1092             # LWP::UserAgent or LWP::RobotUA will be used.
1093             $oSearch = new WWW::Search('MyBackend');
1094             $oSearch->agent_name('MySpider');
1095             if ($iBackendWebsiteRequiresNonRobot)
1096             {
1097             $oSearch->user_agent('non-robot');
1098             }
1099             else
1100             {
1101             $oSearch->agent_email('me@here.com');
1102             $oSearch->user_agent();
1103             }
1104              
1105             Backends should use robot-style user-agents whenever possible.
1106              
1107             =cut
1108              
1109             sub _load_env_useragent
1110             {
1111 4     4   6 my $self = shift;
1112 4   100     16 my $sUA = $ENV{'WWW_SEARCH_USERAGENT'} || '';
1113 4         7 my $ua;
1114 4 100       16 if ($sUA ne '')
1115             {
1116 2         131 eval "use $sUA";
1117             # print STDERR " DDD this is after eval use $sUA\n";
1118 2 100       12 if (! $@)
1119             {
1120             # Successfully loaded module.
1121 1         4 eval { $ua = $sUA->new };
  1         13  
1122             # print STDERR " DDD this is after eval new $sUA\n";
1123 1 50 33     6 if (ref($ua) && ! $@)
1124             {
1125             # Successfully created object.
1126 0         0 return $ua;
1127             }
1128             else
1129             {
1130 1         10 warn " --- WWW::Search::user_agent can not create $sUA object: $@\n";
1131             # Fall through and try the other methods:
1132             }
1133             }
1134             else
1135             {
1136 1         12 warn " --- WWW::Search::user_agent can not load $sUA: $@\n";
1137             # Fall through and try the other methods:
1138             }
1139             } # if found WWW_SEARCH_USERAGENT in environment
1140             } # _load_env_useragent
1141              
1142             sub user_agent
1143             {
1144 8     8 1 1527 my $self = shift;
1145 8 100       20 unless (@_)
1146             {
1147             # If NO ARGS, return the previously-created agent (if any):
1148 4 50       18 return $self->{'user_agent'} if ref($self->{'user_agent'});
1149             } # unless
1150 4   50     12 my $non_robot = shift || 0;
1151 4         9 my $ua = _load_env_useragent();
1152             # If we loaded a UserAgent, don't do any of this stuff:
1153 4 50       30 if (! ref $ua)
1154             {
1155 4 50       9 if ($non_robot)
1156             {
1157             eval
1158 4         8 {
1159 4         23 $ua = new LWP::UserAgent;
1160 4         3289 $ua->agent($self->agent_name);
1161 4         275 $ua->from($self->agent_email);
1162             }; # end of eval block
1163             }
1164             else
1165             {
1166 0         0 $ua = LWP::RobotUA->new($self->agent_name, $self->agent_email);
1167 0         0 $ua->delay($self->{'interrequest_delay'});
1168             }
1169 4         230 $ua->timeout($self->{'timeout'});
1170 4 100       59 eval { $ua->proxy(@{$self->{'http_proxy'}}) } if $self->is_http_proxy;
  1         3  
  1         7  
1171 4 50       246 if ($self->env_proxy)
1172             {
1173 0         0 $ua->env_proxy($self->env_proxy);
1174             # Read password from ENV:
1175 0         0 $self->http_proxy_user($ENV{http_proxy_user});
1176 0         0 $self->http_proxy_pwd ($ENV{http_proxy_pwd});
1177             } # if
1178             } # if ! ref $ua
1179 4         40 $self->{'user_agent'} = $ua;
1180 4         37 return $ua;
1181             } # user_agent
1182              
1183              
1184             =item http_referer
1185              
1186             Get / set the value of the HTTP_REFERER variable for this search object.
1187             Some search engines might only accept requests that originated at some specific previous page.
1188             This method lets backend authors "fake" the previous page.
1189             Call this method before calling http_request.
1190              
1191             $oSearch->http_referer('http://prev.engine.com/wherever/setup.html');
1192             $oResponse = $oSearch->http_request('GET', $url);
1193              
1194             =cut
1195              
1196             sub http_referer
1197             {
1198 3     3 1 801 return shift->_elem('_http_referer', @_);
1199             }
1200              
1201              
1202             =item http_method
1203              
1204             Get / set the method to be used for the HTTP request.
1205             Must be either 'GET' or 'POST'.
1206             Call this method before calling http_request.
1207             (Normally you would set this during _native_setup_search().)
1208             The default is 'GET'.
1209              
1210             $oSearch->http_method('POST');
1211              
1212             =cut
1213              
1214             sub http_method
1215             {
1216 3     3 1 1329 shift->_elem('http_method', @_);
1217             }
1218              
1219              
1220             =item http_request($method, $url)
1221              
1222             Submit the HTTP request to the world, and return the response.
1223             Similar to LWP::UserAgent::request.
1224             Handles cookies, follows redirects, etc.
1225             Requires that http_referer already be set up, if needed.
1226              
1227             =cut
1228              
1229             sub http_request
1230             {
1231 0     0 1 0 my $self = shift;
1232 0         0 my ($method, $url) = @_;
1233 0         0 my $response;
1234 0 0       0 if (50 <= $self->{_debug})
1235             {
1236 0         0 eval q{ use LWP::Debug qw(+) }; # } # emacs bug
1237             } # if
1238 0 0       0 if ($self->{search_from_file})
1239             {
1240 0         0 $response = $self->_http_request_from_file($url);
1241             }
1242             else
1243             {
1244             # fetch it
1245 0         0 my $request;
1246 0 0       0 if ($method eq 'POST')
1247             {
1248 0         0 my $uri_url = URI->new($url);
1249 0         0 my $equery = $uri_url->query;
1250 0         0 $uri_url->query(undef); # we will handle the query ourselves
1251 0         0 $request = new HTTP::Request($method, $uri_url);
1252 0         0 $request->header('Content-Type', 'application/x-www-form-urlencoded');
1253 0         0 $request->header('Content-Length', length $equery);
1254 0         0 $request->content($equery);
1255             }
1256             else
1257             {
1258 0         0 $request = new HTTP::Request($method, $url);
1259             }
1260 0         0 $request->header('Accept-Charset' => 'iso-8859-1,*,utf-8');
1261 0 0       0 if ($self->is_http_proxy_auth_data)
1262             {
1263 0         0 $request->proxy_authorization_basic($self->http_proxy_user,
1264             $self->http_proxy_pwd);
1265             } # if
1266 0 0       0 $self->{'_cookie_jar'}->add_cookie_header($request) if ref($self->{'_cookie_jar'});
1267              
1268 0 0 0     0 if ($self->{'_http_referer'} && ($self->{'_http_referer'} ne ''))
1269             {
1270 0         0 my $s = uri_escape($self->{'_http_referer'});
1271             # print STDERR " + referer($s), ref(s) = ", ref($s), "\n";
1272 0 0       0 $s = $s->as_string if ref($s) =~ m!URI!;
1273 0         0 $request->referer($s);
1274             } # if referer
1275 0 0       0 print STDERR " DDD raw HTTP::Request is:\n", $request->as_string if (3 <= $self->{_debug});
1276 0         0 my $ua = $self->user_agent();
1277              
1278             TRY_GET:
1279 0         0 while (1)
1280             {
1281 0         0 $response = $ua->request($request);
1282             printf(STDERR " + got HTTP::Response (code=%d):\n%s",
1283             $response->code,
1284 0 0       0 $response->headers->as_string) if (3 <= $self->{_debug});
1285 0 0       0 if (ref($self->{'_cookie_jar'}))
1286             {
1287 0         0 $self->{'_cookie_jar'}->extract_cookies($response);
1288 0 0       0 $self->{'_cookie_jar'}->save if $self->{'_cookie_jar_we_save'};
1289 0         0 print STDERR " + WWW::Search just extracted cookies\n" if DEBUG_COOKIES;
1290 0         0 print STDERR $self->{'_cookie_jar'}->as_string if DEBUG_COOKIES;
1291             # print STDERR Dumper($self->{'_cookie_jar'}) if DEBUG_COOKIES;
1292             } # if
1293 0 0 0     0 if ($self->{'search_to_file'} && $response->is_success)
1294             {
1295 0         0 $self->_http_request_to_file($url, $response);
1296             } # if
1297 0 0       0 last TRY_GET if ($response->is_success);
1298 0 0       0 last TRY_GET if ($response->is_error);
1299 0 0       0 last TRY_GET if ($response->headers->header('Client-Warning') =~ m!redirect loop detected!i);
1300 0 0 0     0 if ($response->is_redirect
1301             ||
1302             # Some engines spoof us with a false 302 code, so look at
1303             # the message rather than the code:
1304             ($response->message =~ m!Object moved!i)
1305             )
1306             {
1307 0         0 my $sURL = $response->request->uri->as_string;
1308 0         0 my $sURLredir = $response->headers->header('Location');
1309             # Low-level loop detection:
1310 0 0       0 last TRY_GET if ($sURLredir eq $sURL);
1311 0 0       0 print STDERR " + 'Object moved' from $sURL to $sURLredir\n" if (2 <= $self->{_debug});
1312             # Follow the redirect:
1313 0         0 $request = new HTTP::Request('GET',
1314             URI->new_abs($sURLredir, $sURL),
1315             );
1316 0         0 $request->referer($sURL);
1317 0 0       0 $self->{'_cookie_jar'}->add_cookie_header($request) if ref($self->{'_cookie_jar'});
1318 0 0       0 print STDERR " + 'Object moved', new HTTP::Request is:\n", $request->as_string if (3 <= $self->{_debug});
1319             # Go back and try again
1320             } # if
1321             } # while infinite
1322             } # if not from_file
1323 0         0 return $response;
1324             } # http_request
1325              
1326             sub _http_request_get_filename {
1327 0     0   0 my $self = shift;
1328 0         0 my $fn;
1329             # filename?
1330 0 0       0 if (!defined($self->{search_filename})) {
1331 0         0 $fn = $self->{search_from_file};
1332 0 0       0 $fn = $self->{search_to_file} if (!defined($fn));
1333 0         0 $self->{search_filename} = WWW::Search::unescape_query($fn);
1334             }
1335 0         0 $fn = $self->{search_filename};
1336 0 0       0 die "$0: bogus filename.\n" if (!defined($fn));
1337 0         0 return $fn;
1338             } # _http_request_get_filename
1339              
1340             sub _http_request_from_file {
1341 0     0   0 my $self = shift;
1342 0         0 my ($url) = @_;
1343              
1344 0         0 my $fn = $self->_http_request_get_filename();
1345              
1346             # read index?
1347 0 0       0 if (!defined($self->{search_from_file_hash})) {
1348 0 0       0 open(TABLE, "<$fn") || die "$0: open $fn failed.\n";
1349 0         0 my $i = 0;
1350 0         0 while () {
1351 0         0 chomp;
1352 0         0 $self->{search_from_file_hash}{$_} = $i;
1353             # print STDERR "$0: file index: $i <$_>\n";
1354 0         0 $i++;
1355             };
1356 0         0 close TABLE;
1357             };
1358             # read file
1359 0         0 my $i = $self->{search_from_file_hash}{$url};
1360 0 0       0 if (defined($i)) {
1361             # print STDERR "$0: saved request <$url> found in $fn.$i\n";
1362             # read the data
1363 0 0       0 open(FILE, "<$fn.$i") || die "$0: open $fn.$i\n";
1364 0         0 my $d = '';
1365 0         0 while () {
1366 0         0 $d .= $_;
1367             };
1368 0         0 close FILE;
1369             # make up the response
1370 0         0 my $r = new HTTP::Response(RC_OK);
1371 0         0 $r->content($d);
1372 0         0 return $r;
1373             } else {
1374 0         0 print STDERR "$0: saved request <$url> not found.\n";
1375 0         0 my $r = new HTTP::Response(RC_NOT_FOUND);
1376 0         0 return $r;
1377             };
1378             } # _http_request_from_file
1379              
1380             sub _http_request_to_file {
1381 0     0   0 my $self = shift;
1382             # The LAST arg is a LWP::Response object:
1383 0         0 my $response = pop;
1384             # The only other arg we care about is the FIRST arg, a url:
1385 0         0 my ($url, ) = @_;
1386              
1387 0         0 my $fn = $self->_http_request_get_filename();
1388              
1389             unlink($fn)
1390 0 0       0 if ($self->{search_to_file_index} == 0);
1391 0 0       0 open(TABLE, ">>$fn") || die "$0: open $fn\n";
1392 0         0 print TABLE "$url\n";
1393 0         0 close TABLE;
1394 0         0 my $i = ($self->{search_to_file_index})++;
1395 0 0       0 open (FILE, ">$fn.$i") || die "$0: open $fn.$i\n";
1396 0         0 print FILE $response->content();
1397 0         0 close FILE;
1398             } # _http_request_to_file
1399              
1400              
1401             =item next_url
1402              
1403             Get or set the URL for the next backend request. This can be used to
1404             save the WWW::Search state between sessions (e.g. if you are showing
1405             pages of results to the user in a web browser). Before closing down a
1406             session, save the value of next_url:
1407              
1408             ...
1409             $oSearch->maximum_to_return(10);
1410             while ($oSearch->next_result) { ... }
1411             my $urlSave = $oSearch->next_url;
1412              
1413             Then, when you start up the next session (e.g. after the user clicks
1414             your "next" button), restore this value before calling for the results:
1415              
1416             $oSearch->native_query(...);
1417             $oSearch->next_url($urlSave);
1418             $oSearch->maximum_to_return(20);
1419             while ($oSearch->next_result) { ... }
1420              
1421             WARNING: It is entirely up to you to keep your interface in sync with
1422             the number of hits per page being returned from the backend. And, we
1423             make no guarantees whether this method will work for any given
1424             backend. (Their caching scheme might not enable you to jump into the
1425             middle of a list of search results, for example.)
1426              
1427             =cut
1428              
1429             sub next_url
1430             {
1431 0     0 1 0 return shift->_elem('_next_url', @_);
1432             }
1433              
1434              
1435             =item split_lines
1436              
1437             This internal routine splits data (typically the result of the web
1438             page retrieval) into lines in a way that is OS independent. If the
1439             first argument is a reference to an array, that array is taken to be a
1440             list of possible delimiters for this split. For example, Yahoo.pm
1441             uses

and

  • as "line" delimiters for convenience.
  • 1442              
    1443             =cut
    1444              
    1445             sub split_lines
    1446             {
    1447             # This probably fails on an EBCDIC box where input is in text mode.
    1448             # Too bad Macs do not just use binmode like Windows boxen.
    1449 2     2 1 665 my $self = shift;
    1450 2         3 my $s = shift;
    1451 2         5 my $patt = '\015?\012';
    1452 2 100       6 if (ref($s))
    1453             {
    1454 1         4 $patt = '('. $patt;
    1455 1         4 foreach (@$s)
    1456             {
    1457 1         3 $patt .= "|$_";
    1458             } # foreach
    1459 1         3 $patt .= ')';
    1460             # print STDERR " + patt is >>>$patt<<<\n";
    1461 1         2 $s = shift;
    1462             } # if
    1463 2         55 return split(/$patt/i, $s);
    1464             # If we require perl 5.005, this can be done by:
    1465             # use Socket qw(:crlf :DEFAULT);
    1466             # split(/$CR?$LF/,$_[0])
    1467             } # split_lines
    1468              
    1469              
    1470             =item generic_option
    1471              
    1472             This internal routine checks if an option
    1473             is generic or backend specific.
    1474             Currently all generic options begin with 'search_'.
    1475             This routine is not a method.
    1476              
    1477             =cut
    1478              
    1479             sub generic_option
    1480             {
    1481 9   50 9 1 31 my $option = shift || '';
    1482 9         30 return ($option =~ /^search_/);
    1483             } # generic_option
    1484              
    1485              
    1486             =item _native_setup_search
    1487              
    1488             Do some backend-specific initialization.
    1489             It will be called with the same arguments as native_query().
    1490              
    1491             =cut
    1492              
    1493             sub _native_setup_search
    1494             {
    1495 0     0   0 my $self = shift;
    1496 0 0       0 print STDERR " FFF _n_s_s\n" if (DEBUG_FUNC || $self->{_debug});
    1497             # Backward-compatibility for backends that define the old
    1498             # native_setup_search(), but not the new _native_setup_search()
    1499 0 0       0 if ($self->can('native_setup_search'))
    1500             {
    1501 0         0 return $self->native_setup_search(@_);
    1502             } # if
    1503             } # _native_setup_search
    1504              
    1505              
    1506             =item setup_search
    1507              
    1508             This internal routine does generic Search setup.
    1509             It calls C<_native_setup_search()> to do backend-specific setup.
    1510              
    1511             =cut
    1512              
    1513             sub setup_search
    1514             {
    1515 10     10 1 48 my ($self) = @_;
    1516 10 100       52 print STDERR " FFF setup_search(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug});
    1517 10         32 $self->{cache} = [];
    1518 10         21 $self->{next_to_retrieve} = 1;
    1519 10         21 $self->{number_retrieved} = 0;
    1520 10         49 $self->{state} = SEARCH_UNDERWAY;
    1521             # $self->{_options} = ();
    1522 10         47 $self->_native_setup_search($self->{'native_query'}, $self->{'native_options'});
    1523             } # setup_search
    1524              
    1525              
    1526             =item need_to_delay
    1527              
    1528             A backend should override this method in order to dictate whether
    1529             user_agent_delay() needs to be called before the next HTTP request is
    1530             sent. Return any perlish true or zero value.
    1531              
    1532             =cut
    1533              
    1534             sub need_to_delay
    1535             {
    1536 1     1 1 12 my $self = shift;
    1537             # This is a NOP stub. Unless the subclass overrides this method,
    1538             # there is no reason to delay.
    1539 1         2 return 0;
    1540             } # need_to_delay
    1541              
    1542              
    1543             =item user_agent_delay
    1544              
    1545             According to what need_to_delay() returns,
    1546             user_agent_delay() will be called between requests to remote
    1547             servers to avoid overloading them with many back-to-back requests.
    1548              
    1549             =cut
    1550              
    1551             sub user_agent_delay
    1552             {
    1553 2     2 1 42 my $self = shift;
    1554             # Sleep for some number of seconds:
    1555 2         500743 select(undef, undef, undef, $self->{interrequest_delay});
    1556             } # user_agent_delay
    1557              
    1558              
    1559             =item absurl
    1560              
    1561             An internal routine to convert a relative URL into a absolute URL. It
    1562             takes two arguments, the 'base' url (usually the search engine CGI
    1563             URL) and the URL to be converted. Returns a URI object.
    1564              
    1565             =cut
    1566              
    1567             sub absurl
    1568             {
    1569 3     3 1 74 my ($self, $base, $url) = @_;
    1570 3   100     21 $base ||= '';
    1571 3   100     13 $url ||= '';
    1572             # print STDERR " + this is WWW::Search::absurl($base,$url)\n" if 1 < $self->{_debug};
    1573 3 100       14 $base = $self->{_prev_url} if ($base eq '');
    1574 3 100 66     17 return '' unless (($url ne '') && ($base ne ''));
    1575 1         14 my $link = URI->new_abs($url, $base);
    1576 1         6354 return $link;
    1577             } # absurl
    1578              
    1579              
    1580             =item retrieve_some
    1581              
    1582             An internal routine to interface with C<_native_retrieve_some()>.
    1583             Checks for overflow.
    1584              
    1585             =cut
    1586              
    1587             sub retrieve_some
    1588             {
    1589 8     8 1 17 my $self = shift;
    1590 8 50       36 print STDERR " FFF retrieve_some(", $self->{'native_query'}, ")\n" if (DEBUG_FUNC || $self->{_debug});
    1591 8 50       25 return undef if ($self->{state} == SEARCH_DONE);
    1592 8 100       47 $self->setup_search() if ($self->{state} == SEARCH_BEFORE);
    1593 8         20 $self->{state} = SEARCH_RETRIEVING;
    1594 8 100       25 if (! $self->{'_allow_empty_query'})
    1595             {
    1596 5 50       14 if (! defined($self->{'native_query'}))
    1597             {
    1598 0         0 $self->{response} = new HTTP::Response(500, "query string is not defined");
    1599 0         0 $self->{state} = SEARCH_DONE;
    1600 0         0 return;
    1601             } # if
    1602 5 100       19 if ($self->{'native_query'} eq '')
    1603             {
    1604 3         20 $self->{response} = new HTTP::Response(500, "query string is empty");
    1605 3         201 $self->{state} = SEARCH_DONE;
    1606 3         8 return;
    1607             } # if
    1608             } # if
    1609             # Got enough already?
    1610 5 50       19 if ($self->{number_retrieved} >= $self->{'maximum_to_retrieve'})
    1611             {
    1612 0 0       0 print STDERR " DDD retrieve_some() got enough already\n" if (DEBUG_RETR || $self->{_debug});
    1613 0         0 $self->{state} = SEARCH_DONE;
    1614 0         0 return;
    1615             } # if
    1616             # Spinning our wheels?
    1617 5 50       29 if ($self->{requests_made} > $self->{'maximum_to_retrieve'})
    1618             {
    1619 0 0       0 print STDERR " DDD retrieve_some() too many requests\n" if (DEBUG_RETR || $self->{_debug});
    1620 0         0 $self->{state} = SEARCH_DONE;
    1621 0         0 return;
    1622             } # if
    1623             # Need more results:
    1624 5   50     28 my $res = $self->_native_retrieve_some() || 0;
    1625 5 50       43 print STDERR " + _native_retrieve_some() returned $res\n" if (DEBUG_RETR || $self->{_debug});
    1626 5         18 $self->{requests_made}++;
    1627 5         11 $self->{number_retrieved} += $res;
    1628 5 50       21 $self->{state} = SEARCH_DONE if ($res == 0);
    1629 5         18 return $res;
    1630             } # retrieve_some
    1631              
    1632              
    1633             sub HTML::TreeBuilder::www_search_reset
    1634             {
    1635             # If a reset() method becomes available in HTML::TreeBuilder, we
    1636             # won't need this any more.
    1637 0     0 0 0 my $self = shift;
    1638 0         0 $self->delete;
    1639             # These 4 lines copied from HTML::TreeBuilder::new
    1640 0         0 $self->{'_head'} = $self->insert_element('head',1);
    1641 0         0 $self->{'_pos'} = undef; # pull it back up
    1642 0         0 $self->{'_body'} = $self->insert_element('body',1);
    1643 0         0 $self->{'_pos'} = undef; # pull it back up again
    1644             } # HTML::TreeBuilder::www_search_reset
    1645              
    1646              
    1647             =item _native_retrieve_some
    1648              
    1649             Fetch the next page of results from the web engine, parse the results,
    1650             and prepare for the next page of results.
    1651              
    1652             If a backend defines this method, it is in total control of the WWW
    1653             fetch, parsing, and preparing for the next page of results. See the
    1654             WWW::Search::AltaVista module for example usage of the
    1655             _native_retrieve_some method.
    1656              
    1657             An easier way to achieve this in a backend is to inherit
    1658             _native_retrieve_some from WWW::Search, and do only the HTML parsing.
    1659             Simply define a method _parse_tree which takes one argument, an
    1660             HTML::TreeBuilder object, and returns an integer, the number of
    1661             results found on this page. See the WWW::Search::Yahoo module for
    1662             example usage of the _parse_tree method.
    1663              
    1664             A backend should, in general, define either _parse_tree() or
    1665             _native_retrieve_some(), but not both.
    1666              
    1667             Additional features of the default _native_retrieve_some method:
    1668              
    1669             Sets $self->{_prev_url} to the URL of the page just retrieved.
    1670              
    1671             Calls $self->preprocess_results_page() on the raw HTML of the page.
    1672              
    1673             Then, parses the page with an HTML::TreeBuilder object and passes that
    1674             populated object to $self->_parse_tree().
    1675              
    1676             Additional notes on using the _parse_tree method:
    1677              
    1678             The built-in HTML::TreeBuilder object used to parse the page has
    1679             store_comments turned ON. If a backend needs to use a subclassed or
    1680             modified HTML::TreeBuilder object, the backend should set
    1681             $self->{'_treebuilder'} to that object before any results are
    1682             retrieved. The best place to do this is at the end of
    1683             _native_setup_search.
    1684              
    1685             my $oTree = new myTreeBuilder;
    1686             $oTree->store_pis(1); # for example
    1687             $self->{'_treebuilder'} = $oTree;
    1688              
    1689             When _parse_tree() is called, the $self->next_url is cleared.
    1690             During parsing, the backend should set $self->next_url to the appropriate URL for the next page of results.
    1691             (If _parse_tree() does not set the value, the search will end after parsing this page of results.)
    1692              
    1693             When _parse_tree() is called, the URL for the page being parsed can be
    1694             found in $self->{_prev_url}.
    1695              
    1696             =cut
    1697              
    1698             sub _parse_tree
    1699             {
    1700 1     1   8 my $self = shift;
    1701             # This is a NOP stub. Backend MUST define their own parse function!
    1702 1 50       7 print STDERR " FFF stub _parse_tree\n" if (DEBUG_FUNC || $self->{_debug});
    1703             # This is for backward-compatibility, for backends that define the
    1704             # old parse_tree(), but not the new _parse_tree():
    1705 1 50       20 return $self->parse_tree(@_) if $self->can('parse_tree');
    1706 1         2 return 0;
    1707             } # _parse_tree
    1708              
    1709              
    1710             sub _native_retrieve_some
    1711             {
    1712 0     0   0 my $self = shift;
    1713 0 0       0 if ($self->can('native_retrieve_some'))
    1714             {
    1715             # This is for backward-compatibility, for backends that define the
    1716             # old native_retrieve_some(), but not the new
    1717             # _native_retrieve_some():
    1718 0         0 return $self->native_retrieve_some(@_);
    1719             } # if
    1720 0 0       0 print STDERR " FFF _n_r_s\n" if (DEBUG_FUNC || $self->{_debug});
    1721             # Fast exit if already done:
    1722 0 0       0 return if (!defined($self->{_next_url}));
    1723 0 0       0 return if ($self->{_next_url} eq q{});
    1724             # If this is not the first page of results, sleep so as to not
    1725             # overload the server:
    1726 0   0     0 $self->{_next_to_retrieve} ||= 1;
    1727             $self->user_agent_delay if (
    1728             (1 < $self->{_next_to_retrieve})
    1729 0 0 0     0 ||
    1730             $self->need_to_delay
    1731             );
    1732             # Get one page of results:
    1733 0 0       0 print STDERR " + submitting URL (", $self->{'_next_url'}, ")\n" if $self->{_debug};
    1734 0         0 my $response = $self->http_request($self->http_method, $self->{'_next_url'});
    1735 0 0       0 print STDERR " + got response\n", $response->headers->as_string, "\n" if 2 <= $self->{_debug};
    1736 0         0 $self->{_prev_url} = $self->{_next_url};
    1737             # Assume there are no more results, unless we find out otherwise
    1738             # when we parse the html:
    1739 0         0 $self->{_next_url} = undef;
    1740 0         0 $self->{response} = $response;
    1741             # print STDERR " --- HTTP response is:\n", $response->as_string if 4 < $self->{_debug};
    1742 0 0       0 if (! $response->is_success)
    1743             {
    1744 0 0       0 if ($self->{_debug})
    1745             {
    1746 0         0 print STDERR " --- HTTP request failed, response is:\n", $response->as_string;
    1747             } # if
    1748 0         0 return;
    1749             } # if
    1750             # Pre-process the output:
    1751 0         0 my $sPage = $self->preprocess_results_page($response->content);
    1752             # Parse the output:
    1753 0         0 my $tree;
    1754 0 0       0 if (ref $self->{'_treebuilder'})
    1755             {
    1756             # print STDERR " + using existing _treebuilder\n" if 1 < $self->{_debug};
    1757             # Assume that the backend has installed their own TreeBuilder
    1758 0         0 $tree = $self->{'_treebuilder'};
    1759             }
    1760             else
    1761             {
    1762             # print STDERR " + creating new _treebuilder\n" if 1 < $self->{_debug};
    1763 0         0 $tree = HTML::TreeBuilder->new(
    1764             # use all default options
    1765             );
    1766 0         0 $tree->store_comments('yes');
    1767 0         0 $self->{'_treebuilder'} = $tree;
    1768             }
    1769             # If a reset() method becomes available in HTML::TreeBuilder, we can
    1770             # change this:
    1771 0         0 $tree->www_search_reset;
    1772             # print STDERR " + parsing content, tree is ", Dumper(\$tree) if 1 < $self->{_debug};
    1773             # use Encode;
    1774             # my $sPageOctets = Encode::encode_utf8($sPage);
    1775 0         0 $tree->utf8_mode('true');
    1776 0         0 $tree->parse($sPage);
    1777             # print STDERR " + done parsing content.\n" if 1 < $self->{_debug};
    1778 0         0 $tree->eof();
    1779 0 0       0 print STDERR " + calling _parse_tree...\n" if (1 < $self->{_debug});
    1780 0         0 return $self->_parse_tree($tree);
    1781             } # _native_retrieve_some
    1782              
    1783              
    1784             =item result_as_HTML
    1785              
    1786             Given a WWW::SearchResult object, formats it human-readable with HTML.
    1787              
    1788             =cut
    1789              
    1790             sub result_as_HTML
    1791             {
    1792             # print STDERR " DDD r_as_H(@_)\n";
    1793 6     6 1 327 my $self = shift;
    1794 6 100       27 my $oSR = shift or return '';
    1795 3 100       21 return '' unless (ref($oSR) =~ m'WWW::Search(::)?Result');
    1796 1         10 my $o = new CGI;
    1797 1         376 return join('',
    1798             $o->a(
    1799             { href => $oSR->url, },
    1800             $oSR->title,
    1801             ),
    1802             $o->br,
    1803             $oSR->description,
    1804             );
    1805             } # result_as_HTML
    1806              
    1807              
    1808             =item preprocess_results_page
    1809              
    1810             A filter on the raw HTML of the results page.
    1811             This allows the backend to alter the HTML before it is parsed,
    1812             such as to correct for known problems, HTML that can not be parsed correctly, etc.
    1813              
    1814             Takes one argument, a string (the HTML webpage);
    1815             returns one string (the same HTML, modified).
    1816              
    1817             This method is called from within _native_retrieve_some (above)
    1818             before the HTML of the page is parsed.
    1819              
    1820             See the WWW::Search::Ebay distribution 2.07 or higher for example
    1821             usage.
    1822              
    1823             =cut
    1824              
    1825             sub preprocess_results_page
    1826             {
    1827             # Here is just a stub. Return our argument without modification.
    1828 2     2 1 13 my $self = shift;
    1829 2         4 return shift;
    1830             } # preprocess_results_page
    1831              
    1832              
    1833             =item test_cases (DEPRECATED)
    1834              
    1835             Deprecated.
    1836              
    1837             Returns the value of the $TEST_CASES variable of the backend engine.
    1838              
    1839             =cut
    1840              
    1841             sub test_cases
    1842             {
    1843 1     1 1 8 my $self = shift;
    1844 1         53 return eval '$'.ref($self).'::TEST_CASES';
    1845             } # test_cases
    1846              
    1847             =item hash_to_cgi_string (DEPRECATED)
    1848              
    1849             Given a reference to a hash of string => string, constructs a CGI
    1850             parameter string that looks like 'key1=value1&key2=value2'.
    1851              
    1852             If the value is undef, the key will not be added to the string.
    1853              
    1854             At one time, for testing purposes, we asked backends to use this
    1855             function rather than piecing the URL together by hand, to ensure that
    1856             URLs are identical across platforms and software versions. But this
    1857             is no longer necessary.
    1858              
    1859             Example:
    1860              
    1861             $self->{_options} = {
    1862             'opt3' => 'val3',
    1863             'search_url' => 'http://www.deja.com/dnquery.xp',
    1864             'opt1' => 'val1',
    1865             'QRY' => $native_query,
    1866             'opt2' => 'val2',
    1867             };
    1868             $self->{_next_url} = $self->{_options}{'search_url'} .'?'.
    1869             $self->hash_to_cgi_string($self->{_options});
    1870              
    1871             =cut
    1872              
    1873             sub hash_to_cgi_string
    1874             {
    1875 2     2 1 20 my $self = shift;
    1876             # Because of the design of our test suite, we need our generated
    1877             # URLs to be identical on all systems, all versions of perl. Ergo
    1878             # we must explicitly control the order in which our CGI parameter
    1879             # strings are cobbled together. For now, I assume sorting the hash
    1880             # keys will suffice.
    1881 2         4 my $rh = shift;
    1882 2         3 my $ret = '';
    1883 2         15 foreach my $key (sort keys %$rh)
    1884             {
    1885             # printf STDERR "option: $key is " . $rh->{$key} . "\n";
    1886 4 50       10 next if generic_option($key);
    1887             # Throw out keys with undef values.
    1888 4 100       11 next unless defined($rh->{$key});
    1889             # If we want to let the user delete options, uncomment the next
    1890             # line. (They can still blank them out, which may or may not have
    1891             # the same effect):
    1892             # next unless $rh->{$key} ne '';
    1893 3         10 $ret .= $key .'='. $rh->{$key} .'&';
    1894             } # foreach $key
    1895             # Remove the trailing '&':
    1896 2         5 chop $ret;
    1897 2         6 return $ret;
    1898             } # hash_to_cgi_string
    1899              
    1900              
    1901             =back
    1902              
    1903             =head1 IMPLEMENTING NEW BACKENDS
    1904              
    1905             C supports backends to separate search engines. Each
    1906             backend is implemented as a subclass of C.
    1907             L provides a good sample backend.
    1908              
    1909             A backend must have the routine C<_native_setup_search()>. A backend
    1910             must have the routine C<_native_retrieve_some()> or C<_parse_tree()>.
    1911              
    1912             C<_native_setup_search()> is invoked before the search. It is passed
    1913             a single argument: the escaped, native version of the query.
    1914              
    1915             C<_native_retrieve_some()> is the core of a backend. It will be called
    1916             periodically to fetch URLs. It should retrieve several hits from the
    1917             search service and add them to the cache. It should return the number
    1918             of hits found, or undef when there are no more hits.
    1919              
    1920             Internally, C<_native_retrieve_some()> typically sends an HTTP request to
    1921             the search service, parses the HTML, extracts the links and
    1922             descriptions, then saves the URL for the next page of results. See
    1923             the code for the C module for an example.
    1924              
    1925             Alternatively, a backend can define the method C<_parse_tree()> instead
    1926             of C<_native_retrieve_some()>. See the C module for a
    1927             good example.
    1928              
    1929             If you implement a new backend, please let the authors know.
    1930              
    1931              
    1932             =head1 BUGS AND DESIRED FEATURES
    1933              
    1934             The bugs are there for you to find (some people call them Easter Eggs).
    1935              
    1936             Desired features:
    1937              
    1938             =over 4
    1939              
    1940             =item A portable query language.
    1941              
    1942             A portable language would easily allow you to move queries easily
    1943             between different search engines. A query abstraction is non-trivial
    1944             and unfortunately will not be done any time soon by the current
    1945             maintainer. If you want to take a shot at it, please let me know.
    1946              
    1947             =back
    1948              
    1949             =head1 AUTHOR
    1950              
    1951             John Heidemann
    1952             Maintained by Martin Thurn, C, L.
    1953              
    1954             =head1 COPYRIGHT
    1955              
    1956             Copyright (c) 1996 University of Southern California.
    1957             All rights reserved.
    1958              
    1959             Redistribution and use in source and binary forms are permitted
    1960             provided that the above copyright notice and this paragraph are
    1961             duplicated in all such forms and that any documentation, advertising
    1962             materials, and other materials related to such distribution and use
    1963             acknowledge that the software was developed by the University of
    1964             Southern California, Information Sciences Institute. The name of the
    1965             University may not be used to endorse or promote products derived from
    1966             this software without specific prior written permission.
    1967              
    1968             THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
    1969             WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
    1970             MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
    1971              
    1972             =cut
    1973              
    1974              
    1975             1;
    1976              
    1977             __END__