File Coverage

blib/lib/URI/ParseSearchString/More.pm
Criterion Covered Total %
statement 28 30 93.3
branch n/a
condition n/a
subroutine 10 10 100.0
pod n/a
total 38 40 95.0


line stmt bran cond sub pod time code
1 4     4   496681 use warnings;
  4         11  
  4         151  
2 4     4   23 use strict;
  4         8  
  4         272  
3              
4             package URI::ParseSearchString::More;
5             $URI::ParseSearchString::More::VERSION = '0.17';
6 4     4   20 use base qw( URI::ParseSearchString );
  4         13  
  4         4297  
7              
8 4     4   107736 use List::Compare ();
  4         98320  
  4         341  
9 4     4   4034 use Params::Validate qw( validate SCALAR );
  4         51283  
  4         387  
10 4     4   35 use Try::Tiny;
  4         8  
  4         190  
11 4     4   22 use URI;
  4         6  
  4         93  
12 4     4   3565 use URI::Heuristic qw(uf_uristr);
  4         10565  
  4         324  
13 4     4   3153 use URI::QueryParam ();
  4         2645  
  4         80  
14 4     4   16811 use WWW::Mechanize::Cached ();
  0            
  0            
15              
16             my %search_regex = (
17             answers => [qr{(.*)}],
18             aol => [qr{(.*) - AOL Search Results}],
19             as => [qr{Starware (.*) Search Results}],
20             dogpile => [qr{(.*) - Dogpile Web Search}],
21             );
22              
23             my %url_regex = (
24             answers => qr{answers.yahoo.com},
25             aol => qr{aol.com/(?:aol|aolcom)/search\?encquery=},
26             as => qr{as.\w+.com/dp/search\?x=},
27             dogpile => qr{http://www.dogpile},
28             );
29              
30             # local.yahoo.com should come before Yahoo as it has different
31             # params
32             # ie. this list is for ordering some of the engines in the regex. if an
33             # engine has no reason to be at a certain point in the list, it should
34             # just be listed in the %query_lookup table
35              
36             my @engines = (
37             'local.google', 'maps.google',
38             'googlesyndication', 'google',
39             'local.yahoo.com', 'search.yahoo.com',
40             'shopping.yahoo.com', 'yahoo',
41             'alltheweb.com', 'errors.aol.com',
42             'sucheaol.aol.de', 'aol',
43             'ask.*', 'fastbrowsersearch.com',
44             'as.*.com', 'att.net',
45             'trustedsearch.com',
46             );
47              
48             my %query_lookup = (
49              
50             'abcsok.no' => ['q'],
51             'about.com' => ['terms'],
52             'alltheweb.com' => ['q'],
53             'answers.com' => ['s'],
54             'aol' => [ 'query', 'q' ],
55             'as.*.com' => ['qry'],
56             'ask.*' => ['q'],
57             'att.net' => [ 'qry', 'q' ],
58             'baidu.com' => ['bs'],
59             'bing.com' => ['q'],
60             'blingo.com' => ['q'],
61             'citysearch.com' => ['query'],
62             'clicknow.org.uk' => ['q'],
63             'clusty.com' => ['query'],
64             'comcast.net' => [ 'query', 'q' ],
65             'cuil.com' => ['q'],
66             'danielsearch.info' => ['q'],
67             'devilfinder.com' => ['q'],
68             'ebay' => ['satitle'],
69             'education.yahoo.com' => ['p'],
70             'errors.aol.com' => ['host'],
71             'excite' => ['search'],
72             'ez4search.com' => ['searchname'],
73             'fastbrowsersearch.com' => ['q'],
74             'fedstats.com' => ['s'],
75             'find.copernic.com' => ['query'],
76             'finna.is' => ['query'],
77             'googlesyndication' => [ 'q', 'ref', 'loc' ],
78             'google' => [ 'q', 'as_q' ],
79             'googel' => ['q'],
80             'hotbot.lycos.com' => ['query'],
81             'isearch.com' => ['Terms'],
82             'local.google' => [ 'q', 'near' ],
83             'local.yahoo.com' => [ 'stx', 'csz' ],
84             'looksmart.com' => ['key'],
85             'lycos' => ['query'],
86             'maps.google' => [ 'q', 'near' ],
87             'msntv.msn.com' => ['q'],
88             'munky.com' => ['term'],
89             'mysearch.com' => ['searchfor'],
90             'mywebsearch.com' => ['searchfor'],
91             'mytelus.com' => ['q'],
92             'netscape.com' => ['query'],
93             'nextag.com' => ['search'],
94             'overture.com' => ['Keywords'],
95             'pricescan.com' => ['SearchString'],
96             'reviews.search.com' => ['q'],
97             'search.com' => ['q'],
98             'searchalot.com' => ['q'],
99             'searchfusion.com' => ['t'],
100             'searchon.ca' => ['Terms'],
101             'search.cnn.com' => ['query'],
102             'search.bearshare.com' => ['q'],
103             'search.comcast.net' => ['q'],
104             'search.dmoz.org' => ['search'],
105             'search.earthlink.net' => ['q'],
106             'search.findsall.info' => ['s'],
107             'search.freeserve.com' => ['q'],
108             'search.freeze.com' => ['Keywords'],
109             'search.go.com' => ['search'],
110             qr/search\d?.incredimail.com/ => ['q'],
111             'search.juno.com' => ['query'],
112             'search.iol.ie' => ['q'],
113             'search.live.com' => ['q'],
114             'search.netzero.net' => ['query'],
115             'search.*.msn.' => ['q'],
116             'search.myway.com' => ['searchfor'],
117             'search.opera.com' => ['search'],
118             'search.rogers.com' => [ 'qf', 'qo' ],
119             'search.rr.com' => ['qs'],
120             'search.start.co.il' => ['q'],
121             'search.starware.com' => ['qry'],
122             'search.sympatico.msn.ca' => ['q'],
123             'search.sweetim.com' => ['q'],
124             'search.usatoday.com' => ['kw'],
125             'search.yahoo.com' => ['va'],
126             'search.virgilio.it' => ['qs'],
127             'search.wanadoo.co.uk' => ['q'],
128             'search.yahoo.com' => [ 'q', 'va', 'p' ],
129             'searchservice.myspace.com' => ['qry'],
130             'shopping.yahoo.com' => ['p'],
131             'start.shaw.ca' => ['q'],
132             'startgoogle.startpagina.nl' => ['q'],
133             'stumbleupon.com' => ['url'],
134             'sucheaol.aol.de' => ['q'],
135             'teoma.com' => ['q'],
136             'toronto.com' => ['query'],
137             'trustedsearch.net' => ['w'],
138             'trustedsearch.com' => ['w'],
139             'yahoo' => ['p'],
140             'yandex.ru' => ['text'],
141             'youtube.com' => ['search_query'],
142             'websearch.cbc.ca' => ['query'],
143             'websearch.cs.com' => ['query'],
144             'webtv.net' => ['q'],
145             'www.bestsearchonearth.info' => ['Keywords'],
146             'www.boat.com' => ['HotKeysTopCategory'],
147             'www.factbites.com' => ['kp'],
148             'www.mweb.co.za' => ['q'],
149             'www.rr.com/html/search.cfm' => ['query'],
150             'www.wotbox.com' => ['q'],
151             );
152              
153             sub parse_search_string {
154             my $self = shift;
155             my $url = shift;
156              
157             foreach my $engine ( keys %url_regex ) {
158              
159             if ( $url =~ $url_regex{$engine} ) {
160              
161             # fix funky URLs
162             $url = uf_uristr( $url );
163              
164             my $mech = $self->get_mech();
165              
166             try { $mech->get( $url ); }
167             catch {
168             warn "Issue with url: $url";
169             warn $_;
170             };
171              
172             if ( $mech->status && $mech->status == 403 ) {
173             warn "403 returned for $url Are you being blocked?";
174             }
175              
176             if ( $mech->title() ) {
177             my $search_term = $self->_apply_regex(
178             string => $mech->title(),
179             engine => $engine,
180             );
181              
182             if ( $search_term ) {
183             $self->{'more'}->{'blame'} = __PACKAGE__;
184             return $search_term;
185             }
186             }
187             }
188             }
189              
190             my $terms = $self->parse_more( $url );
191              
192             if ( $terms ) {
193             $self->{'more'}->{'blame'} = __PACKAGE__;
194             return $terms;
195             }
196              
197             # We've come up empty. Let's see what the superclass can do
198             $self->{'more'}->{'blame'} = 'URI::ParseSearchString';
199             return $self->SUPER::parse_search_string( $url, @_ );
200             }
201              
202             sub se_term {
203             my $self = shift;
204             return $self->parse_search_string( @_ );
205             }
206              
207             sub parse_more {
208             my $self = shift;
209             my $url = shift;
210              
211             die "you need to supply at least one argument" unless $url;
212              
213             $self->{'more'} = undef;
214             $self->{'more'}->{'string'} = $url;
215              
216             my $regex = join " | ", $self->_get_engines;
217             $self->{'more'}->{'regex'} = $regex;
218             $self->{'more'}->{'url'} = $url;
219              
220             if ( $url =~ m{ ( (?: $regex ) .* ?/ ) .* ?\? (.*)\z }xms ) {
221              
222             my $domain = $1;
223             my $query_string = $2;
224              
225             # for some reason, escaped quoted strings were messed up under mod_perl
226             $query_string =~ s{"}{"}gxms;
227             $query_string =~ s{&\#39;}{'}gxms;
228              
229             # remove trailing slash
230             $domain =~ s{/\z}{};
231              
232             my @param_parts = ();
233             my %params = ();
234             my @engines = $self->_get_engines;
235              
236             my $uri = URI->new( $url );
237              
238             ENGINE:
239             foreach my $engine ( @engines ) {
240              
241             if ( $domain =~ /$engine/i ) {
242              
243             my @names = @{ $query_lookup{$engine} };
244              
245             $self->{'more'}->{'domain'} = $domain;
246             $self->{'more'}->{'names'} = \@names;
247              
248             foreach my $name ( @names ) {
249             push @param_parts, $uri->query_param( $name );
250             $params{$name} = $uri->query_param( $name );
251             }
252              
253             last ENGINE;
254             }
255             }
256              
257             my $params = join( " ", @param_parts );
258             my $orig_domain = $domain;
259             $domain =~ s/\/.*//g;
260             unless ( $domain =~ /\w/ ) {
261             $domain = $orig_domain;
262             }
263              
264             $self->{'more'}->{'terms'} = \@param_parts;
265             $self->{'more'}->{'params'} = \%params;
266              
267             return $params;
268             }
269              
270             return;
271             }
272              
273             sub blame {
274             my $self = shift;
275             return $self->{more}->{blame};
276             }
277              
278             sub guess {
279             my $self = shift;
280             my $url = shift || $self->{'more'}->{'string'};
281              
282             my @guesses = ( 'q', 'query', 'searchfor' );
283              
284             my $uri = URI->new( $url );
285             if ( $uri->query_params ) {
286              
287             foreach my $guess ( @guesses ) {
288             if ( $uri->query_param( $guess ) ) {
289             return $uri->query_param( $guess );
290             }
291             }
292             }
293              
294             return;
295             }
296              
297             sub set_cached {
298             my $self = shift;
299             my $switch = shift;
300              
301             if ( $switch ) {
302             $self->{'__more_cached'} = 1;
303             }
304             else {
305             $self->{'__more_cached'} = 0;
306             }
307              
308             return $self->{'__more_cached'};
309             }
310              
311             sub get_cached {
312             my $self = shift;
313              
314             return $self->{'__more_cached'};
315             }
316              
317             sub get_mech {
318             my $self = shift;
319             my $cache = $self->get_cached;
320              
321             if ( $cache ) {
322              
323             if ( !exists $self->{'__more_mech_cached'} ) {
324              
325             my $mech = WWW::Mechanize::Cached->new();
326             $mech->agent( "URI::ParseSearchString::More" );
327             $self->{'__more_mech_cached'} = $mech;
328              
329             }
330              
331             return $self->{'__more_mech_cached'};
332              
333             }
334              
335             # return a non-caching object
336             if ( !exists $self->{'__more_mech'} ) {
337              
338             my $mech = WWW::Mechanize->new();
339             $mech->agent( "URI::ParseSearchString::More" );
340             $self->{'__more_mech'} = $mech;
341              
342             }
343              
344             return $self->{'__more_mech'};
345             }
346              
347             sub _apply_regex {
348             my $self = shift;
349             my %rules = (
350             string => { type => SCALAR },
351             engine => { type => SCALAR },
352             );
353              
354             my %args = validate( @_, \%rules );
355              
356             foreach my $regex ( @{ $search_regex{ $args{'engine'} } } ) {
357             if ( $args{'string'} =~ $regex ) {
358             return $1;
359             }
360             }
361             return;
362             }
363              
364             sub _get_engines {
365             my $lc = List::Compare->new( \@engines, [ keys %query_lookup ] );
366             my @remaining_engines = $lc->get_complement;
367              
368             my @all_engines = @engines;
369             push @all_engines, @remaining_engines;
370              
371             return @all_engines;
372             }
373              
374              
375             1;
376              
377             # ABSTRACT: Extract search strings from more referrers.
378              
379             __END__
380              
381             =pod
382              
383             =encoding UTF-8
384              
385             =head1 NAME
386              
387             URI::ParseSearchString::More - Extract search strings from more referrers.
388              
389             =head1 VERSION
390              
391             version 0.17
392              
393             =head1 SYNOPSIS
394              
395             use URI::ParseSearchString::More;
396             my $more = URI::ParseSearchString::More;
397             my $search_terms = $more->se_term( $search_engine_referring_url );
398              
399             =head1 DESCRIPTION
400              
401             This module is a subclass of L<URI::ParseSearchString>, so you can call any
402             methods on this object that you would call on a URI::ParseSearchString object.
403             This module works a little harder than its SuperClass to get you results. If
404             it fails, it will return to you the results that L<URI::ParseSearchString>
405             would have returned to you anyway, so it should function well as a drop-in
406             replacement.
407              
408             L<WWW::Mechanize> is used to extract search strings from some URLs
409             which contain session info rather than search params. Optionally,
410             L<WWW::Mechanize::Cached> can be used to cache your lookups. There is additional
411             parsing and also a guess() method which will return good results in many cases
412             of doubt.
413              
414             Repository: L<http://github.com/oalders/uri-parsesearchstring-more/tree/master>
415              
416             =head1 USAGE
417              
418             use URI::ParseSearchString::More;
419             my $more = URI::ParseSearchString::More->new;
420             my $search_terms = $more->se_term( $url );
421              
422             =head1 URI::ParseSearchString
423              
424             =head2 parse_search_string( $url )
425              
426             At this point, this is the only "extended" URI::ParseSearchString method.
427             This method performs the following bit of logic:
428              
429             1) If the URL supplied looks to be a search query with session info rather
430             than search data in the URL, it will attempt to access the URL and extract the
431             search terms from the page returned.
432              
433             2) If this returns no results, the URL will be processed by parse_more()
434              
435             3) If there are still no results, the results of URI::ParseSearchString::se_term
436             will be returned.
437              
438             WWW::Mechanize::Cached can be used to speed up your movement through large log
439             files which may contain multiple similar URLs:
440              
441             use URI::ParseSearchString::More;
442             my $more = URI::ParseSearchString::More->new;
443             $more->set_cached( 1 );
444             my $search_terms = $more->se_term( $url );
445              
446             One interesting thing to note is that maps.google.* URLs have 2 important
447             params: "q" and "near". The same can be said for local.google.* I would
448             think the results would be incomplete without including the value of "near" in
449             the search terms for these searches. So, expect the following results:
450              
451             my $url = ""http://local.google.ca/local?sc=1&hl=en&near=Stratford%20ON&btnG=Google%20Search&q=home%20health";
452             my $terms = $more->parse_search_string( $url );
453              
454             # $terms will = "home health Stratford ON"
455              
456             Engines with session info currently supported:
457              
458             aol.com
459              
460             =head2 se_term( $url )
461              
462             A convenience method which calls parse_search_string.
463              
464             =head1 URI::ParseSearchString::More
465              
466             =head2 blame
467              
468             Returns the name of the module that came up with the results on the last
469             string parsed by parse_search_string(). Possible results:
470              
471             URI::ParseSearchString
472             URI::ParseSearchString::More
473              
474             =head2 set_cached( 0|1 )
475              
476             Turn caching off and on. As of version 0.08 caching is OFF by default. See
477             KNOWN ISSUES below for more info on this.
478              
479             =head2 get_cached
480              
481             Returns 1 if caching is currently on, 0 if it is not.
482              
483             =head2 get_mech
484              
485             This gives you direct access to the Mechanize object. If caching is enabled,
486             a L<WWW::Mechanize::Cached> object will be returned. If caching is disabled,
487             a L<WWW::Mechanize> object will be returned.
488              
489             If you know what you're doing, play around with it. Caveat emptor.
490              
491             use URI::ParseSearchString::More;
492             my $more = URI::ParseSearchString::More;
493              
494             my $mech = $more->get_mech();
495             $mech->agent("My Agent Name");
496              
497             my $search_terms = $more->se_term( $search_engine_referring_url );
498              
499             =head2 parse_more( $url )
500              
501             Handles the bulk of More's parsing. This is automatically called (if needed)
502             when you pass a search string to se_term(). However, you may also call it
503             directly. Just keep in mind that this method will NOT try to get results from
504             URI::ParseSearchString if it comes up empty.
505              
506             =head2 guess( $url )
507              
508             For the most part, the parsing that goes on is done with specific search
509             engines (ie. the ones that we already know about) in mind. However, in a lot
510             cases, a good guess is all that you need. For example, a URI which contains
511             a query string with the parameter "q" or "query" is generally the product of
512             a search. If se_term() or parse_more() has come up empty, guess may just
513             provide you with a valid search term. Then again, it may not. Caveat emptor.
514              
515             =head1 TO DO
516              
517             I've pretty much added all of the search engines I care about. If you'd like
518             something added, please get in touch.
519              
520             =head1 NOTES
521              
522             Despite its low version number, this module is now stable.
523              
524             =head1 KNOWN ISSUES
525              
526             As of 0.13 WWW::Mechanize::Cached 1.33 is required. This solves the errors
527             which were being thrown by Storable.
528              
529             =head1 AUTHOR
530              
531             Olaf Alders <olaf@wundercounter.com>
532              
533             =head1 COPYRIGHT AND LICENSE
534              
535             This software is copyright (c) 2012 by Olaf Alders.
536              
537             This is free software; you can redistribute it and/or modify it under
538             the same terms as the Perl 5 programming language system itself.
539              
540             =cut