File Coverage

blib/lib/URI/ParseSearchString.pm
Criterion Covered Total %
statement 57 62 91.9
branch 20 28 71.4
condition 10 15 66.6
subroutine 10 11 90.9
pod 6 6 100.0
total 103 122 84.4


line stmt bran cond sub pod time code
1             package URI::ParseSearchString;
2              
3             require Exporter;
4             @ISA = (Exporter);
5             @EXPORT = ( qw (parse_search_string findEngine se_host se_name se_term) );
6              
7 3     3   30280 use warnings;
  3         8  
  3         116  
8 3     3   19 use strict;
  3         4  
  3         100  
9 3     3   2832 use URI;
  3         18044  
  3         109  
10 3     3   34686 use Data::Dumper;
  3         23182  
  3         12954  
11              
12             =encoding utf8
13              
14             =head1 NAME
15              
16             URI::ParseSearchString - parse search engine referrer URLs and extract keywords used
17              
18             =head1 VERSION
19              
20             Version 3.51 (Diablo 3 edition)
21              
22             =cut
23              
24             our $VERSION = '3.51';
25              
26             =head1 SYNOPSIS
27              
28             use URI::ParseSearchString ;
29              
30             my $uparse = new URI::ParseSearchString();
31             my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';
32            
33             my $query_terms = $uparse->se_term( $ref );
34             my $canonical = $uparse->se_name( $ref );
35             my $hostname = $uparse->se_host( $ref );
36              
37             =head1 FUNCTIONS
38              
39             =head2 new
40            
41             Creates a new instance object of the module.
42            
43             my $uparse = new URI::ParseSearchString() ;
44              
45             =cut
46              
47             my $RH_LOOKUPS = {
48            
49             'answers.yahoo.com' => { name => 'Yahoo Answers', q=>'p' },
50            
51             'sapo.pt' => { name => 'Pesquisa SAPO', q => 'q'},
52             'iol.pt' => { name => 'Pesquisa Iol', q => 'q'},
53             'pesquisa.clix.pt' => { name => 'Pesquisa Clix', q => 'question'},
54             'aeiou.pt' => { name => 'Aeiou', q => 'q'},
55             'cuil.pt' => { name => 'Cuil PT', q => 'q' },
56              
57            
58             'fotos.sapo.pt' => { name => 'SAPO fotos', q => 'word'},
59             'videos.sapo.pt' => { name => 'SAPO videos', q => 'word'},
60             'sabores.sapo.pt' => { name => 'SAPO sabores', q => 'cxSearch'},
61             'jn.sapo.pt' => { name => 'Jornal Noticias', q => 'Pesquisa'},
62             'dn.sapo.pt' => { name => 'Diario Noticias', q => 'Pesquisa'},
63            
64            
65             'rtp.pt' => { name => 'Rtp', q => 'search'},
66             'record.pt' => { name => 'Jornal Record', q => 'q'},
67             'correiodamanha.pt' => { name => 'Correio da Manha', q => 'pesquisa'},
68             'correiomanha.pt' => { name => 'Correio Manha', q => 'pesquisa'},
69             'publico.clix.pt' => { name => 'Publico', q => 'q'},
70             'xl.pt' => { name => 'XL', q => 'pesquisa'},
71            
72             'abacho.com' => { name => 'Abacho', q => 'q'},
73             'alice.it' => { name => 'Alice.it', q => 'qs' },
74             'altavista.com' => { name => 'Altavista', q => 'q' },
75             'aolsearch.aol.com' => { name => 'AOL Search', q => 'query' },
76             'as.starware.com' => { name => 'Starware', q => 'qry' },
77             'blogs.icerocket.com' => { name => 'IceRocket', q => 'q' },
78             'blogsearch.google.com' => { name => 'Google Blogsearch', q => 'q' },
79             'busca.orange.es' => { name => 'Orange ES', q => 'buscar' },
80             'buscador.lycos.es' => { name => 'Lycos ES', q => 'query' },
81             'buscador.terra.es' => { name => 'Terra ES', q => 'query' },
82             'buscar.ozu.es' => { name => 'Ozu ES', q => 'q' },
83             'categorico.it' => { name => 'Categorico IT', q => 'q' },
84             'cuil.com' => { name => 'Cuil', q => 'q' },
85             'clusty.com' => { name => 'Clusty', q => 'query' },
86             'excite.com' => { name => 'Excite', q => 'q' },
87             'excite.it' => { name => 'Excite IT', q => 'q' },
88             'fastweb.it' => { name => 'Fastweb IT', q => 'q' },
89             'fastbrowsersearch.com' => { name => 'Fastbrowsersearch', q=> 'q' },
90             'godado.com' => { name => 'Godado', q => 'key' },
91             'godado.it' => { name => 'Godado (IT)', q => 'key' },
92             'gps.virgin.net' => { name => 'Virgin Search', q => 'q' },
93             'ilmotore.com' => { name => 'ilMotore', q => 'query' },
94             'ithaki.net' => { name => 'Ithaki', q => 'query' },
95             'kataweb.it' => { name => 'Kataweb IT', q => 'q' },
96             'libero.it' => { name => 'Libero IT', q => 'query' },
97             'lycos.it' => { name => 'Lycos IT', q => 'query' },
98             'search.aol.co.uk' => { name => 'AOL UK', q => 'query' },
99             'search.arabia.msn.com' => { name => 'MSN Arabia', q => 'q' },
100             'search.bbc.co.uk' => { name => 'BBC Search', q => 'q' },
101             'search.conduit.com' => { name => 'Conduit', q => 'q' },
102             'search.icq.com' => { name => 'ICQ dot com', q => 'q' },
103             'search.live.com' => { name => 'Live.com', q => 'q' },
104             'search.lycos.co.uk' => { name => 'Lycos UK', q => 'query' },
105             'search.lycos.com' => { name => 'Lycos', q => 'query' },
106             'search.msn.co.uk' => { name => 'MSN UK', q => 'q' },
107             'search.msn.com' => { name => 'MSN', q => 'q' },
108             'search.myway.com' => { name => 'MyWay', q => 'searchfor' },
109             'search.mywebsearch.com' => { name => 'My Web Search', q => 'searchfor' },
110             'search.ntlworld.com' => { name => 'NTLWorld', q => 'q' },
111             'search.orange.co.uk' => { name => 'Orange Search', q => 'q' },
112             'search.prodigy.msn.com' => { name => 'MSN Prodigy', q => 'q' },
113             'search.sweetim.com' => { name => 'Sweetim', q => 'q' },
114             'search.virginmedia.com' => { name => 'VirginMedia', q => 'q' },
115             'search.yahoo.co.jp' => { name => 'Yahoo Japan', q => 'p' },
116             'search.yahoo.com' => { name => 'Yahoo!', q => 'p' },
117             'search.yahoo.jp' => { name => 'Yahoo! Japan', q => 'p' },
118             'simpatico.ws' => { name => 'Simpatico IT', q => 'query' },
119             'soso.com' => { name => 'Soso', q => 'w' },
120             'suche.fireball.de' => { name => 'Fireball DE', q => 'query' },
121             'suche.web.de' => { name => 'Suche DE', q => 'su' },
122             'suche.t-online.de' => { name => 'T-Online', q => 'q' },
123             'thespider.it' => { name => 'TheSpider IT', q => 'q' },
124             'uk.altavista.com' => { name => 'Altavista UK', q => 'q' },
125             'uk.ask.com' => { name => 'Ask UK', q => 'q' },
126             'uk.search.yahoo.com' => { name => 'Yahoo! UK', q => 'p' },
127             'alltheweb.com' => { name => 'AllTheWeb', q => 'q' },
128             'ask.com' => { name => 'Ask dot com', q => 'q' },
129             'blueyonder.co.uk' => { name => 'Blueyonder', q => 'q' },
130             'feedster.com' => { name => 'Feedster', q => 'q' },
131             'google.ad' => { name => 'Google Andorra',q => 'q' },
132             'google.ae' => { name => 'Google United Arab Emirates', q => 'q' },
133             'google.af' => { name => 'Google Afghanistan', q => 'q' },
134             'google.ag' => { name => 'Google Antiqua and Barbuda', q => 'q' },
135             'google.am' => { name => 'Google Armenia', q => 'q' },
136             'google.as' => { name => 'Google American Samoa', q => 'q' },
137             'google.at' => { name => 'Google Austria', q => 'q' },
138             'google.az' => { name => 'Google Azerbaijan', q => 'q' },
139             'google.ba' => { name => 'Google Bosnia and Herzegovina', q => 'q' },
140             'google.be' => { name => 'Google Belgium', q => 'q' },
141             'google.bg' => { name => 'Google Bulgaria',q => 'q' },
142             'google.bi' => { name => 'Google Burundi', q => 'q' },
143             'google.biz' => { name => 'Google dot biz', q => 'q' },
144             'google.bo' => { name => 'Google Bolivia', q => 'q' },
145             'google.bs' => { name => 'Google Bahamas', q => 'q' },
146             'google.bz' => { name => 'Google Belize', q => 'q' },
147             'google.ca' => { name => 'Google Canada', q => 'q' },
148             'google.cc' => { name => 'Google Cocos Islands', q => 'q' },
149             'google.cd' => { name => 'Google Dem Rep of Congo', q => 'q' },
150             'google.cg' => { name => 'Google Rep of Congo', q => 'q' },
151             'google.ch' => { name => 'Google Switzerland', q => 'q' },
152             'google.ci' => { name => 'Google Cote dIvoire', q => 'q' },
153             'google.cl' => { name => 'Google Chile', q => 'q' },
154             'google.cn' => { name => 'Google China', q => 'q' },
155             'google.co.at' => { name => 'Google Austria', q => 'q' },
156             'google.co.bi' => { name => 'Google Burundi', q => 'q' },
157             'google.co.bw' => { name => 'Google Botswana', q => 'q' },
158             'google.co.ci' => { name => 'Google Ivory Coast', q => 'q' },
159             'google.co.ck' => { name => 'Google Cook Islands', q => 'q' },
160             'google.co.cr' => { name => 'Google Costa Rica', q => 'q' },
161             'google.co.gg' => { name => 'Google Guernsey', q => 'q' },
162             'google.co.gl' => { name => 'Google Greenland', q => 'q' },
163             'google.co.gy' => { name => 'Google Guyana', q => 'q' },
164             'google.co.hu' => { name => 'Google Hungary', q => 'q' },
165             'google.co.id' => { name => 'Google Indonesia', q => 'q' },
166             'google.co.il' => { name => 'Google Israel', q => 'q' },
167             'google.co.im' => { name => 'Google Isle of Man', q => 'q' },
168             'google.co.in' => { name => 'Google India', q => 'q' },
169             'google.co.it' => { name => 'Google Italy', q => 'q' },
170             'google.co.je' => { name => 'Google Jersey', q => 'q' },
171             'google.co.jp' => { name => 'Google Japan', q => 'q' },
172             'google.co.ke' => { name => 'Google Kenya', q => 'q' },
173             'google.co.kr' => { name => 'Google South Korea', q => 'q' },
174             'google.co.ls' => { name => 'Google Lesotho', q => 'q' },
175             'google.co.ma' => { name => 'Google Morocco', q => 'q' },
176             'google.co.mu' => { name => 'Google Mauritius', q => 'q' },
177             'google.co.mw' => { name => 'Google Malawi', q => 'q' },
178             'google.co.nz' => { name => 'Google New Zeland', q => 'q' },
179             'google.co.pn' => { name => 'Google Pitcairn Islands', q => 'q' },
180             'google.co.th' => { name => 'Google Thailand', q => 'q' },
181             'google.co.tt' => { name => 'Google Trinidad and Tobago', q => 'q' },
182             'google.co.ug' => { name => 'Google Uganda', q => 'q' },
183             'google.co.uk' => { name => 'Google UK', q => 'q' },
184             'google.co.uz' => { name => 'Google Uzbekistan', q => 'q' },
185             'google.co.ve' => { name => 'Google Venezuela', q => 'q' },
186             'google.co.vi' => { name => 'Google US Virgin Islands', q => 'q' },
187             'google.co.za' => { name => 'Google South Africa',q => 'q' },
188             'google.co.zm' => { name => 'Google Zambia', q => 'q' },
189             'google.co.zw' => { name => 'Google Zimbabwe', q => 'q' },
190             'google.com' => { name => 'Google', q => 'q' },
191             'google.com.af' => { name => 'Google Afghanistan', q => 'q' },
192             'google.com.ag' => { name => 'Google Antiqua and Barbuda', q => 'q' },
193             'google.com.ai' => { name => 'Google Anguilla', q => 'q' },
194             'google.com.ar' => { name => 'Google Argentina', q => 'q' },
195             'google.com.au' => { name => 'Google Australia', q => 'q' },
196             'google.com.az' => { name => 'Google Azerbaijan', q => 'q' },
197             'google.com.bd' => { name => 'Google Bangladesh', q => 'q' },
198             'google.com.bh' => { name => 'Google Bahrain', q => 'q' },
199             'google.com.bi' => { name => 'Google Burundi', q => 'q' },
200             'google.com.bn' => { name => 'Google Brunei Darussalam', q => 'q' },
201             'google.com.bo' => { name => 'Google Bolivia', q => 'q' },
202             'google.com.br' => { name => 'Google Brazil', q => 'q' },
203             'google.com.bs' => { name => 'Google Bahamas', q => 'q' },
204             'google.com.bz' => { name => 'Google Belize', q => 'q' },
205             'google.com.cn' => { name => 'Google China', q => 'q' },
206             'google.com.co' => { name => 'Google', q => 'q' },
207             'google.com.cu' => { name => 'Google Cuba', q => 'q' },
208             'google.com.do' => { name => 'Google Dominican Rep', q => 'q' },
209             'google.com.ec' => { name => 'Google Ecuador', q => 'q' },
210             'google.com.eg' => { name => 'Google Egypt', q => 'q' },
211             'google.com.et' => { name => 'Google Ethiopia', q => 'q' },
212             'google.com.fj' => { name => 'Google Fiji', q => 'q' },
213             'google.com.ge' => { name => 'Google Georgia', q => 'q' },
214             'google.com.gh' => { name => 'Google Ghana', q => 'q' },
215             'google.com.gi' => { name => 'Google Gibraltar', q => 'q' },
216             'google.com.gl' => { name => 'Google Greenland', q => 'q' },
217             'google.com.gp' => { name => 'Google Guadeloupe', q => 'q' },
218             'google.com.gr' => { name => 'Google Greece', q => 'q' },
219             'google.com.gt' => { name => 'Google Guatemala', q => 'q' },
220             'google.com.gy' => { name => 'Google Guyana', q => 'q' },
221             'google.com.hk' => { name => 'Google Hong Kong', q => 'q' },
222             'google.com.hn' => { name => 'Google Honduras', q => 'q' },
223             'google.com.hr' => { name => 'Google Croatia', q => 'q' },
224             'google.com.jm' => { name => 'Google Jamaica', q => 'q' },
225             'google.com.jo' => { name => 'Google Jordan', q => 'q' },
226             'google.com.kg' => { name => 'Google Kyrgyzstan', q => 'q' },
227             'google.com.kh' => { name => 'Google Cambodia', q => 'q' },
228             'google.com.ki' => { name => 'Google Kiribati', q => 'q' },
229             'google.com.kz' => { name => 'Google Kazakhstan', q => 'q' },
230             'google.com.lk' => { name => 'Google Sri Lanka', q => 'q' },
231             'google.com.lv' => { name => 'Google Latvia', q => 'q' },
232             'google.com.ly' => { name => 'Google Libya', q => 'q' },
233             'google.com.mt' => { name => 'Google Malta', q => 'q' },
234             'google.com.mu' => { name => 'Google Mauritius', q => 'q' },
235             'google.com.mw' => { name => 'Google Malawi', q => 'q' },
236             'google.com.mx' => { name => 'Google Mexico', q => 'q' },
237             'google.com.my' => { name => 'Google Malaysia', q => 'q' },
238             'google.com.na' => { name => 'Google Namibia', q => 'q' },
239             'google.com.nf' => { name => 'Google Norfolk Island', q => 'q' },
240             'google.com.ng' => { name => 'Google Nigeria', q => 'q' },
241             'google.com.ni' => { name => 'Google Nicaragua', q => 'q' },
242             'google.com.np' => { name => 'Google Nepal', q => 'q' },
243             'google.com.nr' => { name => 'Google Nauru', q => 'q' },
244             'google.com.om' => { name => 'Google Oman', q => 'q' },
245             'google.com.pa' => { name => 'Google Panama', q => 'q' },
246             'google.com.pe' => { name => 'Google Peru', q => 'q' },
247             'google.com.ph' => { name => 'Google Philipines', q => 'q' },
248             'google.com.pk' => { name => 'Google Pakistan', q => 'q' },
249             'google.com.pl' => { name => 'Google Poland', q => 'q' },
250             'google.com.pr' => { name => 'Google Puerto Rico', q => 'q' },
251             'google.com.pt' => { name => 'Google Portugal', q => 'q' },
252             'google.com.py' => { name => 'Google Paraguay', q => 'q' },
253             'google.com.qa' => { name => 'Google', q => 'q' },
254             'google.com.ru' => { name => 'Google Russia', q => 'q' },
255             'google.com.sa' => { name => 'Google Saudi Arabia', q => 'q' },
256             'google.com.sb' => { name => 'Google Solomon Islands', q => 'q' },
257             'google.com.sc' => { name => 'Google Seychelles', q => 'q' },
258             'google.com.sg' => { name => 'Google Singapore', q => 'q' },
259             'google.com.sv' => { name => 'Google El Savador', q => 'q' },
260             'google.com.tj' => { name => 'Google Tajikistan', q => 'q' },
261             'google.com.tr' => { name => 'Google Turkey', q => 'q' },
262             'google.com.tt' => { name => 'Google Trinidad and Tobago', q => 'q' },
263             'google.com.tw' => { name => 'Google Taiwan', q => 'q' },
264             'google.com.ua' => { name => 'Google Ukraine', q => 'q' },
265             'google.com.uy' => { name => 'Google Uruguay', q => 'q' },
266             'google.com.uz' => { name => 'Google Uzbekistan', q => 'q' },
267             'google.com.ve' => { name => 'Google Venezuela', q => 'q' },
268             'google.com.vi' => { name => 'Google US Virgin Islands', q => 'q' },
269             'google.com.vn' => { name => 'Google Vietnam', q => 'q' },
270             'google.com.ws' => { name => 'Google Samoa', q => 'q' },
271             'google.cz' => { name => 'Google Czech Rep', q => 'q' },
272             'google.de' => { name => 'Google Germany', q => 'q' },
273             'google.dj' => { name => 'Google Djubouti', q => 'q' },
274             'google.dk' => { name => 'Google Denmark', q => 'q' },
275             'google.dm' => { name => 'Google Dominica', q => 'q' },
276             'google.ec' => { name => 'Google Ecuador', q => 'q' },
277             'google.ee' => { name => 'Google Estonia', q => 'q' },
278             'google.es' => { name => 'Google Spain', q => 'q' },
279             'google.fi' => { name => 'Google Finland', q => 'q' },
280             'google.fm' => { name => 'Google Micronesia', q => 'q' },
281             'google.fr' => { name => 'Google France', q => 'q' },
282             'google.gd' => { name => 'Google Grenada', q => 'q' },
283             'google.ge' => { name => 'Google Georgia', q => 'q' },
284             'google.gf' => { name => 'Google French Guiana', q => 'q' },
285             'google.gg' => { name => 'Google Guernsey', q => 'q' },
286             'google.gl' => { name => 'Google Greenland', q => 'q' },
287             'google.gm' => { name => 'Google Gambia', q => 'q' },
288             'google.gp' => { name => 'Google Guadeloupe', q => 'q' },
289             'google.gr' => { name => 'Google Greece', q => 'q' },
290             'google.gy' => { name => 'Google Guyana', q => 'q' },
291             'google.hk' => { name => 'Google Hong Kong', q => 'q' },
292             'google.hn' => { name => 'Google Honduras', q => 'q' },
293             'google.hr' => { name => 'Google Croatia', q => 'q' },
294             'google.ht' => { name => 'Google Haiti', q => 'q' },
295             'google.hu' => { name => 'Google Hungary', q => 'q' },
296             'google.ie' => { name => 'Google Ireland', q => 'q' },
297             'google.im' => { name => 'Google Isle of Man', q => 'q' },
298             'google.in' => { name => 'Google India', q => 'q' },
299             'google.info' => { name => 'Google dot info', q => 'q' },
300             'google.is' => { name => 'Google Iceland', q => 'q' },
301             'google.it' => { name => 'Google Italy', q => 'q' },
302             'google.je' => { name => 'Google Jersey', q => 'q' },
303             'google.jo' => { name => 'Google Jordan', q => 'q' },
304             'google.jobs' => { name => 'Google dot jobs', q => 'q' },
305             'google.jp' => { name => 'Google Japan', q => 'q' },
306             'google.kg' => { name => 'Google Kyrgyzstan', q => 'q' },
307             'google.ki' => { name => 'Google Kiribati', q => 'q' },
308             'google.kz' => { name => 'Google Kazakhstan', q => 'q' },
309             'google.la' => { name => 'Google Laos', q => 'q' },
310             'google.li' => { name => 'Google Liechtenstein', q => 'q' },
311             'google.lk' => { name => 'Google Sri Lanka', q => 'q' },
312             'google.lt' => { name => 'Google Lithuania', q => 'q' },
313             'google.lu' => { name => 'Google Luxembourg', q => 'q' },
314             'google.lv' => { name => 'Google Latvia', q => 'q' },
315             'google.ma' => { name => 'Google Morocco', q => 'q' },
316             'google.md' => { name => 'Google Moldova', q => 'q' },
317             'google.mn' => { name => 'Google Mongolia', q => 'q' },
318             'google.mobi' => { name => 'Google dot mobi', q => 'q' },
319             'google.ms' => { name => 'Google Montserrat', q => 'q' },
320             'google.mu' => { name => 'Google Mauritius', q => 'q' },
321             'google.mv' => { name => 'Google Maldives', q => 'q' },
322             'google.mw' => { name => 'Google Malawi', q => 'q' },
323             'google.net' => { name => 'Google dot net', q => 'q' },
324             'google.nf' => { name => 'Google Norfolk Island', q => 'q' },
325             'google.nl' => { name => 'Google Netherlands', q => 'q' },
326             'google.no' => { name => 'Google Norway', q => 'q' },
327             'google.nr' => { name => 'Google Nauru', q => 'q' },
328             'google.nu' => { name => 'Google Niue', q => 'q' },
329             'google.off.ai' => { name => 'Google Anguilla', q => 'q' },
330             'google.ph' => { name => 'Google Philipines', q => 'q' },
331             'google.pk' => { name => 'Google Pakistan', q => 'q' },
332             'google.pl' => { name => 'Google Poland', q => 'q' },
333             'google.pn' => { name => 'Google Pitcairn Islands', q => 'q' },
334             'google.pr' => { name => 'Google Puerto Rico', q => 'q' },
335             'google.pt' => { name => 'Google Portugal', q => 'q' },
336             'google.ro' => { name => 'Google Romania', q => 'q' },
337             'google.ru' => { name => 'Google Russia', q => 'q' },
338             'google.rw' => { name => 'Google Rwanda', q => 'q' },
339             'google.sc' => { name => 'Google Seychelles', q => 'q' },
340             'google.se' => { name => 'Google Sweden', q => 'q' },
341             'google.sg' => { name => 'Google Singapore', q => 'q' },
342             'google.sh' => { name => 'Google Saint Helena', q => 'q' },
343             'google.si' => { name => 'Google Slovenia', q => 'q' },
344             'google.sk' => { name => 'Google Slovakia', q => 'q' },
345             'google.sm' => { name => 'Google San Marino', q => 'q' },
346             'google.sn' => { name => 'Google Senegal', q => 'q' },
347             'google.sr' => { name => 'Google Suriname', q => 'q' },
348             'google.st' => { name => 'Google Sao Tome', q => 'q' },
349             'google.tk' => { name => 'Google Tokelau', q => 'q' },
350             'google.tm' => { name => 'Google Turkmenistan', q => 'q' },
351             'google.to' => { name => 'Google Tonga', q => 'q' },
352             'google.tp' => { name => 'Google East Timor', q => 'q' },
353             'google.tt' => { name => 'Google Trinidad and Tobago', q => 'q' },
354             'google.tv' => { name => 'Google Tuvalu', q => 'q' },
355             'google.tw' => { name => 'Google Taiwan', q => 'q' },
356             'google.ug' => { name => 'Google Uganda', q => 'q' },
357             'google.us' => { name => 'Google US', q => 'q' },
358             'google.uz' => { name => 'Google Uzbekistan', q => 'q' },
359             'google.vg' => { name => 'Google British Virgin Islands', q => 'q' },
360             'google.vn' => { name => 'Google Vietnam', q => 'q' },
361             'google.vu' => { name => 'Google Vanuatu', q => 'q' },
362             'google.ws' => { name => 'Google Samoa', q => 'q' },
363             'hotbot.com' => { name => 'HotBot', q => 'query' },
364             'in.gr' => { name => 'In GR', q => 'q' },
365             'mamma.com' => { name => 'Mamma', q => 'query' },
366             'mahalo.com' => { name => 'Mahalo', q => 'search' },
367             'megasearching.net' => { name => 'Megasearching', q => 's' },
368             'mirago.co.uk' => { name => 'Mirago UK', q => 'qry' },
369             'netscape.com' => { name => 'Netscape', q => 's' },
370             'community.paglo.com' => { name => 'Paglo', q => 'q' },
371             'pathfinder.gr' => { name => 'Pathfinder GR', q => 'q' },
372             'phantis.com' => { name => 'Phantis GR' , q => 'q'},
373             'robby.gr' => { name => 'Robby GR' , q => 'searchstr' },
374             'sproose.com' => { name => 'Sproose', q => 'query' },
375             'technorati.com' => { name => 'Technorati', q => 'q' },
376             'tesco.net' => { name => 'Tesco Search', q => 'q' },
377             'tiscali.co.uk' => { name => 'Tiscali UK', q => 'query' },
378             'bing.com' => { name => 'Bing', q => 'q' },
379            
380             'acbusca.com' => { name => 'ACBusca', q => 'query' },
381             'atalhocerto.com.br' => { name => 'Atalho Certo', q => 'keyword' },
382             'bastaclicar.com.br' => { name => 'Basta Clicar', q => 'search' },
383             'bemrapido.com.br' => { name => 'Bem Rapido', q => 'chave' },
384             'br.altavista.com' => { name => 'AltaVista Brasil', q => 'q' },
385             'br.search.yahoo.com' => { name => 'Yahoo Brazil', q => 'p' },
386             'busca.uol.com.br' => { name => 'Radar UOL', q => 'q' },
387             'buscaaqui.com.br' => { name => 'Busca Aqui', q => 'q' },
388             'buscador.terra.com.br' => { name => 'Terra Busca', q => 'query' },
389             'cade.search.yahoo.com' => { name => 'Cadê', q => 'p' },
390             'clickgratis.com.br' => { name => 'Click Gratis', q => 'query' },
391             'entrada.com.br' => { name => 'Entrada', q => 'q' },
392             'gigabusca.com.br' => { name => 'Giga Busca', q => 'what' },
393             'internetica.com.br' => { name => 'Internetica', q => 'busca' },
394             'katatudo.com.br' => { name => 'KataTudo', q => 'q' },
395             'minasplanet.com.br' => { name => 'Minas Planet', q => 'term' },
396             'speedybusca.com.br' => { name => 'SpeedyBusca', q => 'q' },
397             'vaibuscar.com.br' => { name => 'Vai Busca', q => 'q' },
398            
399             'search.conduit.com' => { name => 'Conduit', q=>'q' },
400             'in.search.yahoo.com' => { name => 'Yahoo India', q => 'p' },
401             'rediff.com' => { name => 'Rediff', q => 'MT' },
402             'guruji.com' => { name => 'Guruji', q => 'q' },
403            
404             'isohunt.com' => { name => 'Isohunt', q => 'ihq' },
405             'btjunkie.org' => { name => 'BT Junkie', q => 'q' },
406             'torrentz.eu' => { name => 'Torrentz', q => 'f' }
407            
408             };
409              
410             sub new {
411 1     1 1 820 my $class = shift ;
412 1         2 my $self = { } ;
413 1         3 $self->{engines} = $RH_LOOKUPS;
414 1         4 return bless $self, $class ;
415             }
416              
417             =head2 parse_search_string
418              
419             This module provides a simple function to parse and extract search engine query strings. It was designed and tested having
420             Apache referrer logs in mind. It can be used for a wide number of purposes, including tracking down what keywords people use
421             on popular search engines before they land on a site. Although a number of modules and scripts already exist for this purpose,
422             the majority of them are outdated, using obsolete search strings associated with each engine.
423              
424             The default function exported is "parse_search_string" which accepts an unquoted referrer string as input and returns the
425             search engine query contained within. It currently works with both escaped and un-escaped queries and will translate the search
426             terms before returning them in the former case. The function returns undef in all other cases and on errors.
427              
428             for example:
429              
430             my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';
431             my $terms =
432             $uparse->parse_search_string( $ref );
433              
434             would return I<'a simple test'>
435              
436             whereas
437              
438             my $ref = 'http://www.mamma.com/Mamma?utfout=1&qtype=0&query=a+more%21+complex_+search%24&Submit=%C2%A0%C2%A0Search%C2%A0%C2%A0';
439             my $terms =
440             $uparse->parse_search_string( $ref );
441              
442             would return I<'a more! complex_ search$'>
443              
444             =cut
445              
446             =head2 se_term
447              
448             Same as parse_search_string().
449              
450             =cut
451              
452             sub se_term {
453 124     124 1 67158 my $self = shift ;
454 124         207 my $string = shift ;
455 124 50       321 return unless defined $string ;
456 124         271 return $self->parse_search_string($string) ;
457             }
458              
459             ## internal method for creating a URI object
460              
461             sub _uri {
462 264     264   282 my $self = shift;
463 264         311 my $string = shift;
464            
465 264 50       503 return unless defined($string);
466            
467             ## create a new URI object
468             ## and return unless it is http or https
469            
470 264         961 my $uri = URI->new( $string );
471             return
472 264 100 100     29332 unless (defined($uri)
      33        
473             && (ref($uri) eq 'URI::http' || ref($uri) eq 'URI::https'));
474            
475             ## feedster and technorati do not follow the usual
476             ## query-parameter pattern, so we extract the query
477             ## terms by taking the last element of the path segments
478            
479 260         881 my $host = $uri->host;
480            
481 260 100 100     13750 return unless defined($host) && $host;
482            
483 256 100       870 if ( $host =~ m/(feedster|technorati)\.com$/ ){
484 4         24 $uri->query_form( q => ( $uri->path_segments)[-1]);
485             }
486              
487             ## clean up the host until it matches
488             ## something we already know about
489            
490 256         1399 while( ! defined $self->{'engines'}{ $host }){
491 134         242 my $c = index($host, '.');
492 134 100       270 last if $c <0;
493 132         686 $host= substr($host, $c+1);
494             }
495              
496 256         707 return ($uri, $host);
497            
498             }
499              
500              
## parse_search_string( $string )
##
## Returns the value of the search-term query parameter found in the
## referrer URL $string, using the parameter name stored under the 'q'
## key of the matching entry in $self->{'engines'}.
##
## Returns nothing (bare return) when $string is undefined, when _uri()
## rejects it, or when the host has no engines entry / no 'q' setting.
sub parse_search_string {
    my ( $self, $string ) = @_;
    return unless defined $string;

    my ( $uri, $host ) = $self->_uri( $string );
    return unless defined $uri;

    ## No "www." stripping is done here: _uri() has already reduced $host
    ## to a key of the engines table (or to a dot-free label).  The
    ## statement that used to sit here was a bare pattern match, not a
    ## substitution, so it never modified $host.

    ## Fetch the engine entry first instead of chaining
    ## {engines}{$host}{'q'}: the chained form autovivifies an empty
    ## entry in the engines table for every unknown host.
    my $engine = $self->{'engines'}{$host};
    return unless defined $engine;

    ## name of the query parameter this engine uses for the search terms
    my $q = $engine->{'q'};
    return unless defined $q;

    ## flatten the query string into a hash; for a repeated parameter
    ## the last occurrence wins
    my %h_query = $uri->query_form;

    return $h_query{$q};
}
521              
522             =head2 findEngine
523              
524             Returns a list with the hostname of the search engine as the first element and
525             the canonical name as the second element.
526              
527             my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';
528             my ($hostname, $canonical) = $uparse->findEngine( $ref ) ;
529              
530             This will return 'google.com' as the search engine hostname and 'Google' as the name.
531             This function will return I<undef> on error.
532              
533             =cut
534              
## findEngine( $string )
##
## Returns the list ( $hostname, $canonical ) for the referrer URL
## $string, where $hostname is the host as reduced by _uri() and
## $canonical is the 'name' stored for it in $self->{'engines'}.
## When the reduced host has no engines entry, $canonical is undef.
##
## Returns nothing (bare return) when $string is undefined or when
## _uri() does not deliver both a URI object and a host.
sub findEngine {
    my ( $self, $string ) = @_;

    return unless defined $string;

    ## let _uri() validate the URL and normalise the host
    my ( $uri, $hostname ) = $self->_uri( $string );
    return unless defined($uri)      && $uri;
    return unless defined($hostname) && $hostname;

    ## Look the entry up in two steps: the chained form
    ## {engines}->{$hostname}->{'name'} autovivifies an empty entry in
    ## the engines table whenever $hostname is unknown.
    my $engine    = $self->{'engines'}{$hostname};
    my $canonical = defined $engine ? $engine->{'name'} : undef;

    return ( $hostname, $canonical );
}
551              
552             =head2 se_host
553              
554             Wrapper around findEngine - returns just the hostname.
555             This function will return I<undef> on error.
556              
557             =cut
558              
## se_host( $string )
##
## Convenience wrapper: hands $string to findEngine() and returns only
## the first element of its result (the host name).  Returns nothing
## when $string is undefined.
sub se_host {
    my ( $self, $string ) = @_;

    return if !defined $string;

    ## findEngine() is called in list context; only the host is kept
    my ($host) = $self->findEngine( $string );

    return $host;
}
566              
567             =head2 se_name
568              
569             Wrapper around findEngine - returns just the canonical name.
570             This function will return I<undef> on error.
571              
572             =cut
573              
## se_name( $string )
##
## Convenience wrapper: hands $string to findEngine() and returns only
## the second element of its result (the canonical engine name).
## Returns nothing when $string is undefined.
sub se_name {
    my ( $self, $string ) = @_;

    return if !defined $string;

    ## findEngine() is called in list context; the host is discarded
    my ( undef, $name ) = $self->findEngine( $string );

    return $name;
}
581              
582             =head1 SUPPORTED ENGINES
583              
584             Currently supported search engines include: Sproose, Google Namibia, Google Ivory Coast, Google Oman, Technorati, Google Ecuador,
585             Google Norfolk Island, Mahalo, Google UK, Yahoo! UK, Google Micronesia, Google Bahrain, Basta Clicar,
586             Giga Busca, Google Greece, Google Belgium, Google Egypt, Google Chile, Godado (IT), Google Australia,
587             Google Uruguay, Google India, Google Taiwan, Google Ukraine, Google US, Terra ES,
588             Tesco Search, Megasearching, SAPO videos, Google Nepal, Google Israel, Google US Virgin Islands, Google Hungary,
589             Google San Marino, Google Croatia, Google dot jobs, Google Panama, Google Malaysia, Internetica, Google Brunei Darussalam,
590             Google Denmark, Google Pakistan, Google Solomon Islands, Google dot biz, Google Lesotho, IceRocket, Google Greenland, Fireball DE,
591             Rtp, Google Portugal, Google Samoa, Google Kazakhstan, Google Blogsearch, Google Thailand, Google, Google Antiqua and Barbuda,
592             Google Germany, Google Moldova, Google Zambia, Google Greece, Google Sri Lanka, Google Ireland, Google Austria,
593             Google Peru, Google Guatemala, ICQ dot com, AOL UK, Google Guyana, In GR, Google dot info, MyWay, Pathfinder GR, Google Costa Rica,
594             KataTudo, Google Jamaica, Google Vietnam, Google Morocco, Google Gambia, Google Singapore, Google Mauritius, Altavista, Google Afghanistan,
595             Google Cote dIvoire, Google Kazakhstan, Google Czech Rep, Phantis GR, Google Bahamas, Google United Arab Emirates, Google East Timor, Ozu ES,
596             Google Venezuela, Google Puerto Rico, Google Armenia, Google Croatia, Google Botswana, Google Tuvalu, Ask UK, Google Singapore, Mirago UK,
597             Google Greenland, MSN Arabia, Google Nauru, Publico, Robby GR, Minas Planet, Pesquisa Iol, Google Romania, Google South Korea, Google Jersey,
598             Netscape, Busca Aqui, Google Bulgaria, Google Uzbekistan, Tiscali UK, Ithaki, Cadê, Lycos IT, Google Suriname, Excite IT, Google Hong Kong,
599             Kataweb IT, Google Burundi, Click Gratis, Google Vietnam, MSN, Alice.it, Google Honduras, Google Trinidad and Tobago, Google Uganda, XL,
600             Jornal Noticias, Google Cook Islands, Google Japan, Google Ecuador, Google Ghana, Google Guadeloupe, Google Libya, Google Kenya, Fastbrowsersearch,
601             Aeiou, Google Niue, Jornal Record, HotBot, Google Honduras, Google Georgia, Google Fiji, Google Philipines, BBC Search, Google, Google Laos,
602             Soso, AltaVista Brasil, Lycos UK, SAPO fotos, Ask dot com, Google Netherlands, Google Philipines, Google Trinidad and Tobago, Google Turkey,
603             AllTheWeb, Google Japan, Google Argentina, Google Vanuatu, Blueyonder, Google Greenland, Google Samoa, Google Georgia, Google Slovakia,
604             Google Sri Lanka, Pesquisa SAPO, Google Latvia, Google Latvia, Correio Manha, Terra Busca, Google El Savador, Google Cambodia,
605             Google Mauritius, Google China, AOL Search, Google Tokelau, Google Tonga, Correio da Manha, Radar UOL, Google Jordan, Godado, Google Jordan,
606             Google Pitcairn Islands, Categorico IT, Google Morocco, Google Dominican Rep, Google France, Abacho, Google Azerbaijan, Google Andorra, Google Belize,
607             Google Paraguay, Simpatico IT, Google Ethiopia, Google Uganda, Google Poland, Google Bolivia, Google Hungary, Google Russia, Diario Noticias,
608             Google Puerto Rico, Google Montserrat, Yahoo! Japan, Google Seychelles, Mamma, Google Pitcairn Islands, Google South Africa, Paglo, Google Malta,
609             Google Azerbaijan, Google New Zeland, Google China, Google Norway, Google Bosnia and Herzegovina, Google Indonesia, SpeedyBusca, Entrada, Google Anguilla,
610             Google Rep of Congo, Google Dominica, Google Finland, Altavista UK, Google Guyana, MSN UK, Yahoo Answers, Google British Virgin Islands, Google Guadeloupe,
611             Google Lithuania, Google Antiqua and Barbuda, Google Bahamas, Google Malawi, MSN Prodigy, Bing, Google Bolivia, Google Djubouti, Google Uzbekistan, Fastweb IT,
612             Google Tajikistan, Virgin Search, Google Nigeria, Yahoo Japan, Pesquisa Clix, Google Grenada, Google Haiti, Google American Samoa, Google Pakistan,
613             Google Cocos Islands, Google Hong Kong, NTLWorld, ilMotore, Google Belize, Google Guernsey, Google Sweden, Google Anguilla, Google Bangladesh, Google Isle of Man,
614             Google Guernsey, Google Kyrgyzstan, Google Dem Rep of Congo, Google Malawi, Orange Search, Google Seychelles, Google Guyana, Google Gibraltar,
615             Google Italy, Google Kiribati, TheSpider IT, Google Nicaragua, Google Russia, Google Venezuela, Google Poland, Google Brazil, Google Senegal, Conduit, Lycos,
616             Google Isle of Man, Live.com, Google Italy, Libero IT, Google Canada, Google Nauru, Google Liechtenstein, Google Afghanistan, Cuil, Google Zimbabwe, Google Mauritius,
617             Orange ES, Google Burundi, Google Portugal, ACBusca, Bem Rapido, Atalho Certo, Excite, Clusty, Yahoo Brazil, My Web Search, Google Spain, Google Uzbekistan, Google,
618             Google Mexico, T-Online, Google dot mobi, Google Luxembourg, Google Austria, Yahoo!, Google Kiribati, Sweetim, Vai Busca, Google Mongolia, Google Saudi Arabia, Google dot net,
619             Google Maldives, Google Trinidad and Tobago, Google Jersey, Feedster, Google Turkmenistan, Google Switzerland, Google Norfolk Island, Suche DE, Google Malawi, Google Rwanda,
620             Lycos ES, Google Burundi, Google French Guiana, Google Kyrgyzstan, Google Saint Helena, VirginMedia, Google Iceland, SAPO sabores, Google India, Google Cuba,
621             Google US Virgin Islands, Google Taiwan, Google Sao Tome, Google Slovenia, Starware, Google Estonia, Conduit, Yahoo India, Rediff, Guruji
622              
623             =head1 AUTHOR
624              
625             Spiros Denaxas, C<< <s.denaxas at gmail.com> >>
626              
627             =head1 SOURCE CODE
628              
629             The source code can be found on github L<https://github.com/spiros/URI-ParseSearchString>
630              
631             =head1 BUGS
632              
633             This is my first CPAN module so I encourage you to send all comments, especially bad,
634             to my email address.
635              
636             This could not have been possible without the support of my co-workers at
637             http://nestoria.co.uk - the easiest way of finding UK property.
638              
639             =head1 SUPPORT
640              
641             For more information, you could also visit my blog:
642              
643             http://blog.ffffruit.com
644              
645             =over 4
646              
647             =back
648              
649             =head1 COPYRIGHT & LICENSE
650              
651             Copyright 2011 Spiros Denaxas, all rights reserved.
652              
653             This program is free software; you can redistribute it and/or modify it
654             under the same terms as Perl itself.
655              
656             =cut
657              
658             1; # End of URI::ParseSearchString