| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# $Id$ |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# www::google Brik |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
package Metabrik::Www::Google; |
|
7
|
1
|
|
|
1
|
|
780
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
34
|
|
|
8
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
35
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
6
|
use base qw(Metabrik::Client::Www); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
499
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub brik_properties { |
|
13
|
|
|
|
|
|
|
return { |
|
14
|
0
|
|
|
0
|
1
|
|
revision => '$Revision$', |
|
15
|
|
|
|
|
|
|
tags => [ qw(unstable) ], |
|
16
|
|
|
|
|
|
|
author => 'GomoR ', |
|
17
|
|
|
|
|
|
|
license => 'http://opensource.org/licenses/BSD-3-Clause', |
|
18
|
|
|
|
|
|
|
attributes => { |
|
19
|
|
|
|
|
|
|
language => [ qw(fr|uk|de|ch) ], |
|
20
|
|
|
|
|
|
|
page => [ qw(number) ], |
|
21
|
|
|
|
|
|
|
filter => [ qw(0|1) ], |
|
22
|
|
|
|
|
|
|
}, |
|
23
|
|
|
|
|
|
|
attributes_default => { |
|
24
|
|
|
|
|
|
|
language => 'fr', |
|
25
|
|
|
|
|
|
|
page => 1, |
|
26
|
|
|
|
|
|
|
do_javascript => 1, |
|
27
|
|
|
|
|
|
|
filter => 0, |
|
28
|
|
|
|
|
|
|
}, |
|
29
|
|
|
|
|
|
|
commands => { |
|
30
|
|
|
|
|
|
|
search => [ qw(keywords) ], |
|
31
|
|
|
|
|
|
|
}, |
|
32
|
|
|
|
|
|
|
require_modules => { |
|
33
|
|
|
|
|
|
|
'WWW::Mechanize::PhantomJS' => [ ], |
|
34
|
|
|
|
|
|
|
'Metabrik::String::Html' => [ ], |
|
35
|
|
|
|
|
|
|
'Metabrik::String::Uri' => [ ], |
|
36
|
|
|
|
|
|
|
}, |
|
37
|
|
|
|
|
|
|
}; |
|
38
|
|
|
|
|
|
|
} |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Search last 24 hours: &tbs=qdr:d |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# my $url = 'http://www.google.fr/#q=gomor' |
|
43
|
|
|
|
|
|
|
# set client::www do_javascript 1 |
|
44
|
|
|
|
|
|
|
# run client::www get $url |
|
45
|
|
|
|
|
|
|
# my $content = $RUN->{content} |
|
46
|
|
|
|
|
|
|
# run client::www parse $content |
|
47
|
|
|
|
|
|
|
# my $body = $RUN->content |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub search { |
|
50
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
51
|
0
|
|
|
|
|
|
my @args = @_; |
|
52
|
|
|
|
|
|
|
|
|
53
|
0
|
0
|
|
|
|
|
if (@args <= 0) { |
|
54
|
0
|
0
|
|
|
|
|
$self->brik_help_run_undef_arg('search', undef) or return; |
|
55
|
|
|
|
|
|
|
} |
|
56
|
|
|
|
|
|
|
|
|
57
|
0
|
|
|
|
|
|
my $language = $self->language; |
|
58
|
0
|
|
|
|
|
|
my $page = $self->page; |
|
59
|
0
|
|
|
|
|
|
my $filter = $self->filter; |
|
60
|
|
|
|
|
|
|
|
|
61
|
0
|
|
|
|
|
|
my $keywords = join(' ', @args); |
|
62
|
|
|
|
|
|
|
|
|
63
|
0
|
0
|
|
|
|
|
my $si = Metabrik::String::Uri->new_from_brik_init($self) or return; |
|
64
|
0
|
0
|
|
|
|
|
$keywords = $si->encode($keywords) or return; |
|
65
|
|
|
|
|
|
|
|
|
66
|
0
|
0
|
0
|
|
|
|
if ($language eq 'fr' || $language eq 'uk' || $language eq 'de' || $language eq 'ch') { |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
67
|
|
|
|
|
|
|
} |
|
68
|
|
|
|
|
|
|
else { |
|
69
|
0
|
|
|
|
|
|
return $self->log->error("search: unsupported language [$language]"); |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
0
|
|
|
|
|
|
my $cache = { |
|
73
|
|
|
|
|
|
|
fr => 'en cache', |
|
74
|
|
|
|
|
|
|
de => 'im cache', |
|
75
|
|
|
|
|
|
|
ch => 'im cache', |
|
76
|
|
|
|
|
|
|
uk => 'cached', |
|
77
|
|
|
|
|
|
|
}; |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# Google UK is google.co.uk |
|
80
|
0
|
|
|
|
|
|
my $url = 'http://www.google.'.$language.'/#q='; |
|
81
|
0
|
0
|
|
|
|
|
if ($language eq 'uk') { |
|
82
|
0
|
|
|
|
|
|
$url = 'http://www.google.co.uk/#q='; |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
|
|
85
|
0
|
|
|
|
|
|
my $start = ($page - 1); |
|
86
|
0
|
0
|
|
|
|
|
if ($start < 0) { |
|
87
|
0
|
|
|
|
|
|
$start = 0; |
|
88
|
|
|
|
|
|
|
} |
|
89
|
0
|
|
|
|
|
|
$start *= 10; |
|
90
|
0
|
|
|
|
|
|
my $search = $url.$keywords.'&start='.$start.'&filter='.$filter; |
|
91
|
|
|
|
|
|
|
|
|
92
|
0
|
|
|
|
|
|
$self->log->verbose("search: [$search]"); |
|
93
|
|
|
|
|
|
|
|
|
94
|
0
|
0
|
|
|
|
|
my $get = $self->get($search) or return; |
|
95
|
0
|
0
|
|
|
|
|
if ($get->{code} == 200) { |
|
96
|
0
|
0
|
|
|
|
|
my $tree = $self->parse($get->{content}) or return; |
|
97
|
0
|
|
|
|
|
|
my $body = $tree->content; |
|
98
|
|
|
|
|
|
|
|
|
99
|
0
|
|
|
|
|
|
my $r = $self->_traverse($body->[1]); |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# We merge cache stuff within results |
|
102
|
0
|
|
|
|
|
|
my @merged = (); |
|
103
|
0
|
|
|
|
|
|
my $this = {}; |
|
104
|
0
|
|
|
|
|
|
for (@$r) { |
|
105
|
0
|
|
|
|
|
|
$self->log->debug("url: [".$_->{url}."]"); |
|
106
|
0
|
|
|
|
|
|
$self->log->debug("title: [".$_->{title}."]"); |
|
107
|
|
|
|
|
|
|
|
|
108
|
0
|
0
|
|
|
|
|
if ($_->{title} =~ m/^@{[$cache->{$language}]}/i) { |
|
|
0
|
|
|
|
|
|
|
|
109
|
0
|
|
|
|
|
|
$self->log->debug("cache: [".$_->{url}."]"); |
|
110
|
0
|
|
|
|
|
|
$merged[-1]->{cache_url} = $_->{url}; |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
else { |
|
113
|
0
|
|
|
|
|
|
$this->{url} = $_->{url}; |
|
114
|
0
|
|
|
|
|
|
$this->{title} = $_->{title}; |
|
115
|
0
|
|
|
|
|
|
push @merged, $this; |
|
116
|
0
|
|
|
|
|
|
$this = {}; |
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
0
|
|
|
|
|
|
return \@merged; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
0
|
|
|
|
|
|
return $self->log->error("search: unhandled error"); |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub _traverse { |
|
127
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
128
|
0
|
|
|
|
|
|
my ($node) = @_; |
|
129
|
|
|
|
|
|
|
|
|
130
|
0
|
|
|
|
|
|
my @results = (); |
|
131
|
|
|
|
|
|
|
|
|
132
|
0
|
|
|
|
|
|
my @list = $node->content_list; |
|
133
|
0
|
|
|
|
|
|
for my $this (@list) { |
|
134
|
0
|
0
|
|
|
|
|
if (ref($this) eq 'HTML::Element') { |
|
135
|
0
|
|
|
|
|
|
my $tag = $this->tag; |
|
136
|
0
|
0
|
|
|
|
|
if ($tag eq 'a') { |
|
137
|
0
|
|
|
|
|
|
my $h = $self->_href_to_hash($this); |
|
138
|
0
|
0
|
0
|
|
|
|
if ($h && keys %$h > 0) { |
|
139
|
|
|
|
|
|
|
#print Data::Dumper::Dumper($h)."\n"; |
|
140
|
0
|
|
|
|
|
|
push @results, $h; |
|
141
|
|
|
|
|
|
|
} |
|
142
|
0
|
|
|
|
|
|
next; |
|
143
|
|
|
|
|
|
|
} |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Do it recursively |
|
146
|
0
|
|
|
|
|
|
my $new = $self->_traverse($this); |
|
147
|
0
|
|
|
|
|
|
push @results, @$new; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
|
return \@results; |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
sub _href_to_hash { |
|
155
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
156
|
0
|
|
|
|
|
|
my ($element) = @_; |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# /url?q=http://www.justanswer.com/military-law/5ps6l-read-gomor-submitted-rebuttal-go-will.html&sa=U&ved=0ahUKEwi_hP_LgJTPAhVEWRoKHdlaDKQQFghHMAk&usg=AFQjCNGs50hYJHY-aJ6yxYeiP0p5Qd52-A |
|
159
|
0
|
|
|
|
|
|
my $is_incomplete = 0; |
|
160
|
0
|
|
|
|
|
|
my $title = ''; |
|
161
|
0
|
|
|
|
|
|
my $url = ''; |
|
162
|
0
|
|
|
|
|
|
my $href = $element->{href}; |
|
163
|
0
|
0
|
|
|
|
|
if ($href =~ m{^/url\?q=}) { # && $href !~ m{/url\?q=http://webcache.googleusercontent.com/}) { |
|
164
|
0
|
|
|
|
|
|
$url = $href; |
|
165
|
0
|
|
|
|
|
|
$url =~ s{^/url\?q=}{}; |
|
166
|
0
|
|
|
|
|
|
$url =~ s{&sa=.+?$}{}; |
|
167
|
0
|
|
|
|
|
|
my @list = @{$element->content}; |
|
|
0
|
|
|
|
|
|
|
|
168
|
0
|
|
|
|
|
|
for (@list) { |
|
169
|
0
|
0
|
|
|
|
|
if (ref($_) eq 'HTML::Element') { |
|
170
|
0
|
0
|
|
|
|
|
if (defined($_->content)) { |
|
171
|
0
|
|
|
|
|
|
my $txt = join(' ', @{$_->content}); |
|
|
0
|
|
|
|
|
|
|
|
172
|
0
|
|
|
|
|
|
$title .= $txt; |
|
173
|
|
|
|
|
|
|
} |
|
174
|
|
|
|
|
|
|
else { |
|
175
|
0
|
|
|
|
|
|
return {}; |
|
176
|
|
|
|
|
|
|
} |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
else { |
|
179
|
0
|
|
|
|
|
|
$title .= $_; |
|
180
|
|
|
|
|
|
|
} |
|
181
|
|
|
|
|
|
|
} |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
else { |
|
184
|
0
|
|
|
|
|
|
return; |
|
185
|
|
|
|
|
|
|
} |
|
186
|
|
|
|
|
|
|
|
|
187
|
0
|
0
|
|
|
|
|
my $sh = Metabrik::String::Html->new_from_brik_init($self) or return; |
|
188
|
0
|
0
|
|
|
|
|
my $si = Metabrik::String::Uri->new_from_brik_init($self) or return; |
|
189
|
|
|
|
|
|
|
|
|
190
|
0
|
|
|
|
|
|
$title = $sh->decode($title); |
|
191
|
0
|
|
|
|
|
|
$url = $si->decode($url); |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
return { |
|
194
|
0
|
|
|
|
|
|
url => $url, |
|
195
|
|
|
|
|
|
|
title => $title, |
|
196
|
|
|
|
|
|
|
}; |
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
1; |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
__END__ |