line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::OpenSearch::Federated; |
2
|
1
|
|
|
1
|
|
77514
|
use Moo; |
|
1
|
|
|
|
|
67481
|
|
|
1
|
|
|
|
|
14
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
our $VERSION = '0.007'; |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# Read/write attributes that have no default value.
#   debug            - when true, diagnostic messages are emitted via warn
#   urls             - array reference of endpoint URLs queried by search()
#   total            - set by _aggregate(): sum of the per-response totals
#   facets           - set by _aggregate(): merged facet counts
#   subtotals        - set by _aggregate(): total reported per request URI
#   timeout          - when true, passed to the LWP::UserAgent timeout
#   normalize_scores - when true, scores are normalized per response
has [
    qw(
        debug
        urls
        total
        facets
        subtotals
        timeout
        normalize_scores
        )
] => ( is => 'rw' );

# Names of the entry methods called on every feed entry when an XML
# response is parsed.
has fields => (
    is      => 'rw',
    default => sub {
        return [
            'title', 'id',   'author', 'link',
            'summary', 'tags', 'modified',
        ];
    },
);

# Defaults to this module's $VERSION.
has version => (
    is      => 'rw',
    default => sub { return $VERSION },
);
18
|
|
|
|
|
|
|
|
19
|
1
|
|
|
1
|
|
3059
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
267
|
|
20
|
1
|
|
|
1
|
|
2888
|
use Data::Dump qw( dump ); |
|
1
|
|
|
|
|
30279
|
|
|
1
|
|
|
|
|
138
|
|
21
|
1
|
|
|
1
|
|
2765
|
use Parallel::Iterator qw( iterate_as_array ); |
|
1
|
|
|
|
|
75873
|
|
|
1
|
|
|
|
|
102
|
|
22
|
1
|
|
|
1
|
|
1936
|
use JSON; |
|
1
|
|
|
|
|
34704
|
|
|
1
|
|
|
|
|
6
|
|
23
|
1
|
|
|
1
|
|
4425
|
use LWP::UserAgent; |
|
1
|
|
|
|
|
139937
|
|
|
1
|
|
|
|
|
138
|
|
24
|
1
|
|
|
1
|
|
15
|
use Scalar::Util qw( blessed ); |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
195
|
|
25
|
1
|
|
|
1
|
|
5485
|
use Search::Tools::XML; |
|
1
|
|
|
|
|
189898
|
|
|
1
|
|
|
|
|
79
|
|
26
|
1
|
|
|
1
|
|
1607
|
use Data::Transformer; |
|
1
|
|
|
|
|
924
|
|
|
1
|
|
|
|
|
39
|
|
27
|
1
|
|
|
1
|
|
1070
|
use Normalize; |
|
1
|
|
|
|
|
5404
|
|
|
1
|
|
|
|
|
75
|
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# we do not use WWW::OpenSearch because we need to pull out |
30
|
|
|
|
|
|
|
# some non-standard data from the XML. |
31
|
|
|
|
|
|
|
# we do use XML::Feed to parse XML responses. |
32
|
1
|
|
|
1
|
|
931
|
use XML::Simple; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
use XML::Feed; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Namespace URI used to look up the OpenSearch 'totalResults' element.
my $OS_NS = 'http://a9.com/-/spec/opensearch/1.1/';

# Shared escaper used for every value copied out of an XML response.
my $XMLer = Search::Tools::XML->new();

# Transformer applied to nested structures: its 'normal' callback receives
# a reference to a value and overwrites that value with its escaped form.
my $XML_ESCAPER = Data::Transformer->new(
    normal => sub {
        my ($value_ref) = @_;
        ${$value_ref} = $XMLer->escape( ${$value_ref} );
    },
);
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# Query every configured endpoint in parallel and merge the responses.
#
# Reads the 'urls' attribute, fetches each URL with _fetch() through
# Parallel::Iterator's iterate_as_array(), and passes the collected
# responses to _aggregate().
#
# Returns whatever _aggregate() returns.
# Croaks when 'urls' is unset or is not an array reference.
sub search {
    my $self = shift;

    my $urls = $self->{urls} or croak "no urls defined";

    # Fail early with a readable message instead of a strict-refs error
    # from the array dereference further down.
    ( Scalar::Util::reftype($urls) || '' ) eq 'ARRAY'
        or croak "urls must be an array reference";

    # The worker hands its second argument to _fetch() as the URL.
    my @done = iterate_as_array(
        sub {
            $self->_fetch( $_[1] );
        },
        $urls,
    );

    return $self->_aggregate( \@done );
}
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# Follow @path through nested hash references starting at $node.
# Returns the value reached, or nothing as soon as a level is not a hash
# reference. Always call in scalar context.
sub _dig {
    my ( $node, @path ) = @_;
    for my $key (@path) {
        return unless ref($node) eq 'HASH';
        $node = $node->{$key};
    }
    return $node;
}

# Add a list of { term => ..., count => ... } hashes for facet $name into
# the running totals kept in $facets ( name => { term => count } ).
sub _add_facet_counts {
    my ( $facets, $name, $facet_list ) = @_;
    for my $facet ( @{ $facet_list || [] } ) {
        $facets->{$name}->{ $facet->{term} } += $facet->{count};
    }
    return;
}

# Decode one JSON response. Facet counts are merged into $facets.
# Returns ( \@results, $total ), where $total is the response's 'total'
# value exactly as decoded (possibly undef).
sub _collect_json_response {
    my ( $resp, $facets ) = @_;

    my $r = decode_json( $resp->content );
    my @found = $r->{results} ? @{ $r->{results} } : ();

    # must turn facets inside out in order
    # to aggregate counts correctly
    if ( $r->{facets} ) {
        for my $name ( keys %{ $r->{facets} } ) {
            _add_facet_counts( $facets, $name, $r->{facets}->{$name} );
        }
    }

    return ( \@found, $r->{total} );
}

# Parse one XML response with XML::Feed. Facet counts are merged into
# $facets. $fields lists the entry methods to copy into each result.
# Returns ( \@results, $total ), or an empty list (after warning with the
# XML::Feed error) when the feed cannot be parsed.
sub _collect_xml_response {
    my ( $resp, $fields, $facets ) = @_;

    my $xml  = $resp->content;
    my $feed = XML::Feed->parse( \$xml );
    if ( !$feed ) {
        warn XML::Feed->errstr;
        return;
    }

    #
    # we must re-escape the XML content since the feed parser
    # and XML::Simple will unescape values automatically
    #
    my @entries;
    for my $item ( $feed->entries ) {
        my $e = {};
        for my $f (@$fields) {
            my $value = $item->$f;
            if ( blessed($value) ) {
                if ( $value->isa('XML::Feed::Content') ) {
                    $value = $XMLer->escape( $value->body );
                }
                elsif ( $value->isa('DateTime') ) {
                    $value = $value->epoch;
                }

                # any other object is stored untouched
            }
            else {
                $value = $XMLer->escape($value);
            }
            $e->{$f} = $value;
        }

        # fields carried inside the entry content override the
        # feed-level fields of the same name
        my $content        = $item->content;
        my $content_fields = XMLin( $content->body, NoAttr => 1 );
        if ( ref($content_fields) eq 'HASH' ) {
            for my $f ( keys %$content_fields ) {
                $e->{$f} = $content_fields->{$f};
                if ( ref $e->{$f} ) {
                    $XML_ESCAPER->traverse( $e->{$f} );
                }
                else {
                    $e->{$f} = $XMLer->escape( $e->{$f} );
                }
            }
        }

        # massage some field names
        $e->{mtime} = delete $e->{modified};
        $e->{uri}   = delete $e->{id};

        push @entries, $e;
    }

    # facets require digging into the raw xml
    my $xml_feed   = XMLin( $feed->as_xml, NoAttr => 1 );
    my $facet_feed = _dig( $xml_feed, qw( category sos facets ) );

    # must turn facets inside out in order
    # to aggregate counts correctly
    if ( ref($facet_feed) eq 'HASH' ) {
        for my $name ( keys %$facet_feed ) {
            my $group = _dig( $facet_feed, $name, $name );
            if ( ref($group) eq 'ARRAY' ) {
                _add_facet_counts( $facets, $name, $group );
            }
            elsif ( ref($group) eq 'HASH' ) {

                # a facet with a single term arrives as a lone hash; it
                # must be added to, not replace, counts already collected
                # from earlier responses
                _add_facet_counts( $facets, $name, [$group] );
            }
        }
    }

    my $atom       = $feed->{atom};
    my $this_total = $atom->get( $OS_NS, 'totalResults' );

    return ( \@entries, $this_total );
}

# Merge the responses collected by search() into one result set.
#
# $responses is an array reference of response objects; each must provide
# request->uri, code, content_type and content.
#
# Responses whose status code does not start with 2 are skipped, as are
# XML responses that fail to parse. A response with any content type
# other than application/json or application/xml makes the method croak.
#
# Side effects: sets $self->{facets} (name => array of { term, count }),
# $self->{total} (sum of all response totals) and $self->{subtotals}
# (request URI => that response's total).
#
# Returns an array reference of result hashes sorted by descending score.
sub _aggregate {
    my $self      = shift;
    my $responses = shift;
    my $results   = [];
    my $fields    = $self->fields;
    my $total     = 0;
    my %subtotals = ();
    my %facets    = ();

RESP: for my $resp (@$responses) {

        my $req_uri     = $resp->request->uri;
        my $resp_status = $resp->code;
        $self->debug
            and warn
            sprintf( "response for %s = %s\n", $req_uri, $resp_status );
        next RESP unless $resp_status =~ m/^2/;

        my $content_type = $resp->content_type;
        my ( $found, $this_total );

        if ( $content_type eq 'application/json' ) {
            ( $found, $this_total )
                = _collect_json_response( $resp, \%facets );
        }
        elsif ( $content_type eq 'application/xml' ) {
            ( $found, $this_total )
                = _collect_xml_response( $resp, $fields, \%facets );
            next RESP unless $found;
        }
        else {
            croak sprintf( "Unsupported response type '%s' for %s\n",
                $content_type, $req_uri );
        }

        # a missing total counts as zero in the sum but is recorded as-is
        $total += $this_total || 0;
        $subtotals{$req_uri} = $this_total;

        # temporary buffer to allow for normalizing scores
        my @resp_results = @$found;

        # normalize scores
        if ( $self->normalize_scores && @resp_results ) {
            my $normalizer = Normalize->new( 'round_to' => 0.001 );
            my %normalized = ();
            my $i          = 0;

            # compute
            for my $r (@resp_results) {
                $normalized{ $i++ } = $r->{score};
            }
            $normalizer->normalize_to_max( \%normalized );

            # apply
            for my $idx ( keys %normalized ) {
                $resp_results[$idx]->{score} = ( $normalized{$idx} * 1000 );
            }
        }

        # aggregate
        push @$results, @resp_results;
    }

    # transform facets back into arrays of count/term pairs
    my %facets_norm;
    for my $name ( keys %facets ) {
        my $count_for = $facets{$name};
        $facets_norm{$name} = [
            map { +{ term => $_, count => $count_for->{$_} } }
                keys %$count_for
        ];
    }
    $self->{facets}    = \%facets_norm;
    $self->{total}     = $total;
    $self->{subtotals} = \%subtotals;

    # results without a score sort as zero, without an undef warning
    return [
        sort { ( $b->{score} || 0 ) <=> ( $a->{score} || 0 ) } @$results
    ];
}
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# Issue one HTTP GET request and return the response object.
#
# $target is the URL to request; croaks with "url required" when it is
# missing or false. A new LWP::UserAgent is built for every call, with its
# agent string set to "sos-fedsearch <VERSION>" and, when the object's
# 'timeout' is true, its timeout set to that value. When 'debug' is true
# the status line of the response is reported via warn.
sub _fetch {
    my ( $self, $target ) = @_;
    croak "url required" unless $target;

    my $agent = LWP::UserAgent->new();
    $agent->agent( 'sos-fedsearch ' . $VERSION );
    if ( $self->{timeout} ) {
        $agent->timeout( $self->{timeout} );
    }

    my $reply = $agent->get($target);

    if ( $self->debug ) {
        warn "got response for $target: " . $reply->status_line;
    }

    return $reply;
}
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
1; |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
__END__ |