File Coverage

blib/lib/Dist/Surveyor/Inquiry.pm
Criterion Covered Total %
statement 71 91 78.0
branch 13 36 36.1
condition 5 7 71.4
subroutine 14 15 93.3
pod 1 1 100.0
total 104 150 69.3


line stmt bran cond sub pod time code
1             package Dist::Surveyor::Inquiry;
2              
3 3     3   458 use strict;
  3         10  
  3         90  
4 3     3   17 use warnings;
  3         5  
  3         75  
5 3     3   1650 use Memoize; # core
  3         6428  
  3         141  
6 3     3   1292 use FindBin;
  3         2890  
  3         120  
7 3     3   18 use Fcntl qw(:DEFAULT :flock); # core
  3         5  
  3         892  
8 3     3   1241 use Dist::Surveyor::DB_File; # internal
  3         10  
  3         152  
9 3     3   1174 use HTTP::Tiny;
  3         101337  
  3         143  
10 3     3   1503 use JSON::MaybeXS qw(JSON decode_json);
  3         25420  
  3         245  
11 3     3   24 use Scalar::Util qw(looks_like_number); # core
  3         9  
  3         298  
12 3     3   752 use Data::Dumper;
  3         8646  
  3         158  
13 3     3   992 use version;
  3         4097  
  3         21  
14              
15             our $VERSION = '0.021';
16              
17             =head1 NAME
18              
19             Dist::Surveyor::Inquiry - Handling the meta-cpan API access for Dist::Surveyor
20              
21             =head1 DESCRIPTION
22              
23             There are a few things that needed to be known in this module:
24              
25             =over
26              
27             =item *
28              
29             $metacpan_size - internally defined global to limit the maximum size of
30             every API call
31              
32             =item *
33              
34             $metacpan_calls - internally defined global counting how many API call happen.
35              
36             =item *
37              
38             This module checks $::DEBUG and $::VERBOSE for obvious proposes.
39              
40             =item *
41              
42             For initating cache-on-disk, call Dist::Surveyor::Inquiry->perma_cache()
43             (this should be usually done, except in testing environment)
44              
45             =back
46              
47             =cut
48              
49             # We have to limit the number of results when using MetaCPAN::API.
50             # We can'r make it too large as it hurts the server (it preallocates)
51             # but need to make it large enough for worst case distros (eg eBay-API).
52             # TODO: switching to the ElasticSearch module, with cursor support, will
53             # probably avoid the need for this. Else we could dynamically adjust.
54             our $metacpan_size = 2500;
55             our $metacpan_calls = 0;
56              
57             our ($DEBUG, $VERBOSE);
58             *DEBUG = \$::DEBUG;
59             *VERBOSE = \$::VERBOSE;
60              
61             require Exporter;
62             our @ISA = qw{Exporter};
63             our @EXPORT = qw{
64             get_candidate_cpan_dist_releases
65             get_candidate_cpan_dist_releases_fallback
66             get_module_versions_in_release
67             get_release_info
68             };
69              
70             my $agent_string = "dist_surveyor/$VERSION";
71              
72             my ($ua, $wget, $curl);
73             if (HTTP::Tiny->can_ssl) {
74             $ua = HTTP::Tiny->new(
75             agent => $agent_string,
76             timeout => 10,
77             keep_alive => 1,
78             );
79             } else { # for fatpacking support
80             require File::Which;
81             require IPC::System::Simple;
82             $wget = File::Which::which('wget');
83             $curl = File::Which::which('curl');
84             }
85              
86             sub _https_request {
87 59     59   1808 my ($method, $url, $headers, $content) = @_;
88 59   100     267 $headers ||= {};
89 59   50     276 $method = uc($method || 'GET');
90 59 50       234 if (defined $ua) {
    0          
    0          
91 59         134 my %options;
92 59 100       270 $options{headers} = $headers if %$headers;
93 59 100       267 $options{content} = $content if defined $content;
94 59         340 my $response = $ua->request($method, $url, \%options);
95 59 50       23213555 unless ($response->{success}) {
96 0 0       0 die "Transport error: $response->{content}\n" if $response->{status} == 599;
97 0         0 die "HTTP error: $response->{status} $response->{reason}\n";
98             }
99 59         1115 return $response->{content};
100             } elsif (defined $wget) {
101 0         0 my @args = ('-q', '-O', '-', '-U', $agent_string, '-T', 10, '--method', $method);
102 0         0 push @args, '--header', "$_: $headers->{$_}" for keys %$headers;
103 0 0       0 push @args, '--body-data', $content if defined $content;
104 0         0 return IPC::System::Simple::capturex($wget, @args, $url);
105             } elsif (defined $curl) {
106 0         0 my @args = ('-s', '-S', '-L', '-A', $agent_string, '--connect-timeout', 10, '-X', $method);
107 0         0 push @args, '-H', "$_: $headers->{$_}" for keys %$headers;
108 0 0       0 push @args, '--data-raw', $content if defined $content;
109 0         0 return IPC::System::Simple::capturex($curl, @args, $url);
110             } else {
111 0         0 die "None of IO::Socket::SSL, wget, or curl are available; cannot make HTTPS requests.";
112             }
113             }
114              
115             # caching via persistent memoize
116              
117             my %memoize_cache;
118             my $locking_file;
119              
120             =head1 CLASS METHODS
121              
122             =head2 Dist::Surveyor::Inquiry->perma_cache()
123              
124             Enable caching to disk of all the MetaCPAN API requests.
125             This cache can grew to be quite big - 40MB is one case, but it worth it,
126             as if you will need to run this program again, it will run much faster.
127              
128             =cut
129              
130             sub perma_cache {
131 0     0 1 0 my $class = shift;
132 0         0 my $db_generation = 3; # XXX increment on incompatible change
133 0         0 my $pname = $FindBin::Script;
134 0         0 $pname =~ s/\..*$//;
135 0         0 my $memoize_file = "$pname-$db_generation.db";
136 0 0       0 open $locking_file, ">", "$memoize_file.lock"
137             or die "Unable to open lock file: $!";
138 0 0       0 flock ($locking_file, LOCK_EX) || die "flock: $!";
139 0 0       0 tie %memoize_cache => 'Dist::Surveyor::DB_File', $memoize_file, O_CREAT|O_RDWR, 0640
140             or die "Unable to use persistent cache: $!";
141             }
142              
143             my @memoize_subs = qw(
144             get_candidate_cpan_dist_releases
145             get_candidate_cpan_dist_releases_fallback
146             get_module_versions_in_release
147             get_release_info
148             );
149             for my $subname (@memoize_subs) {
150             my %memoize_args = (
151             SCALAR_CACHE => [ HASH => \%memoize_cache ],
152             LIST_CACHE => 'FAULT',
153             NORMALIZER => sub { return join("\034", $subname, @_) }
154             );
155             memoize($subname, %memoize_args);
156             }
157              
158             =head1 FUNCTIONS
159              
160             =head2 get_release_info($author, $release)
161              
162             Receive release info, such as:
163              
164             get_release_info('SEMUELF', 'Dist-Surveyor-0.009')
165              
166             Returns a hashref containing all that release meta information, returned by
167             C
168             (but not information on the files inside the module)
169              
170             Dies on HTTP error, and warns on empty response.
171              
172             =cut
173              
174             sub get_release_info {
175             my ($author, $release) = @_;
176             $metacpan_calls++;
177             my $response = _https_request(GET => "https://fastapi.metacpan.org/v1/release/$author/$release");
178             my $release_data = decode_json $response;
179             if (!$release_data or !$release_data->{release}) {
180             warn "Can't find release details for $author/$release - SKIPPED!\n";
181             return; # XXX could fake some of $release_data instead
182             }
183             return $release_data->{release};
184             }
185              
186             =head2 get_candidate_cpan_dist_releases($module, $version, $file_size)
187              
188             Return a hashref containing all the releases that contain this module
189             (with the specific version and file size combination)
190              
191             The keys are the release name (i.e. 'Dist-Surveyor-0.009') and the value
192             is a hashref containing release information and file information:
193              
194             'Dist-Surveyor-0.009' => {
195             # release information
196             'date' => '2013-02-20T06:48:35.000Z',
197             'version' => '0.009',
198             'author' => 'SEMUELF',
199             'version_numified' => '0.009',
200             'release' => 'Dist-Surveyor-0.009',
201             'distribution' => 'Dist-Surveyor',
202             'version_obj' => ,
203              
204             # File information
205             'path' => 'lib/Dist/Surveyor/DB_File.pm',
206             'stat.mtime' => 1361342736,
207             'module.version' => '0.009'
208             'module.version_numified' => '0.009',
209             }
210              
211             =cut
212              
213             sub get_candidate_cpan_dist_releases {
214             my ($module, $version, $file_size) = @_;
215             my $funcstr = "get_candidate_cpan_dist_releases($module, $version, $file_size)";
216              
217             my $version_qual = _prepare_version_query(0, $version);
218              
219             my @and_quals = (
220             {"term" => {"module.name" => $module }},
221             (@$version_qual > 1 ? { "bool" => { "should" => $version_qual } } : $version_qual->[0]),
222             );
223             push @and_quals, {"term" => {"stat.size" => $file_size }}
224             if $file_size;
225              
226             # XXX doesn't cope with odd cases like
227             # http://explorer.metacpan.org/?url=/module/MLEHMANN/common-sense-3.4/sense.pm.PL
228             $metacpan_calls++;
229              
230             my $query = {
231             "size" => $metacpan_size,
232             "query" => { "bool" => {
233             "filter" => \@and_quals,
234             }},
235             "fields" => [qw(
236             release _parent author version version_numified module.version
237             module.version_numified date stat.mtime distribution path
238             )]
239             };
240              
241             my $response = _https_request(POST => 'https://fastapi.metacpan.org/v1/file',
242             { 'Content-Type' => 'application/json;charset=UTF-8' },
243             JSON->new->utf8->canonical->encode($query),
244             );
245             return _process_response($funcstr, $response);
246             }
247              
248             =head2 get_candidate_cpan_dist_releases_fallback($module, $version)
249              
250             Similar to get_candidate_cpan_dist_releases, but getting called when
251             get_candidate_cpan_dist_releases fails for find matching file and release.
252              
253             Maybe the file was tempared somehow, so the file size does not match anymore.
254              
255             =cut
256              
257             sub get_candidate_cpan_dist_releases_fallback {
258             my ($module, $version) = @_;
259              
260             # fallback to look for distro of the same name as the module
261             # for odd cases like
262             # http://explorer.metacpan.org/?url=/module/MLEHMANN/common-sense-3.4/sense.pm.PL
263             (my $distname = $module) =~ s/::/-/g;
264              
265             my $version_qual = _prepare_version_query(1, $version);
266              
267             my @and_quals = (
268             {"term" => {"distribution" => $distname }},
269             (@$version_qual > 1 ? { "bool" => { "should" => $version_qual } } : $version_qual->[0]),
270             );
271              
272             # XXX doesn't cope with odd cases like
273             $metacpan_calls++;
274             my $query = {
275             "size" => $metacpan_size,
276             "query" => { "bool" => {
277             "filter" => \@and_quals,
278             }},
279             "fields" => [qw(
280             release _parent author version version_numified module.version
281             module.version_numified date stat.mtime distribution path)]
282             };
283             my $response = _https_request(POST => 'https://fastapi.metacpan.org/v1/file',
284             { 'Content-Type' => 'application/json;charset=UTF-8' },
285             JSON->new->utf8->canonical->encode($query),
286             );
287             return _process_response("get_candidate_cpan_dist_releases_fallback($module, $version)", $response);
288             }
289              
290             sub _prepare_version_query {
291 7     7   26 my ($is_fallback, $version) = @_;
292 7 50       29 $version = 0 if not defined $version; # XXX
293 7 50       38 my ($v_key, $num_key) =
294             $is_fallback
295             ? qw{ version version_numified }
296             : qw{ module.version module.version_numified };
297              
298             # timbunce: So, the current situation is that: version_numified is a float
299             # holding version->parse($raw_version)->numify, and version is a string
300             # also holding version->parse($raw_version)->numify at the moment, and
301             # that'll change to ->stringify at some point. Is that right now?
302             # mo: yes, I already patched the indexer, so new releases are already
303             # indexed ok, but for older ones I need to reindex cpan
304 7 100 66     85 my $v = (ref $version && $version->isa('version')) ? $version : version->parse($version);
305 7         86 my %v = map { $_ => 1 } "$version", $v->stringify, $v->numify;
  21         69  
306 7         24 my @version_qual;
307             push @version_qual, { term => { $v_key => $_ } }
308 7         60 for keys %v;
309             push @version_qual, { term => { $num_key => $_ }}
310 7         25 for grep { looks_like_number($_) } keys %v;
  11         75  
311 7         32 return \@version_qual;
312             }
313              
314             sub _process_response {
315 7     7   28 my ($funcname, $response) = @_;
316              
317 7         607 my $results = decode_json $response;
318              
319 7         32 my $hits = $results->{hits}{hits};
320 7 50       32 die "$funcname: too many results (>$metacpan_size)"
321             if @$hits >= $metacpan_size;
322             warn "$funcname: ".Dumper($results)
323 7 50       27 if grep { not $_->{fields}{release} } @$hits; # XXX temp, seen once but not since
  51         106  
324              
325             # filter out perl-like releases
326             @$hits =
327 51         166 grep { $_->{fields}{path} !~ m!^(?:t|xt|tests?|inc|samples?|ex|examples?|bak|local-lib)\b! }
328 7         22 grep { $_->{fields}{release} !~ /^(perl|ponie|parrot|kurila|SiePerl-)/ }
  51         110  
329             @$hits;
330              
331 7         25 for my $hit (@$hits) {
332 50         87 $hit->{release_id} = delete $hit->{_parent};
333             # add version_obj for convenience (will fail and be undef for releases like "0.08124-TRIAL")
334 50         64 $hit->{fields}{version_obj} = eval { version->parse($hit->{fields}{version}) };
  50         292  
335             }
336              
337             # we'll return { "Dist-Name-Version" => { details }, ... }
338 7         22 my %dists = map { $_->{fields}{release} => $_->{fields} } @$hits;
  50         146  
339              
340 7 50       34 warn "$funcname: @{[ sort keys %dists ]}\n"
  0         0  
341             if $VERBOSE;
342              
343 7         171 return \%dists;
344             }
345              
346             =head2 get_module_versions_in_release($author, $release)
347              
348             Receive release info, such as:
349              
350             get_module_versions_in_release('SEMUELF', 'Dist-Surveyor-0.009')
351              
352             And returns a hashref, that contains one entry for each module that exists
353             in the release. module information is the format:
354              
355             'Dist::Surveyor' => {
356             'version' => '0.009',
357             'name' => 'Dist::Surveyor',
358             'path' => 'lib/Dist/Surveyor.pm',
359             'size' => 43879
360             },
361              
362             this function can be called for all sorts of releases that are only vague
363             possibilities and aren't actually installed, so generally it's quiet
364              
365             =cut
366              
367             sub get_module_versions_in_release {
368             my ($author, $release) = @_;
369              
370             $metacpan_calls++;
371             my $results = eval {
372             my $query = {
373             "size" => $metacpan_size,
374             "query" => { "bool" => {
375             "filter" => [
376             {"term" => {"release" => $release }},
377             {"term" => {"author" => $author }},
378             {"term" => {"mime" => "text/x-script.perl-module"}},
379             ],
380             }},
381             "fields" => ["path","name","stat.size"],
382             "inner_hits" => {"module" => {"path" => {"module" => {}}}},
383             };
384             my $response = _https_request(POST => 'https://fastapi.metacpan.org/v1/file',
385             { 'Content-Type' => 'application/json;charset=UTF-8' },
386             JSON->new->utf8->canonical->encode($query),
387             );
388             decode_json $response;
389             };
390             if (not $results) {
391             warn "Failed get_module_versions_in_release for $author/$release: $@";
392             return {};
393             }
394             my $hits = $results->{hits}{hits};
395             die "get_module_versions_in_release($author, $release): too many results"
396             if @$hits >= $metacpan_size;
397              
398             my %modules_in_release;
399             for my $hit (@$hits) {
400             my $path = $hit->{fields}{path};
401              
402             # XXX try to ignore files that won't get installed
403             # XXX should use META noindex!
404             if ($path =~ m!^(?:t|xt|tests?|inc|samples?|ex|examples?|bak|local-lib)\b!) {
405             warn "$author/$release: ignored non-installed module $path\n"
406             if $DEBUG;
407             next;
408             }
409              
410             my $size = $hit->{fields}{"stat.size"};
411             # files can contain more than one package ('module')
412             my $rel_mods = $hit->{inner_hits}{module}{hits}{hits} || [];
413             for my $inner_hit (@$rel_mods) { # actually packages in the file
414             my $mod = $inner_hit->{_source};
415              
416             # Some files may contain multiple packages. We want to ignore
417             # all except the one that matches the name of the file.
418             # We use a fairly loose (but still very effective) test because we
419             # can't rely on $path including the full package name.
420             (my $filebasename = $hit->{fields}{name}) =~ s/\.pm$//;
421             if ($mod->{name} !~ m/\b$filebasename$/) {
422             warn "$author/$release: ignored $mod->{name} in $path\n"
423             if $DEBUG;
424             next;
425             }
426              
427             # warn if package previously seen in this release
428             # with a different version or file size
429             if (my $prev = $modules_in_release{$mod->{name}}) {
430             my $version_obj = eval { version->parse($mod->{version}) };
431             die "$author/$release: $mod $mod->{version}: $@" if $@;
432              
433             if ($VERBOSE) {
434             # XXX could add a show-only-once cache here
435             my $msg = "$mod->{name} $mod->{version} ($size) seen in $path after $prev->{path} $prev->{version} ($prev->{size})";
436             warn "$release: $msg\n"
437             if ($version_obj != version->parse($prev->{version}) or $size != $prev->{size});
438             }
439             }
440              
441             # keep result small as Storable thawing this is major runtime cost
442             # (specifically we avoid storing a version_obj here)
443             $modules_in_release{$mod->{name}} = {
444             name => $mod->{name},
445             path => $path,
446             version => $mod->{version},
447             size => $size,
448             };
449             }
450             }
451              
452             warn "\n$author/$release contains: @{[ map { qq($_->{name} $_->{version}) } values %modules_in_release ]}\n"
453             if $DEBUG;
454              
455             return \%modules_in_release;
456             }
457              
458             =head1 License, Copyright
459              
460             Please see L for details
461              
462             =cut
463              
464             1;