File Coverage

blib/lib/CPAN/Testers/Data/Generator.pm
Criterion Covered Total %
statement 49 51 96.0
branch n/a
condition n/a
subroutine 17 17 100.0
pod n/a
total 66 68 97.0


line stmt bran cond sub pod time code
1             package CPAN::Testers::Data::Generator;
2              
3 9     9   132204 use warnings;
  9         12  
  9         260  
4 9     9   32 use strict;
  9         9  
  9         239  
5              
6 9     9   50 use vars qw($VERSION);
  9         12  
  9         393  
7             $VERSION = '1.19';
8              
9             #----------------------------------------------------------------------------
10             # Library Modules
11              
12 9     9   4496 use Config::IniFiles;
  9         243716  
  9         252  
13 9     9   4657 use CPAN::Testers::Common::Article;
  9         112800  
  9         57  
14 9     9   4067 use CPAN::Testers::Common::DBUtils;
  9         113622  
  9         53  
15             #use Data::Dumper;
16 9     9   4971 use Data::FlexSerializer;
  9         10236863  
  9         69  
17 9     9   12871 use DateTime;
  9         654710  
  9         355  
18 9     9   70 use DateTime::Duration;
  9         18  
  9         132  
19 9     9   95 use File::Basename;
  9         8  
  9         649  
20 9     9   36 use File::Path;
  9         10  
  9         334  
21 9     9   4414 use File::Slurp;
  9         25627  
  9         499  
22 9     9   3936 use HTML::Entities;
  9         32463  
  9         596  
23 9     9   61 use IO::File;
  9         8  
  9         873  
24 9     9   4568 use JSON;
  9         27521  
  9         35  
25 9     9   974 use Time::Local;
  9         11  
  9         410  
26              
27 9     9   10130 use Metabase 0.004;
  0            
  0            
28             use Metabase::Fact;
29             use Metabase::Resource;
30             use CPAN::Testers::Fact::LegacyReport;
31             use CPAN::Testers::Fact::TestSummary;
32             use CPAN::Testers::Metabase::AWS;
33             use CPAN::Testers::Report;
34              
35             #----------------------------------------------------------------------------
36             # Variables
37              
38             my $DIFF = 30; # max difference allowed in seconds
39             my $MINS = 15; # split time in minutes
40              
41             my %testers;
42              
43             my $FROM = 'CPAN Tester Report Server <do_not_reply@cpantesters.org>';
44             my $HOW = '/usr/sbin/sendmail -bm';
45             my $HEAD = 'To: EMAIL
46             From: FROM
47             Date: DATE
48             Subject: CPAN Testers Generator Error Report
49              
50             ';
51              
52             my $BODY = '
53             The following reports failed to parse into the cpanstats database:
54              
55             INVALID
56              
57             Thanks,
58             CPAN Testers Server.
59             ';
60              
61             my @admins = (
62             'barbie@missbarbell.co.uk',
63             #'david@dagolden.com'
64             );
65              
66             my ($OSNAMES,%MAPPINGS);
67              
68             #----------------------------------------------------------------------------
69             # The Application Programming Interface
70              
71             sub new {
72             my $class = shift;
73             my %hash = @_;
74              
75             my $self = {
76             meta_count => 0,
77             stat_count => 0,
78             last => '',
79             };
80             bless $self, $class;
81              
82             # load configuration
83             my $cfg = Config::IniFiles->new( -file => $hash{config} );
84              
85             # configure databases
86             for my $db (qw(CPANSTATS METABASE)) {
87             die "No configuration for $db database\n" unless($cfg->SectionExists($db));
88             my %opts = map {$_ => ($cfg->val($db,$_)||undef);} qw(driver database dbfile dbhost dbport dbuser dbpass);
89             $opts{AutoCommit} = 0;
90             $self->{$db} = CPAN::Testers::Common::DBUtils->new(%opts);
91             die "Cannot configure $db database\n" unless($self->{$db});
92             $self->{$db}->{'mysql_enable_utf8'} = 1 if($opts{driver} =~ /mysql/i);
93             $self->{$db}->{'mysql_auto_reconnect'} = 1 if($opts{driver} =~ /mysql/i);
94             }
95              
96             if($cfg->SectionExists('ADMINISTRATION')) {
97             my @admins = $cfg->val('ADMINISTRATION','admins');
98             $self->{admins} = \@admins;
99             }
100              
101             # command line swtiches override configuration settings
102             for my $key (qw(logfile poll_limit stopfile offset aws_bucket aws_namespace)) {
103             $self->{$key} = $hash{$key} || $cfg->val('MAIN',$key);
104             }
105              
106             $self->{offset} ||= 1;
107             $self->{poll_limit} ||= 1000;
108              
109             my @rows = $self->{METABASE}->get_query('hash','SELECT * FROM testers_email');
110             for my $row (@rows) {
111             $testers{$row->{resource}} = $row->{email};
112             }
113              
114             # build OS names map
115             @rows = $self->{CPANSTATS}->get_query('array','SELECT osname,ostitle FROM osname');
116             for my $row (@rows) {
117             $self->{OSNAMES}{lc $row->[0]} ||= $row->[1];
118             }
119             $OSNAMES = join('|',keys %{$self->{OSNAMES}}) if(keys %{$self->{OSNAMES}});
120              
121             $self->load_uploads();
122             $self->load_authors();
123             $self->load_perl_versions();
124              
125             if($cfg->SectionExists('DISABLE')) {
126             my @values = $cfg->val('DISABLE','LIST');
127             $self->{DISABLE}{$_} = 1 for(@values);
128             }
129              
130             if($cfg->SectionExists('OSNAMES')) {
131             for my $param ($cfg->Parameters('OSNAMES')) {
132             $self->{OSNAMES}{lc $param} ||= lc $cfg->val('OSNAMES',$param);
133             }
134             }
135              
136             if($cfg->SectionExists('MAPPINGS')) {
137             for my $param ($cfg->Parameters('MAPPINGS')) {
138             $MAPPINGS{$param} = [ split(',', $cfg->val('MAPPINGS',$param), 2) ];
139             }
140             }
141              
142             eval {
143             $self->{metabase} = CPAN::Testers::Metabase::AWS->new(
144             bucket => $self->{aws_bucket},
145             namespace => $self->{aws_namespace},
146             );
147             $self->{librarian} = $self->{metabase}->public_librarian;
148             };
149              
150             # if we require remote access, we need the librarian
151             unless($hash{localonly}) {
152             return unless($self->{metabase} && $self->{librarian});
153             }
154              
155             # reports are now stored in a compressed format
156             $self->{serializer} = Data::FlexSerializer->new(
157             detect_compression => 1,
158             detect_json => 1,
159             output_format => 'json'
160             );
161             $self->{serializer2} = Data::FlexSerializer->new(
162             detect_compression => 1,
163             detect_sereal => 1,
164             output_format => 'sereal'
165             );
166              
167             return $self;
168             }
169              
170             sub DESTROY {
171             my $self = shift;
172              
173             $self->save_perl_versions();
174             }
175              
176             #----------------------------------------------------------------------------
177             # Public Methods
178              
179             sub generate {
180             my $self = shift;
181             my $nonstop = shift || 0;
182             my $maxdate = shift;
183             my ($to,@reports);
184              
185             $self->{reparse} = 0;
186              
187             $self->_log("START GENERATE nonstop=$nonstop\n");
188              
189             do {
190             my $start = localtime(time);
191             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
192              
193             if($maxdate) {
194             $to = $maxdate;
195             } else {
196             $to = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
197             }
198              
199             $self->_log("DATES maxdate=$maxdate, to=$to \n");
200              
201             my $data = $self->get_next_dates($to);
202            
203             $self->_consume_reports( $to, $data );
204              
205             $nonstop = 0 if($self->{processed} == 0);
206             $nonstop = 0 if($self->{stopfile} && -f $self->{stopfile});
207             $nonstop = 0 if($maxdate && $maxdate le $to);
208              
209             $self->load_uploads() if($nonstop);
210             $self->load_authors() if($nonstop);
211              
212             $self->_log("CHECK nonstop=$nonstop\n");
213             } while($nonstop);
214             $self->_log("STOP GENERATE nonstop=$nonstop\n");
215             }
216              
217             sub regenerate {
218             my ($self,$hash) = @_;
219              
220             $self->{reparse} = 0;
221              
222             my $maxdate = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
223              
224             $self->_log("START REGENERATE\n");
225              
226             my @data;
227             if($hash->{file}) {
228             my $fh = IO::File->new($hash->{file},'r') or die "Cannot open file [$hash->{file}]: $!\n";
229             while(<$fh>) {
230             s/\s+$//;
231             my ($fval,$tval) = split(/,/,$_,2);
232             my %data;
233             $data{gstart} = $fval if($fval =~ /^\w+-\w+-\w+-\w+-\w+$/);
234             $data{dstart} = $fval if($fval =~ /^\d+-\d+-\d+T\d+:\d+:\d+Z$/);
235             $data{gend} = $tval if($tval =~ /^\w+-\w+-\w+-\w+-\w+$/);
236             $data{dend} = $tval if($tval =~ /^\d+-\d+-\d+T\d+:\d+:\d+Z$/);
237             push @data, \%data;
238             }
239             $fh->close;
240             } else {
241             push @data, { gstart => $hash->{gstart}, gend => $hash->{gend},
242             dstart => $hash->{dstart}, dend => $hash->{dend} };
243             }
244              
245             $self->_consume_reports( $maxdate, \@data );
246              
247             $self->_log("STOP REGENERATE\n");
248             }
249              
250             sub rebuild {
251             my ($self,$hash) = @_;
252             $self->_log("START REBUILD\n");
253              
254             my $start = localtime(time);
255             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
256              
257             $self->{reparse} = 1;
258             $self->{localonly} = $hash->{localonly} ? 1 : 0;
259             $self->{check} = $hash->{check} ? 1 : 0;
260              
261              
262             # selection choices:
263             # 1) from guid [to guid]
264             # 2) from date [to date]
265              
266             $hash->{dstart} = $self->_get_createdate( $hash->{gstart}, $hash->{dstart} );
267             $hash->{dend} = $self->_get_createdate( $hash->{gend}, $hash->{dend} );
268              
269             my @where;
270             push @where, "updated >= '$hash->{dstart}'" if($hash->{dstart});
271             push @where, "updated <= '$hash->{dend}'" if($hash->{dend});
272            
273             my $sql = 'SELECT * FROM metabase' .
274             (@where ? ' WHERE ' . join(' AND ',@where) : '') .
275             ' ORDER BY updated ASC';
276              
277             $self->_log("START sql=[$sql]\n");
278              
279             # $self->{CPANSTATS}->do_query("DELETE FROM cpanstats WHERE id >= $start AND id <= $end");
280              
281             my $iterator = $self->{METABASE}->iterator('hash',$sql);
282             while(my $row = $iterator->()) {
283             $self->_log("GUID [$row->{guid}]");
284             $self->{processed}++;
285              
286             my $report = $self->load_fact(undef,0,$row);
287              
288             unless($report) {
289             $self->_log(" ... no report\n");
290             warn "No report returned [$row->{id},$row->{guid}]\n";
291             next;
292             }
293              
294             $self->{report}{id} = $row->{id};
295             $self->{report}{guid} = $row->{guid};
296             $self->{report}{metabase} = $self->{facts};
297              
298             # corrupt cached report?
299             if($self->reparse_report()) { # true if invalid report
300             $self->_log(".. cannot parse metabase cache report\n");
301             warn "Cannot parse cached report [$row->{id},$row->{guid}]\n";
302             next;
303             }
304              
305             if($self->store_report()) { $self->_log(".. cpanstats stored\n") }
306             else { $self->_log(".. cpanstats not stored\n") }
307             if($self->cache_update()) { $self->_log(".. metabase stored\n") }
308             else { $self->_log(".. bad metabase cache data\n") }
309              
310             $self->{stored}++;
311             $self->{cached}++;
312             }
313              
314             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
315             my $stop = localtime(time);
316             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
317              
318             $self->commit();
319             $self->_log("STOP REBUILD\n");
320             }
321              
322             sub parse {
323             my ($self,$hash) = @_;
324             $self->_log("START PARSE\n");
325              
326             my @guids = $self->_get_guid_list($hash->{guid},$hash->{file});
327             return unless(@guids);
328              
329             $self->{force} ||= 0;
330              
331             for my $guid (@guids) {
332             $self->_log("GUID [$guid]");
333              
334             my ($report,$stored);
335             unless($hash->{force}) {
336             $report = $self->load_fact($guid,1);
337             $stored = $self->retrieve_report($guid);
338             }
339              
340             if($report && $stored) {
341             $self->_log(".. report already stored and cached\n");
342             next;
343             }
344              
345             $report = $self->get_fact($guid);
346              
347             unless($report) {
348             $self->_log(".. report not found [$guid]\n");
349             next;
350             }
351            
352             $self->{report}{guid} = $guid;
353             $hash->{report} = $report;
354             if($self->parse_report(%$hash)) { # true if invalid report
355             $self->_log(".. cannot parse report [$guid]\n");
356             next;
357             }
358              
359             if($self->store_report()) { $self->_log(".. stored"); }
360             else { $self->_log(".. already stored"); }
361              
362             if($self->cache_report()) { $self->_log(".. cached\n"); }
363             else { $self->_log(".. FAIL: bad cache data\n"); }
364             }
365              
366             $self->commit();
367             $self->_log("STOP PARSE\n");
368             return 1;
369             }
370              
371             sub reparse {
372             my ($self,$hash) = @_;
373             $self->_log("START REPARSE\n");
374              
375             my @guids = $self->_get_guid_list($hash->{guid},$hash->{file});
376             return unless(@guids);
377              
378             $self->{reparse} = $self->{force} ? 0 : 1;
379             $self->{localonly} = $hash->{localonly} ? 1 : 0;
380             $self->{check} = $hash->{check} ? 1 : 0;
381              
382             for my $guid (@guids) {
383             $self->_log("GUID [$guid]");
384              
385             my $report;
386             $report = $self->load_fact($guid) unless($hash->{force});
387              
388             if($report) {
389             $self->{report}{metabase} = $report;
390             $self->{report}{guid} = $guid;
391             $hash->{report} = $report;
392             if($self->reparse_report(%$hash)) { # true if invalid report
393             $self->_log(".. cannot parse report [$guid]\n");
394             return 0;
395             }
396             } else {
397             $report = $self->get_fact($guid) unless($report || $hash->{localonly});
398              
399             unless($report) {
400             if($self->{localonly}) {
401             $self->_log(".. report not available locally [$guid]\n");
402             return 0;
403             }
404             $self->_log(".. report not found [$guid]\n");
405             return 0;
406             }
407            
408             $self->{report}{guid} = $guid;
409             $hash->{report} = $report;
410             if($self->parse_report(%$hash)) { # true if invalid report
411             $self->_log(".. cannot parse report [$guid]\n");
412             return 0;
413             }
414             }
415              
416             if($self->store_report()) { $self->_log(".. stored"); }
417             else {
418             if($self->{time} gt $self->{report}{updated}) {
419             $self->_log(".. FAIL: older than requested [$self->{time}]\n");
420             return 0;
421             }
422            
423             $self->_log(".. already stored");
424             }
425             if($self->cache_report()) { $self->_log(".. cached\n"); }
426             else { $self->_log(".. FAIL: bad cache data\n"); }
427             }
428              
429             $self->commit();
430             $self->_log("STOP REPARSE\n");
431             return 1;
432             }
433              
434             sub tail {
435             my ($self,$hash) = @_;
436             return unless($hash->{file});
437              
438             $self->_log("START TAIL\n");
439              
440             my $guids = $self->get_tail_guids();
441             my $fh = IO::File->new($hash->{file},'a+') or die "Cannot read file [$hash->{file}]: $!";
442             print $fh "$_\n" for(@$guids);
443             $fh->close;
444              
445             $self->_log("STOP TAIL\n");
446             }
447              
448             #----------------------------------------------------------------------------
449             # Internal Methods
450              
451             sub commit {
452             my $self = shift;
453             for(qw(CPANSTATS)) {
454             next unless($self->{$_});
455             $self->{$_}->do_commit;
456             }
457             }
458              
459             sub get_tail_guids {
460             my $self = shift;
461             my $guids;
462              
463             eval {
464             $guids = $self->{librarian}->search(
465             'core.type' => 'CPAN-Testers-Report',
466             'core.update_time' => { ">", 0 },
467             '-desc' => 'core.update_time',
468             '-limit' => $self->{poll_limit},
469             );
470             };
471              
472             $self->_log(" ... Metabase Tail Failed [$@]\n") if($@);
473             $self->_log("Retrieved ".($guids ? scalar(@$guids) : 0)." guids\n");
474              
475             return $guids;
476             }
477              
478             sub get_next_dates {
479             my ($self,$to) = @_;
480             my (@data,$from);
481              
482             my $time = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
483              
484             $self->_log("DATES to=$to, time=$time\n");
485              
486             # note that because Amazon's SimpleDB can return odd entries out of sync, we have to look at previous entries
487             # to ensure we are starting from the right point. Also ignore date/times in the future.
488             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE updated <= ? ORDER BY updated DESC LIMIT 10',$time);
489             for my $row (@rows) {
490             if($from) {
491             my $diff = abs( _date_diff($from,$row->[0]) ); # just interested in the difference
492             $self->_log("get_next_dates from=[$from], updated=[$row->[0]], diff=$diff, DIFF=$DIFF\n");
493             next if($diff < $DIFF);
494             }
495              
496             $from = $row->[0];
497             }
498              
499             $from ||= '1999-01-01T00:00:00Z';
500             if($from gt $to) {
501             my $xx = $from;
502             $from = $to;
503             $to = $xx;
504             }
505              
506             $self->_log("NEXT from=[$from], to=[$to]\n");
507              
508             while($from lt $to) {
509             my @from = $from =~ /(\d+)\-(\d+)\-(\d+)T(\d+):(\d+):(\d+)/;
510             my $dt = DateTime->new(
511             year => $from[0], month => $from[1], day => $from[2],
512             hour => $from[3], minute => $from[4], second => $from[5],
513             );
514             $dt->add( DateTime::Duration->new( minutes => $MINS ) );
515             my $split = sprintf "%sT%sZ", $dt->ymd, $dt->hms;
516             if($split lt $to) {
517             push @data, { dstart => $from, dend => $split };
518             } else {
519             push @data, { dstart => $from, dend => $to };
520             }
521              
522             $from = $split;
523             }
524              
525             return \@data;
526             }
527              
528             sub get_next_guids {
529             my ($self,$start,$end) = @_;
530             my ($guids);
531              
532             $self->{time} ||= 0;
533             $self->{last} ||= 0;
534             $start ||= 0;
535              
536             $self->_log("PRE time=[$self->{time}], last=[$self->{last}], start=[".($start||'')."], end=[".($end||'')."]\n");
537              
538             if($start) {
539             $self->{time} = $start;
540             $self->{time_to} = $end || '';
541             } else {
542             my $time = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
543              
544             # note that because Amazon's SimpleDB can return odd entries out of sync, we have to look at previous entries
545             # to ensure we are starting from the right point. Also ignore date/times in the future.
546             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE updated <= ? ORDER BY updated DESC LIMIT 10',$time);
547             for my $row (@rows) {
548             if($self->{time}) {
549             my $diff = abs( _date_diff($self->{time},$row->[0]) ); # just interested in the difference
550             next if($diff < $DIFF);
551             }
552              
553             $self->{time} = $row->[0];
554             }
555              
556             $self->{time} ||= '1999-01-01T00:00:00Z';
557             if($self->{last} ge $self->{time}) {
558             my @ts = $self->{last} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
559             $ts[1]--;
560             my $ts = timelocal(reverse @ts);
561             @ts = localtime($ts + $self->{offset}); # increment the offset for next time
562             $self->{time} = sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ", $ts[5]+1900,$ts[4]+1,$ts[3], $ts[2],$ts[1],$ts[0];
563             }
564             }
565              
566             $self->_log("START time=[$self->{time}], last=[$self->{last}]\n");
567             $self->{last} = $self->{time};
568              
569             eval {
570             # if($self->{time_to}) {
571             # $guids = $self->{librarian}->search(
572             # 'core.type' => 'CPAN-Testers-Report',
573             # 'core.update_time' => { -and => { ">=" => $self->{time}, "<=" => $self->{time_to} } },
574             # '-asc' => 'core.update_time',
575             # '-limit' => $self->{poll_limit},
576             # );
577             # } else {
578             $guids = $self->{librarian}->search(
579             '-where' => [
580             '-and' =>
581             [ '-eq' => 'core.type' => 'CPAN-Testers-Report' ],
582             [ '-ge' => 'core.update_time' => $self->{time} ]
583             ],
584             '-order' => [ '-asc' => 'core.update_time' ],
585             '-limit' => $self->{poll_limit},
586             );
587             # }
588             };
589              
590             $self->_log(" ... Metabase Search Failed [$@]\n") if($@);
591             $self->_log("Retrieved ".($guids ? scalar(@$guids) : 0)." guids\n");
592             return $guids;
593             }
594              
595             sub retrieve_reports {
596             my ($self,$guids,$start) = @_;
597              
598             if($guids) {
599             for my $guid (@$guids) {
600             $self->_log("GUID [$guid]");
601             $self->{processed}++;
602             $self->{msg} = '';
603              
604             if(my $report = $self->get_fact($guid)) {
605             $self->{report}{guid} = $guid;
606             next if($self->parse_report(report => $report)); # true if invalid report
607              
608             if($self->store_report()) {
609             $self->{msg} .= ".. stored";
610             $self->{stored}++;
611              
612             } else {
613             if($self->{time} gt $self->{report}{updated}) {
614             $self->_log(".. FAIL: older than requested [$self->{time}]\n");
615             next;
616             }
617             $self->{msg} .= ".. already stored";
618             }
619             if($self->cache_report()) { $self->_log(".. cached\n"); $self->{cached}++; }
620             else { $self->_log(".. bad cache data\n"); }
621             } else {
622             $self->_log(".. FAIL\n");
623             }
624             }
625             }
626              
627             $self->commit();
628             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
629             my $stop = localtime(time);
630             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
631              
632             # only email invalid reports during the generate process
633             $self->_send_email() if($self->{invalid});
634             }
635              
636             sub already_saved {
637             my ($self,$guid) = @_;
638             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE guid=?',$guid);
639             return $rows[0]->[0] if(@rows);
640             return 0;
641             }
642              
643             sub load_fact {
644             my ($self,$guid,$check,$row) = @_;
645              
646             if(!$row && $guid) {
647             my @rows = $self->{METABASE}->get_query('hash','SELECT report,fact FROM metabase WHERE guid=?',$guid);
648             $row = $rows[0] if(@rows);
649             }
650              
651             if($row) {
652             if($row->{fact}) {
653             $self->{fact} = $self->{serializer2}->deserialize($row->{fact});
654             $self->{facts} = $self->dereference_report($self->{fact});
655             return $self->{facts};
656             }
657            
658             if($row->{report}) {
659             $self->{facts} = $self->{serializer}->deserialize($row->{report});
660             return $self->{facts};
661             }
662             }
663              
664             $self->_log(" ... no report [guid=$guid]\n") unless($check);
665             return;
666             }
667              
668             sub get_fact {
669             my ($self,$guid) = @_;
670             my $fact;
671             #print STDERR "guid=$guid\n";
672             eval { $fact = $self->{librarian}->extract( $guid ) };
673              
674             if($fact) {
675             $self->{fact} = $fact;
676             return $fact;
677             }
678              
679             $self->_log(" ... no report [guid=$guid] [$@]\n");
680             return;
681             }
682              
683             sub dereference_report {
684             my ($self,$report) = @_;
685             my %facts;
686              
687             my @facts = $report->facts();
688             for my $fact (@facts) {
689             my $name = ref $fact;
690             $facts{$name} = $fact->as_struct;
691             $facts{$name}{content} = decode_json($facts{$name}{content});
692             }
693              
694             return \%facts;
695             }
696              
697             sub parse_report {
698             my ($self,%hash) = @_;
699             my $options = $hash{options};
700             my $report = $hash{report};
701             my $guid = $self->{report}{guid};
702             my $invalid;
703              
704             $self->{report}{created} = $report->{metadata}{core}{creation_time};
705             $self->{report}{updated} = $report->{metadata}{core}{update_time};
706              
707             my @facts = $report->facts();
708             for my $fact (@facts) {
709             if(ref $fact eq 'CPAN::Testers::Fact::TestSummary') {
710             $self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'} = $fact->as_struct;
711             $self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'}{content} = decode_json($self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'}{content});
712              
713             $self->{report}{state} = lc $fact->{content}{grade};
714             $self->{report}{platform} = $fact->{content}{archname};
715             $self->{report}{osname} = $self->_osname($fact->{content}{osname});
716             $self->{report}{osvers} = $fact->{content}{osversion};
717             $self->{report}{perl} = $fact->{content}{perl_version};
718             #$self->{report}{created} = $fact->{metadata}{core}{creation_time};
719             #$self->{report}{updated} = $fact->{metadata}{core}{update_time};
720              
721             my $dist = Metabase::Resource->new( $fact->resource );
722             $self->{report}{dist} = $dist->metadata->{dist_name};
723             $self->{report}{version} = $dist->metadata->{dist_version};
724             $self->{report}{resource} = $dist->metadata->{resource};
725              
726             # some distros are a pain!
727             if($self->{report}{version} eq '' && $MAPPINGS{$self->{report}{dist}}) {
728             $self->{report}{version} = $MAPPINGS{$self->{report}{dist}}->[1];
729             $self->{report}{dist} = $MAPPINGS{$self->{report}{dist}}->[0];
730             } elsif($self->{report}{version} eq '') {
731             $self->{report}{version} = 0;
732             }
733              
734             $self->{report}{from} = $self->_get_tester( $fact->creator->resource );
735              
736             # alternative API
737             #my $profile = $fact->creator->user;
738             #$self->{report}{from} = $profile->{email};
739             #$self->{report}{from} =~ s/'/''/g; #'
740             #$self->{report}{dist} = $fact->resource->dist_name;
741             #$self->{report}{version} = $fact->resource->dist_version;
742              
743             } elsif(ref $fact eq 'CPAN::Testers::Fact::LegacyReport') {
744             $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'} = $fact->as_struct;
745             $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content} = decode_json($self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content});
746             $invalid = 'missing textreport' if(length $fact->{content}{textreport} < 10); # what is the smallest report?
747              
748             $self->{report}{perl} = $fact->{content}{perl_version};
749             }
750             }
751              
752             if($invalid) {
753             push @{$self->{invalid}}, {msg => $invalid, guid => $guid};
754             return 1;
755             }
756              
757             # fixes from metabase formatting
758             $self->{report}{perl} =~ s/^v//; # no leading 'v'
759             $self->_check_arch_os();
760              
761             if($self->{report}{created}) {
762             my @created = $self->{report}{created} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/; # 2010-02-23T20:33:52Z
763             $self->{report}{postdate} = sprintf "%04d%02d", $created[0], $created[1];
764             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[0], $created[1], $created[2], $created[3], $created[4];
765             } else {
766             my @created = localtime(time);
767             $self->{report}{postdate} = sprintf "%04d%02d", $created[5]+1900, $created[4]+1;
768             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[5]+1900, $created[4]+1, $created[3], $created[2], $created[1];
769             }
770              
771             $self->{msg} .= ".. time [$self->{report}{created}][$self->{report}{updated}]";
772              
773             $self->{report}{type} = 2;
774             if($self->{DISABLE} && $self->{DISABLE}{$self->{report}{from}}) {
775             $self->{report}{state} .= ':invalid';
776             $self->{report}{type} = 3;
777             } elsif($self->{report}{response} && $self->{report}{response} =~ m!/perl6/!) {
778             # $self->{report}{type} = 6;
779             return 1;
780             }
781              
782             #print STDERR "\n====\nreport=".Dumper($self->{report});
783              
784             return 1 unless($self->_valid_field($guid, 'dist' => $self->{report}{dist}) || ($options && $options->{exclude}{dist}));
785             return 1 unless($self->_valid_field($guid, 'version' => $self->{report}{version}) || ($options && $options->{exclude}{version}));
786             return 1 unless($self->_valid_field($guid, 'from' => $self->{report}{from}) || ($options && $options->{exclude}{from}));
787             return 1 unless($self->_valid_field($guid, 'perl' => $self->{report}{perl}) || ($options && $options->{exclude}{perl}));
788             return 1 unless($self->_valid_field($guid, 'platform' => $self->{report}{platform}) || ($options && $options->{exclude}{platform}));
789             return 1 unless($self->_valid_field($guid, 'osname' => $self->{report}{osname}) || ($options && $options->{exclude}{osname}));
790             return 1 unless($self->_valid_field($guid, 'osvers' => $self->{report}{osvers}) || ($options && $options->{exclude}{osname}));
791              
792             return 0
793             }
794              
795             sub reparse_report {
796             my ($self,%hash) = @_;
797             my $fact = 'CPAN::Testers::Fact::TestSummary';
798             my $options = $hash{options};
799              
800             $self->{report}{metabase}{$fact}{content} = encode_json($self->{report}{metabase}{$fact}{content});
801             my $report = CPAN::Testers::Fact::TestSummary->from_struct( $self->{report}{metabase}{$fact} );
802             my $guid = $self->{report}{guid};
803              
804             $self->{report}{state} = lc $report->{content}{grade};
805             $self->{report}{platform} = $report->{content}{archname};
806             $self->{report}{osname} = $self->_osname($report->{content}{osname});
807             $self->{report}{osvers} = $report->{content}{osversion};
808             $self->{report}{perl} = $report->{content}{perl_version};
809             $self->{report}{created} = $report->{metadata}{core}{creation_time};
810              
811             my $dist = Metabase::Resource->new( $report->{metadata}{core}{resource} );
812             $self->{report}{dist} = $dist->metadata->{dist_name};
813             $self->{report}{version} = $dist->metadata->{dist_version};
814             $self->{report}{resource} = $dist->metadata->{resource};
815              
816             $self->{report}{from} = $self->_get_tester( $report->{metadata}{core}{creator}{resource} );
817              
818             if($self->{report}{created}) {
819             my @created = $self->{report}{created} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/; # 2010-02-23T20:33:52Z
820             $self->{report}{postdate} = sprintf "%04d%02d", $created[0], $created[1];
821             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[0], $created[1], $created[2], $created[3], $created[4];
822             } else {
823             my @created = localtime(time);
824             $self->{report}{postdate} = sprintf "%04d%02d", $created[5]+1900, $created[4]+1;
825             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[5]+1900, $created[4]+1, $created[3], $created[2], $created[1];
826             }
827              
828             $self->{report}{type} = 2;
829             if($self->{DISABLE} && $self->{DISABLE}{$self->{report}{from}}) {
830             $self->{report}{state} .= ':invalid';
831             $self->{report}{type} = 3;
832             } elsif($self->{report}{response} && $self->{report}{response} =~ m!/perl6/!) {
833             # $self->{report}{type} = 6;
834             return 1;
835             }
836              
837             return 1 unless($self->_valid_field($guid, 'dist' => $self->{report}{dist}) || ($options && $options->{exclude}{dist}));
838             return 1 unless($self->_valid_field($guid, 'version' => $self->{report}{version}) || ($options && $options->{exclude}{version}));
839             return 1 unless($self->_valid_field($guid, 'from' => $self->{report}{from}) || ($options && $options->{exclude}{from}));
840             return 1 unless($self->_valid_field($guid, 'perl' => $self->{report}{perl}) || ($options && $options->{exclude}{perl}));
841             return 1 unless($self->_valid_field($guid, 'platform' => $self->{report}{platform}) || ($options && $options->{exclude}{platform}));
842             return 1 unless($self->_valid_field($guid, 'osname' => $self->{report}{osname}) || ($options && $options->{exclude}{osname}));
843             return 1 unless($self->_valid_field($guid, 'osvers' => $self->{report}{osvers}) || ($options && $options->{exclude}{osname}));
844              
845             return 0;
846             }
847              
848             sub retrieve_report {
849             my $self = shift;
850             my $guid = shift or return;
851              
852             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT * FROM cpanstats WHERE guid=?',$guid);
853             return $rows[0] if(@rows);
854             return;
855             }
856              
857             sub store_report {
858             my $self = shift;
859             my @fields = qw(guid state postdate from dist version platform perl osname osvers fulldate type);
860              
861             my %fields = map {$_ => $self->{report}{$_}} @fields;
862             $fields{$_} ||= 0 for(qw(type));
863             $fields{$_} ||= '0' for(qw(perl));
864             $fields{$_} ||= '' for(@fields);
865              
866             my @values = map {$fields{$_}} @fields;
867              
868             my %SQL = (
869             'SELECT' => {
870             CPANSTATS => 'SELECT id FROM cpanstats WHERE guid=?',
871             RELEASE => 'SELECT id FROM release_data WHERE guid=?',
872             },
873             'INSERT' => {
874             CPANSTATS => 'INSERT INTO cpanstats (guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',
875             RELEASE => 'INSERT INTO release_data (id,guid,dist,version,oncpan,distmat,perlmat,patched,pass,fail,na,unknown) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',
876             PASSES => 'INSERT IGNORE passreports SET platform=?, osname=?, perl=?, dist=?, postdate=?',
877             },
878             'UPDATE' => {
879             CPANSTATS => 'UPDATE cpanstats SET state=?,postdate=?,tester=?,dist=?,version=?,platform=?,perl=?,osname=?,osvers=?,fulldate=?,type=? WHERE guid=?',
880             RELEASE => 'UPDATE release_data SET id=?,dist=?,version=?,oncpan=?,distmat=?,perlmat=?,patched=?,pass=?,fail=?,na=?,unknown=? WHERE guid=?',
881             },
882             );
883              
884             # update the mysql database
885             my @rows = $self->{CPANSTATS}->get_query('array',$SQL{SELECT}{CPANSTATS},$values[0]);
886             if(@rows) {
887             if($self->{reparse}) {
888             my ($guid,@update) = @values;
889             if($self->{check}) {
890             $self->_log( "CHECK: $SQL{UPDATE}{CPANSTATS},[" . join(',',@update,$guid) . "]\n" );
891             } else {
892             $self->{CPANSTATS}->do_query($SQL{UPDATE}{CPANSTATS},@update,$guid);
893             }
894             } else {
895             $self->{report}{id} = $rows[0]->[0];
896             return 0;
897             }
898             } else {
899             if($self->{check}) {
900             $self->_log( "CHECK: $SQL{INSERT}{CPANSTATS},[" . join(',',@values) . "]\n" );
901             } else {
902             $self->{report}{id} = $self->{CPANSTATS}->id_query($SQL{INSERT}{CPANSTATS},@values);
903             }
904             }
905              
906             # in check mode, assume the rest happens
907             return 1 if($self->{check});
908              
909             # perl version components
910             my ($perl,$patch,$devel) = $self->_get_perl_version($fields{perl});
911              
912             # only valid perl5 reports
913             if($self->{report}{type} == 2) {
914             $fields{id} = $self->{report}{id};
915              
916             # push page requests
917             # - note we only update the author if this is the *latest* version of the distribution
918             my $author = $self->{report}{pauseid} || $self->_get_author($fields{dist},$fields{version});
919             $self->{CPANSTATS}->do_query("INSERT INTO page_requests (type,name,weight,id) VALUES ('author',?,1,?)",$author,$fields{id}) if($author);
920             $self->{CPANSTATS}->do_query("INSERT INTO page_requests (type,name,weight,id) VALUES ('distro',?,1,?)",$fields{dist},$fields{id});
921              
922             my @rows = $self->{CPANSTATS}->get_query('array',$SQL{SELECT}{RELEASE},$fields{guid});
923             #print STDERR "# select release $SQL{SELECT}{RELEASE},$fields{guid}\n";
924             if(@rows) {
925             if($self->{reparse}) {
926             $self->{CPANSTATS}->do_query($SQL{UPDATE}{RELEASE},
927             $fields{id}, # id,
928             $fields{dist},$fields{version}, # dist, version
929              
930             $self->_oncpan($fields{dist},$fields{version}) ? 1 : 2,
931              
932             $fields{version} =~ /_/ ? 2 : 1,
933             $devel ? 2 : 1,
934             $patch ? 2 : 1,
935              
936             $fields{state} eq 'pass' ? 1 : 0,
937             $fields{state} eq 'fail' ? 1 : 0,
938             $fields{state} eq 'na' ? 1 : 0,
939             $fields{state} eq 'unknown' ? 1 : 0,
940              
941             $fields{guid}); # guid
942             }
943             } else {
944             #print STDERR "# insert release $SQL{INSERT}{RELEASE},$fields[0],$fields[1]\n";
945             $self->{CPANSTATS}->do_query($SQL{INSERT}{RELEASE},
946             $fields{id},$fields{guid}, # id, guid
947             $fields{dist},$fields{version}, # dist, version
948              
949             $self->_oncpan($fields{dist},$fields{version}) ? 1 : 2,
950              
951             $fields{version} =~ /_/ ? 2 : 1,
952             $devel ? 2 : 1,
953             $patch ? 2 : 1,
954              
955             $fields{state} eq 'pass' ? 1 : 0,
956             $fields{state} eq 'fail' ? 1 : 0,
957             $fields{state} eq 'na' ? 1 : 0,
958             $fields{state} eq 'unknown' ? 1 : 0);
959             }
960             }
961              
962             if($fields{state} eq 'pass') {
963             $fields{perl} =~ s/\s.*//; # only need to know the main release
964             $self->{CPANSTATS}->do_query($SQL{INSERT}{PASSES},
965             $fields{platform},
966             $fields{osname},
967             $fields{perl},
968             $fields{dist},
969             $fields{postdate});
970             }
971              
972             if((++$self->{stat_count} % 500) == 0) {
973             $self->commit;
974             }
975              
976             return 1;
977             }
978              
979             sub cache_report {
980             my $self = shift;
981             return 0 unless($self->{report}{guid} && $self->{report}{metabase});
982              
983             # in check mode, assume the rest happens
984             return 1 if($self->{check});
985             return 1 if($self->{localonly});
986              
987             my ($json,$data,$fact);
988              
989             eval { $json = encode_json($self->{report}{metabase}); };
990             eval { $data = $self->{serializer}->serialize("$json"); };
991             eval { $data = $self->{serializer}->serialize( $self->{report}{metabase} ); } if($@);
992             eval { $fact = $self->{serializer2}->serialize($self->{fact}); };
993              
994             $data ||= '';
995             $fact ||= '';
996              
997             $self->{METABASE}->do_query('INSERT IGNORE INTO metabase (guid,id,updated,report,fact) VALUES (?,?,?,?,?)',
998             $self->{report}{guid},$self->{report}{id},$self->{report}{updated},$data,$fact);
999              
1000             if((++$self->{meta_count} % 500) == 0) {
1001             $self->{METABASE}->do_commit;
1002             }
1003              
1004             return 1;
1005             }
1006              
1007             sub cache_update {
1008             my $self = shift;
1009             return 0 unless($self->{report}{guid} && $self->{report}{id});
1010              
1011             # in check mode, assume the rest happens
1012             return 1 if($self->{check});
1013             return 1 if($self->{localonly});
1014              
1015             $self->{METABASE}->do_query('UPDATE metabase SET id=? WHERE guid=?',$self->{report}{id},$self->{report}{guid});
1016              
1017             if((++$self->{meta_count} % 500) == 0) {
1018             $self->{METABASE}->do_commit;
1019             }
1020              
1021             return 1;
1022             }
1023              
1024             #----------------------------------------------------------------------------
1025             # Internal Cache Methods
1026              
1027             sub load_uploads {
1028             my $self = shift;
1029              
1030             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT dist,version,type FROM uploads');
1031             for my $row (@rows) {
1032             $self->{oncpan}{$row->{dist}}{$row->{version}} = $row->{type};
1033             }
1034             }
1035              
1036             sub load_authors {
1037             my $self = shift;
1038              
1039             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT author,dist,version FROM ixlatest');
1040             for my $row (@rows) {
1041             $self->{author}{$row->{dist}}{$row->{version}} = $row->{author};
1042             }
1043             }
1044              
1045             sub load_perl_versions {
1046             my $self = shift;
1047              
1048             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT * FROM perl_version');
1049             for my $row (@rows) {
1050             $self->{perls}{$row->{version}} = {
1051             perl => $row->{perl},
1052             patch => $row->{patch},
1053             devel => $row->{devel},
1054             saved => 1
1055             };
1056             }
1057             }
1058              
1059             sub save_perl_versions {
1060             my $self = shift;
1061              
1062             for my $vers (keys %{ $self->{perls} }) {
1063             next if($self->{perls}{$vers}{saved});
1064             $self->{CPANSTATS}->do_query("INSERT INTO perl_version (version,perl,patch,devel) VALUES (?,?,?,?)",
1065             $vers, $self->{perls}{$vers}{perl}, $self->{perls}{$vers}{patch}, $self->{perls}{$vers}{devel});
1066             }
1067             }
1068              
1069             #----------------------------------------------------------------------------
1070             # Private Methods
1071              
1072             sub _consume_reports {
1073             my ($self,$maxdate,$dataset) = @_;
1074              
1075             for my $data (@$dataset) {
1076             my $start = $self->_get_createdate( $data->{gstart}, $data->{dstart} );
1077             my $end = $self->_get_createdate( $data->{gend}, $data->{dend} );
1078              
1079             unless($start && $end) {
1080             $start ||= '';
1081             $end ||= '';
1082             $self->_log("BAD DATES: start=$start, end=$end [missing dates]\n");
1083             next;
1084             }
1085             if($start ge $end) {
1086             $self->_log("BAD DATES: start=$start, end=$end [end before start]\n");
1087             next;
1088             }
1089             # if($end gt $maxdate) {
1090             # $self->_log("BAD DATES: start=$start, end=$end [exceeds $maxdate]\n");
1091             # next;
1092             # }
1093              
1094             $self->_log("LOOP: start=$start, end=$end\n");
1095              
1096             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
1097              
1098             # what guids do we already have?
1099             my $sql = 'SELECT guid FROM metabase WHERE updated >= ? AND updated <= ? ORDER BY updated asc';
1100             my @guids = $self->{METABASE}->get_query('hash',$sql,$data->{dstart},$data->{dend});
1101             my %guids = map {$_->{guid} => 1} @guids;
1102              
1103             # note that because Amazon's SimpleDB can return odd entries out of
1104             # sync, we have to look at previous entries to ensure we are starting
1105             # from the right point
1106             my ($update,$prev,$last) = ($start,$start,$start);
1107             my @times = ();
1108              
1109             my $prior = [ 0, 0 ];
1110             my $saved = 0;
1111             while($update lt $end) {
1112             $self->_log("UPDATE: update=$update, end=$end, saved=$saved, guids=".(scalar(@guids))."\n");
1113              
1114             # get list of guids from last update date
1115             my $guids = $self->get_next_guids($update,$end);
1116             last unless($guids);
1117              
1118             @guids = grep { !$guids{$_} } @$guids;
1119             last unless(@guids);
1120             last if($prior->[0] eq $guids[0] && $prior->[1] eq $guids[-1]); # prevent an endless loop
1121             $prior = [ $guids[0], $guids[-1] ];
1122              
1123             $self->_log("UPDATE: todo guids=".(scalar(@guids))."\n");
1124              
1125             my $current = $update;
1126             for my $guid (@guids) {
1127             # don't process too far
1128             shift @times if(@times > 9); # one off
1129             push @times, [ $current, (_date_diff($end,$current) <= 0 ? 0 : 1) ]; # one on ... max 10
1130              
1131             my $times = 0;
1132             $times += $_->[1] for(@times);
1133             last if($times == 10); # stop if all greater than end
1134              
1135             # okay process
1136             $self->_log("GUID [$guid]");
1137              
1138             $self->{processed}++;
1139              
1140             if(my $time = $self->already_saved($guid)) {
1141             $self->_log(".. already saved [$time]\n");
1142             $current = $time;
1143             $saved++;
1144             next;
1145             }
1146              
1147             if(my $report = $self->get_fact($guid)) {
1148             $current = $report->{metadata}{core}{update_time};
1149             $self->{report}{guid} = $guid;
1150             next if($self->parse_report(report => $report)); # true if invalid report
1151              
1152             if($self->store_report()) { $self->_log(".. stored"); $self->{stored}++; }
1153             else { $self->_log(".. already stored"); }
1154             if($self->cache_report()) { $self->_log(".. cached\n"); $self->{cached}++; }
1155             else { $self->_log(".. bad cache data\n"); }
1156             } else {
1157             $self->_log(".. FAIL\n");
1158             }
1159             }
1160              
1161             $update = $times[0]->[0];
1162              
1163             $self->commit();
1164             }
1165              
1166             $self->commit();
1167             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
1168             my $stop = localtime(time);
1169             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
1170             }
1171              
1172             # only email invalid reports during the generate process
1173             $self->_send_email() if($self->{invalid});
1174             }
1175              
1176             sub _get_perl_version {
1177             my $self = shift;
1178             my $vers = shift;
1179              
1180             unless($self->{perls}{$vers}) {
1181             my $patch = $vers =~ /^5.(7|9|[1-9][13579])/ ? 1 : 0, # odd numbers now mark development releases
1182             my $devel = $vers =~ /(RC\d+|patch)/ ? 1 : 0,
1183             my ($perl) = $vers =~ /(5\.\d+(?:\.\d+)?)/;
1184              
1185             $self->{perls}{$vers} = {
1186             perl => $perl,
1187             patch => $patch,
1188             devel => $devel,
1189             saved => 0
1190             };
1191             }
1192              
1193             return $self->{perls}{$vers}{perl}, $self->{perls}{$vers}{patch}, $self->{perls}{$vers}{devel};
1194             }
1195              
1196             sub _get_guid_list {
1197             my ($self,$guid,$file) = @_;
1198             my (@ids,@guids);
1199              
1200             # we're only parsing one id
1201             if($guid) {
1202             if($guid =~ /^\d+$/) { push @ids, $guid }
1203             else { push @guids, $guid }
1204             } elsif($file) {
1205             my $fh = IO::File->new($file,'r') or die "Cannot read file [$file]: $!";
1206             while(<$fh>) {
1207             chomp;
1208             my ($num) = (m/^([\da-z-]+)/i);
1209             if($num =~ /^\d+$/) { push @ids, $num }
1210             else { push @guids, $num }
1211             }
1212             $fh->close;
1213             } else {
1214             return;
1215             }
1216              
1217             # turn ids into guids
1218             if(@ids) {
1219             my @rows = $self->{CPANSTATS}->get_query('array','SELECT guid FROM cpanstats WHERE id IN ('.join(',',@ids).')');
1220             push @guids, $_->[0] for(@rows);
1221             }
1222              
1223             my %guids = map {$_ => 1} @guids;
1224             my @list = keys %guids;
1225             return @list;
1226             }
1227              
1228             sub _get_createdate {
1229             my ($self,$guid,$date) = @_;
1230              
1231             return unless($guid || $date);
1232             if($guid) {
1233             my @rows = $self->{METABASE}->get_query('hash','SELECT updated FROM metabase WHERE guid=?',$guid);
1234             $date = $rows[0]->{updated} if(@rows);
1235             }
1236              
1237             return unless($date && $date =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
1238             return $date;
1239             }
1240              
1241             sub _get_tester {
1242             my ($self,$creator) = @_;
1243             return $testers{$creator} if($testers{$creator});
1244              
1245             my $profile = Metabase::Resource->new( $creator );
1246             return $creator unless($profile);
1247              
1248             my $user;
1249             eval { $user = $self->{librarian}->extract( $profile->guid ) };
1250             return $creator unless($user);
1251              
1252             my ($name,@emails);
1253             for my $fact ($user->facts()) {
1254             if(ref $fact eq 'Metabase::User::EmailAddress') {
1255             push @emails, $fact->{content};
1256             } elsif(ref $fact eq 'Metabase::User::FullName') {
1257             $name = encode_entities($fact->{content});
1258             }
1259             }
1260              
1261             $name ||= 'NONAME'; # shouldn't happen, but allows for checks later
1262              
1263             for my $em (@emails) {
1264             $self->{METABASE}->do_query('INSERT INTO testers_email (resource,fullname,email) VALUES (?,?,?)',$creator,$name,$em);
1265             }
1266              
1267             $testers{$creator} = @emails ? $emails[0] : $creator;
1268             $testers{$creator} =~ s/\'/''/g if($testers{$creator});
1269             return $testers{$creator};
1270             }
1271              
1272             sub _get_author {
1273             my ($self,$dist,$vers) = @_;
1274             my $author = $self->{author}{$dist}{$vers} || '';
1275             return $author;
1276             }
1277              
1278             sub _valid_field {
1279             my ($self,$id,$name,$value) = @_;
1280             return 1 if(defined $value);
1281             $self->_log(" . [$id] ... missing field: $name\n");
1282             return 0;
1283             }
1284              
1285             sub _get_lastid {
1286             my $self = shift;
1287              
1288             my @rows = $self->{METABASE}->get_query('array',"SELECT MAX(id) FROM metabase");
1289             return 0 unless(@rows);
1290             return $rows[0]->[0] || 0;
1291             }
1292              
1293             sub _oncpan {
1294             my ($self,$dist,$vers) = @_;
1295              
1296             my $type = $self->{oncpan}{$dist}{$vers};
1297              
1298             return 1 unless($type); # assume it's a new release
1299             return 0 if($type eq 'backpan'); # on backpan only
1300             return 1; # on cpan or new upload
1301             }
1302              
1303             sub _osname {
1304             my $self = shift;
1305             my $name = shift || return '';
1306              
1307             my $lname = lc $name;
1308             my $uname = uc $name;
1309             $self->{OSNAMES}{$lname} ||= do {
1310             $self->{CPANSTATS}->do_query(qq{INSERT INTO osname (osname,ostitle) VALUES ('$name','$uname')});
1311             $uname;
1312             };
1313              
1314             return $self->{OSNAMES}{$lname};
1315             }
1316              
1317             sub _check_arch_os {
1318             my $self = shift;
1319              
1320             my $text = $self->_platform_to_osname($self->{report}{platform});
1321             #print STDERR "_check: text=$text\n";
1322             #print STDERR "_check: platform=$self->{report}{platform}\n";
1323             #print STDERR "_check: osname=$self->{report}{osname}\n";
1324             return if($text && $self->{report}{osname} && lc $text eq lc $self->{report}{osname});
1325              
1326             #print STDERR "_check: metabase=".Dumper($self->{report}{metabase})."\n";
1327             my $textreport = $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content}{textreport};
1328             $textreport =~ s/\\n/\n/g; # newlines may be escaped
1329              
1330             # create a fake mail, as CTC::Article parses a mail like text block
1331             my $mail = <<EMAIL;
1332             From: fake\@example.com
1333             To: fake\@example.com
1334             Subject: PASS Fake-0.01
1335             Date: 01-01-2010 01:01:01 Z
1336              
1337             $textreport
1338             EMAIL
1339             my $object = CPAN::Testers::Common::Article->new( $mail ) or return;
1340             $object->parse_report();
1341              
1342             $self->{report}{osname} = $object->osname;
1343             $self->{report}{platform} = $object->archname;
1344             }
1345              
1346             sub _platform_to_osname {
1347             my $self = shift;
1348             my $arch = shift || return '';
1349              
1350             $OSNAMES = join('|',keys %{$self->{OSNAMES}}) if(keys %{$self->{OSNAMES}});
1351              
1352             return $1 if($arch =~ /($OSNAMES)/i);
1353              
1354             for my $rx (keys %{ $self->{OSNAMES} }) {
1355             return $self->{OSNAMES}{$rx} if($arch =~ /$rx/i);
1356             }
1357              
1358             return '';
1359             }
1360              
1361             sub _send_email {
1362             my $self = shift;
1363             my $t = localtime;
1364             my $DATE = $t->strftime("%a, %d %b %Y %H:%M:%S +0000");
1365             $DATE =~ s/\s+$//;
1366             my $INVALID = join("\n",@{$self->{invalid}});
1367             $self->_log("INVALID:\n$INVALID\n");
1368              
1369             for my $admin (@{$self->{admins}}) {
1370             my $cmd = qq!| $HOW $admin!;
1371              
1372             my $body = $HEAD . $BODY;
1373             $body =~ s/FROM/$FROM/g;
1374             $body =~ s/EMAIL/$admin/g;
1375             $body =~ s/DATE/$DATE/g;
1376             $body =~ s/INVALID/$INVALID/g;
1377              
1378             if(my $fh = IO::File->new($cmd)) {
1379             print $fh $body;
1380             $fh->close;
1381             $self->_log(".. MAIL SEND - SUCCESS - $admin\n");
1382             } else {
1383             $self->_log(".. MAIL SEND - FAILED - $admin\n");
1384             }
1385             }
1386             }
1387              
1388             sub _date_diff {
1389             my ($date1,$date2) = @_;
1390              
1391             my (@dt1) = $date1 =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
1392             my (@dt2) = $date2 =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
1393              
1394             return -1 unless(@dt1 && @dt2);
1395              
1396             my $dt1 = DateTime->new( year => $dt1[0], month => $dt1[1], day => $dt1[2], hour => $dt1[3], minute => $dt1[4], second => $dt1[5], time_zone => 'UTC' )->epoch;
1397             my $dt2 = DateTime->new( year => $dt2[0], month => $dt2[1], day => $dt2[2], hour => $dt2[3], minute => $dt2[4], second => $dt2[5], time_zone => 'UTC' )->epoch;
1398              
1399             return $dt2 - $dt1;
1400             }
1401              
1402             sub _log {
1403             my $self = shift;
1404             my $log = $self->{logfile} or return;
1405             mkpath(dirname($log)) unless(-f $log);
1406             my $fh = IO::File->new($log,'a+') or die "Cannot append to log file [$log]: $!\n";
1407             print $fh $self->{msg} if($self->{msg});
1408             print $fh @_;
1409             $fh->close;
1410             $self->{msg} = '';
1411             }
1412              
1413             1;
1414              
1415             __END__
1416              
1417             =head1 NAME
1418              
1419             CPAN::Testers::Data::Generator - Download and summarize CPAN Testers data
1420              
1421             =head1 SYNOPSIS
1422              
1423             % cpanstats
1424             # ... wait patiently, very patiently
1425             # ... then use the cpanstats MySQL database
1426              
1427             =head1 DESCRIPTION
1428              
1429             This distribution was originally written by Leon Brocard to download and
1430             summarize CPAN Testers data. However, all of the original code has been
1431             rewritten to use the CPAN Testers Statistics database generation code. This
1432             now means that all the CPAN Testers sites including the Reports site, the
1433             Statistics site and the CPAN Dependencies site, can use the same database.
1434              
1435             This module retrieves and parses reports from the Metabase, generating or
1436             updating entries in the cpanstats database, which extracts specific metadata
1437             from the reports. The information in the cpanstats database is then presented
1438             via CPAN::Testers::WWW::Reports on the CPAN Testers Reports website.
1439              
1440             A good example query from the cpanstats database for Acme-Colour would be:
1441              
1442             SELECT version, status, count(*) FROM cpanstats WHERE
1443             dist = "Acme-Colour" group by version, state;
1444              
1445             To create a database from scratch can take several days, as there are now over
1446             24 million submitted reports. As such updating from a known copy of the
1447             database is much more advisable. If you don't want to generate the database
1448             yourself, you can obtain a feed using CPAN::Testers::WWW::Report::Query::Reports.
1449              
1450             With over 24 million reports in the database, if you do plan to run this
1451             software to generate the databases it is recommended you utilise a high-end
1452             processor machine. Even with a reasonable processor it can take over a week!
1453              
1454             =head1 DATABASE SCHEMA
1455              
1456             The cpanstats database schema is very straightforward, one main table with
1457             several index tables to speed up searches. The main table is as below:
1458              
1459             CREATE TABLE `cpanstats` (
1460              
1461             `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
1462             `guid` char(36) NOT NULL DEFAULT '',
1463             `state` varchar(32) DEFAULT NULL,
1464             `postdate` varchar(8) DEFAULT NULL,
1465             `tester` varchar(255) DEFAULT NULL,
1466             `dist` varchar(255) DEFAULT NULL,
1467             `version` varchar(255) DEFAULT NULL,
1468             `platform` varchar(255) DEFAULT NULL,
1469             `perl` varchar(255) DEFAULT NULL,
1470             `osname` varchar(255) DEFAULT NULL,
1471             `osvers` varchar(255) DEFAULT NULL,
1472             `fulldate` varchar(32) DEFAULT NULL,
1473             `type` int(2) DEFAULT '0',
1474            
1475             PRIMARY KEY (`id`),
1476             KEY `guid` (`guid`),
1477             KEY `distvers` (`dist`,`version`),
1478             KEY `tester` (`tester`),
1479             KEY `state` (`state`),
1480             KEY `postdate` (`postdate`)
1481            
1482             )
1483              
1484             It should be noted that 'postdate' refers to the YYYYMM formatted date, whereas
1485             the 'fulldate' field refers to the YYYYMMDDhhmm formatted date and time.
1486              
1487             The metabase database schema is again very straightforward, and consists of one
1488             main table, as below:
1489              
1490             CREATE TABLE `metabase` (
1491            
1492             `guid` char(36) NOT NULL,
1493             `id` int(10) unsigned NOT NULL,
1494             `updated` varchar(32) DEFAULT NULL,
1495             `report` longblob NOT NULL,
1496             `fact` longblob NOT NULL,
1497            
1498             PRIMARY KEY (`guid`),
1499             KEY `id` (`id`),
1500             KEY `updated` (`updated`)
1501            
1502             )
1503              
1504             The id field is a reference to the cpanstats.id field.
1505              
1506             The report field is JSON encoded, and is a cached version of the facts of a
1507             report, while the fact field is the full report fact, and associated child
1508             facts, Sereal encoded. Both are extracted from the returned fact from
1509             Metabase::Librarian.
1510              
1511             See F<examples/cpanstats-createdb> for the full list of tables used.
1512              
1513             =head1 SIGNIFICANT CHANGES
1514              
1515             =head2 v0.31 CHANGES
1516              
1517             With the release of v0.31, a number of changes to the codebase were made as
1518             a further move towards CPAN Testers 2.0. The first change is the name for this
1519             distribution. Now titled 'CPAN-Testers-Data-Generator', this now fits more
1520             appropriately within the CPAN-Testers namespace on CPAN.
1521              
1522             The second significant change is to now reference a MySQL cpanstats database.
1523             The SQLite version is still updated as before, as a number of other websites
1524             and toolsets still rely on that database file format. However, in order to make
1525             the CPAN Testers Reports website more dynamic, an SQLite database is not really
1526             appropriate for a high demand website.
1527              
1528             The database creation code is now available as a standalone program, in the
1529             examples directory, and all the database communication is now handled by the
1530             new distribution CPAN-Testers-Common-DBUtils.
1531              
1532             =head2 v0.41 CHANGES
1533              
1534             In the next stage of development of CPAN Testers 2.0, the id field used within
1535             the database schema above for the cpanstats table no longer matches the NNTP
1536             ID value, although the id in the articles does still reference the NNTP ID, at
1537             least for the reports submitted prior to the switch to the Metabase in 2010.
1538              
1539             In order to correctly reference the id in the articles table, you will need to
1540             use the function guid_to_nntp() with CPAN::Testers::Common::Utils, using the
1541             new guid field in the cpanstats table.
1542              
1543             As of this release the cpanstats id field is a unique auto incrementing field.
1544              
1545             The next release of this distribution will be focused on generation of stats
1546             using the Metabase storage API.
1547              
1548             =head2 v1.00 CHANGES
1549              
1550             Moved to Metabase API. The change to a definite major version number hopefully
1551             indicates that this is a major interface change. All previous NNTP access has
1552             been dropped and is no longer relavent. All report updates are now fed from
1553             the Metabase API.
1554              
1555             =head1 INTERFACE
1556              
1557             =head2 The Constructor
1558              
1559             =over
1560              
1561             =item * new
1562              
1563             Instatiates the object CPAN::Testers::Data::Generator. Accepts a hash containing
1564             values to prepare the object. These are described as:
1565              
1566             my $obj = CPAN::Testers::Data::Generator->new(
1567             logfile => './here/logfile',
1568             config => './here/config.ini'
1569             );
1570              
1571             Where 'logfile' is the location to write log messages. Log messages are only
1572             written if a logfile entry is specified, and will always append to any existing
1573             file. The 'config' should contain the path to the configuration file, used
1574             to define the database access and general operation settings.
1575              
1576             =back
1577              
1578             =head2 Public Methods
1579              
1580             =over
1581              
1582             =item * generate
1583              
1584             Starting from the last cached report, retrieves all the more recent reports
1585             from the Metabase Report Submission server, parsing each and recording each
1586             report in both the cpanstats database and the metabase cache database.
1587              
1588             =item * regenerate
1589              
1590             For a given date range, retrieves all the reports from the Metabase Report
1591             Submission server, parsing each and recording each report in both the cpanstats
1592             database and the metabase cache database.
1593              
1594             Note that as only 2500 can be returned at any one time due to Amazon SimpleDB
1595             restrictions, this method will only process the guids returned from a given
1596             start data, up to a maxiumu of 2500 guids.
1597              
1598             This method will return the guid of the last report processed.
1599              
1600             =item * rebuild
1601              
1602             In the event that the cpanstats database needs regenerating, either in part or
1603             for the whole database, this method allow you to do so. You may supply
1604             parameters as to the 'start' and 'end' values (inclusive), where all records
1605             are assumed by default. Records are rebuilt using the local metabase cache
1606             database.
1607              
1608             =item * reparse
1609              
1610             Rather than a complete rebuild the option to selective reparse selected entries
1611             is useful if there are reports which were previously unable to correctly supply
1612             a particular field, which now has supporting parsing code within the codebase.
1613              
1614             In addition there is the option to exclude fields from parsing checks, where
1615             they may be corrupted, and can be later amended using the 'cpanstats-update'
1616             tool.
1617              
1618             =item * parse
1619              
1620             Unlike reparse, parse is used to parse just missing reports. As such if a
1621             report has already been stored and cached, it won't be processed again, unless
1622             the 'force' option is used.
1623              
1624             In addition, as per reparse, there is the option to exclude fields from parsing
1625             checks, where they may be corrupted, and can be later amended using the
1626             'cpanstats-update' tool.
1627              
1628             =item * tail
1629              
1630             Write to a file, the list of GUIDs returned from a tail request.
1631              
1632             =back
1633              
1634             =head2 Private Methods
1635              
1636             =over
1637              
1638             =item * commit
1639              
1640             To speed up the transaction process, a commit is performed every 500 inserts.
1641             This method is used as part of the clean up process to ensure all transactions
1642             are completed.
1643              
1644             =item * get_tail_guids
1645              
1646             Get the list of GUIDs as would be seen for a tail log.
1647              
1648             =item * get_next_dates
1649              
1650             Get the list of dates to use in the next cycle of report retrieval.
1651              
1652             =item * get_next_guids
1653              
1654             Get the list of GUIDs for the reports that have been submitted since the last
1655             cached report.
1656              
1657             =item * retrieve_reports
1658              
1659             Abstracted loop of requesting GUIDs, then parsing, storing and caching each
1660             report as appropriate.
1661              
1662             =item * already_saved
1663              
1664             Given a guid, determines whether it has already been saved in the local
1665             metabase cache.
1666              
1667             =item * load_fact
1668              
1669             Get a specific report fact for a given GUID, from the local database.
1670              
1671             =item * get_fact
1672              
1673             Get a specific report fact for a given GUID, from the Metabase.
1674              
1675             =item * dereference_report
1676              
1677             When you retrieve the parent report fact from the database, you'll need to
1678             dereference it to ensure the child elements contain the child facts in the
1679             correct format for processing.
1680              
1681             =item * parse_report
1682              
1683             Parses a report extracting the metadata required for the cpanstats database.
1684              
1685             =item * reparse_report
1686              
1687             Parses a report (from a local metabase cache) extracting the metadata required
1688             for the stats database.
1689              
1690             =item * retrieve_report
1691              
1692             Given a guid will attempt to return the report metadata from the cpanstats
1693             database.
1694              
1695             =item * store_report
1696              
1697             Inserts the components of a parsed report into the cpanstats database.
1698              
1699             =item * cache_report
1700              
1701             Inserts a serialised report into a local metabase cache database.
1702              
1703             =item * cache_update
1704              
1705             For the current report will update the local metabase cache with the id used
1706             within the cpanstats database.
1707              
1708             =back
1709              
1710             =head2 Very Private methods
1711              
1712             The following modules load information enmasse to avoid DB connection hogging
1713             and IO blocking. Thus improving performance.
1714              
1715             =over 4
1716              
1717             =item * load_uploads
1718              
1719             Loads the upload information.
1720              
1721             =item * load_authors
1722              
1723             Loads information regarding each author's distribution.
1724              
1725             =item * load_perl_versions
1726              
1727             Loads all the known Perl versions.
1728              
1729             =item * save_perl_versions
1730              
1731             Saves any new Perl versions
1732              
1733             =back
1734              
1735             =head1 HISTORY
1736              
1737             The CPAN Testers was conceived back in May 1998 by Graham Barr and Chris
1738             Nandor as a way to provide multi-platform testing for modules. Today there
1739             are over 40 million tester reports and more than 100 testers each month
1740             giving valuable feedback for users and authors alike.
1741              
1742             =head1 BECOME A TESTER
1743              
1744             Whether you have a common platform or a very unusual one, you can help by
1745             testing modules you install and submitting reports. There are plenty of
1746             module authors who could use test reports and helpful feedback on their
1747             modules and distributions.
1748              
1749             If you'd like to get involved, please take a look at the CPAN Testers Wiki,
1750             where you can learn how to install and configure one of the recommended
1751             smoke tools.
1752              
1753             For further help and advice, please subscribe to the the CPAN Testers
1754             discussion mailing list.
1755              
1756             CPAN Testers Wiki
1757             - http://wiki.cpantesters.org
1758             CPAN Testers Discuss mailing list
1759             - http://lists.cpan.org/showlist.cgi?name=cpan-testers-discuss
1760              
1761             =head1 BUCKETS
1762              
1763             beta6 - 2014-01-21
1764             beta7 - 2014-11-12
1765              
1766             =head1 BUGS, PATCHES & FIXES
1767              
1768             There are no known bugs at the time of this release. However, if you spot a
1769             bug or are experiencing difficulties, that is not explained within the POD
1770             documentation, please send bug reports and patches to the RT Queue (see below).
1771              
1772             Fixes are dependent upon their severity and my availability. Should a fix not
1773             be forthcoming, please feel free to (politely) remind me.
1774              
1775             RT Queue -
1776             http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
1777              
1778             =head1 SEE ALSO
1779              
1780             L<CPAN::Testers::Report>,
1781             L<Metabase>,
1782             L<Metabase::Fact>,
1783             L<CPAN::Testers::Fact::LegacyReport>,
1784             L<CPAN::Testers::Fact::TestSummary>,
1785             L<CPAN::Testers::Metabase::AWS>
1786              
1787             L<CPAN::Testers::WWW::Statistics>
1788              
1789             F<http://www.cpantesters.org/>,
1790             F<http://stats.cpantesters.org/>,
1791             F<http://wiki.cpantesters.org/>
1792              
1793             =head1 AUTHOR
1794              
1795             It should be noted that the original code for this distribution began life
1796             under another name. The original distribution generated data for the original
1797             CPAN Testers website. However, in 2008 the code was reworked to generate data
1798             in the format for the statistics data analysis, which in turn was reworked to
1799             drive the redesign of the all the CPAN Testers websites. To reflect the code
1800             changes, a new name was given to the distribution.
1801              
1802             =head2 CPAN-WWW-Testers-Generator
1803              
1804             Original author: Leon Brocard <acme@astray.com> (C) 2002-2008
1805             Current maintainer: Barbie <barbie@cpan.org> (C) 2008-2010
1806              
1807             =head2 CPAN-Testers-Data-Generator
1808              
1809             Original author: Barbie <barbie@cpan.org> (C) 2008-2015
1810              
1811             =head1 LICENSE
1812              
1813             This module is free software; you can redistribute it and/or
1814             modify it under the Artistic License 2.0.