File Coverage

blib/lib/CPAN/Testers/Data/Generator.pm
Criterion Covered Total %
statement 64 66 96.9
branch n/a
condition n/a
subroutine 22 22 100.0
pod n/a
total 86 88 97.7


line stmt bran cond sub pod time code
1             package CPAN::Testers::Data::Generator;
2              
3 9     9   181400 use warnings;
  9         17  
  9         379  
4 9     9   45 use strict;
  9         13  
  9         244  
5              
6 9     9   41 use vars qw($VERSION);
  9         17  
  9         636  
7             $VERSION = '1.21';
8              
9             #----------------------------------------------------------------------------
10             # Library Modules
11              
12 9     9   6162 use Config::IniFiles;
  9         269800  
  9         422  
13 9     9   6368 use CPAN::Testers::Common::Article;
  9         162597  
  9         71  
14 9     9   5462 use CPAN::Testers::Common::DBUtils;
  9         163793  
  9         68  
15             #use Data::Dumper;
16 9     9   6704 use Data::FlexSerializer;
  9         15808686  
  9         85  
17 9     9   17707 use DateTime;
  9         1394971  
  9         439  
18 9     9   90 use DateTime::Duration;
  9         15  
  9         216  
19 9     9   126 use File::Basename;
  9         14  
  9         915  
20 9     9   50 use File::Path;
  9         13  
  9         517  
21 9     9   7714 use File::Slurp;
  9         41225  
  9         778  
22 9     9   5964 use HTML::Entities;
  9         47225  
  9         773  
23 9     9   81 use IO::File;
  9         17  
  9         1315  
24 9     9   6079 use JSON;
  9         40736  
  9         44  
25 9     9   1252 use Time::Local;
  9         13  
  9         560  
26              
27 9     9   4665 use Metabase 0.004;
  9         1481  
  9         242  
28 9     9   28283 use Metabase::Fact;
  9         142104  
  9         337  
29 9     9   81 use Metabase::Resource;
  9         14  
  9         164  
30 9     9   5777 use CPAN::Testers::Fact::LegacyReport;
  9         11844  
  9         265  
31 9     9   4556 use CPAN::Testers::Fact::TestSummary;
  9         2850  
  9         261  
32 9     9   5027 use CPAN::Testers::Metabase::AWS;
  0            
  0            
33             use CPAN::Testers::Report;
34              
35             #----------------------------------------------------------------------------
36             # Variables
37              
38             my $DIFF = 30; # max difference allowed in seconds
39             my $MINS = 15; # split time in minutes
40              
41             my %testers;
42              
43             my $FROM = 'CPAN Tester Report Server <do_not_reply@cpantesters.org>';
44             my $HOW = '/usr/sbin/sendmail -bm';
45             my $HEAD = 'To: EMAIL
46             From: FROM
47             Date: DATE
48             Subject: CPAN Testers Generator Error Report
49              
50             ';
51              
52             my $BODY = '
53             The following reports failed to parse into the cpanstats database:
54              
55             INVALID
56              
57             Thanks,
58             CPAN Testers Server.
59             ';
60              
61             my @admins = (
62             'barbie@missbarbell.co.uk',
63             #'david@dagolden.com'
64             );
65              
66             my ($OSNAMES,%MAPPINGS);
67              
68             #----------------------------------------------------------------------------
69             # The Application Programming Interface
70              
71             sub new {
72             my $class = shift;
73             my %hash = @_;
74              
75             my $self = {
76             meta_count => 0,
77             stat_count => 0,
78             last => '',
79             };
80             bless $self, $class;
81              
82             # load configuration
83             my $cfg = Config::IniFiles->new( -file => $hash{config} );
84              
85             # configure databases
86             for my $db (qw(CPANSTATS METABASE)) {
87             die "No configuration for $db database\n" unless($cfg->SectionExists($db));
88             my %opts = map {$_ => ($cfg->val($db,$_)||undef);} qw(driver database dbfile dbhost dbport dbuser dbpass);
89             $opts{AutoCommit} = 0;
90             $self->{$db} = CPAN::Testers::Common::DBUtils->new(%opts);
91             die "Cannot configure $db database\n" unless($self->{$db});
92             $self->{$db}->{'mysql_enable_utf8'} = 1 if($opts{driver} =~ /mysql/i);
93             $self->{$db}->{'mysql_auto_reconnect'} = 1 if($opts{driver} =~ /mysql/i);
94             }
95              
96             if($cfg->SectionExists('ADMINISTRATION')) {
97             my @admins = $cfg->val('ADMINISTRATION','admins');
98             $self->{admins} = \@admins;
99             }
100              
101             # command line swtiches override configuration settings
102             for my $key (qw(logfile poll_limit stopfile offset aws_bucket aws_namespace)) {
103             $self->{$key} = $hash{$key} || $cfg->val('MAIN',$key);
104             }
105              
106             $self->{offset} ||= 1;
107             $self->{poll_limit} ||= 1000;
108              
109             my @rows = $self->{METABASE}->get_query('hash','SELECT * FROM testers_email');
110             for my $row (@rows) {
111             $testers{$row->{resource}} = $row->{email};
112             }
113              
114             # build OS names map
115             @rows = $self->{CPANSTATS}->get_query('array','SELECT osname,ostitle FROM osname');
116             for my $row (@rows) {
117             $self->{OSNAMES}{lc $row->[0]} ||= $row->[1];
118             }
119             $OSNAMES = join('|',keys %{$self->{OSNAMES}}) if(keys %{$self->{OSNAMES}});
120              
121             $self->load_uploads();
122             $self->load_authors();
123             $self->load_perl_versions();
124              
125             if($cfg->SectionExists('DISABLE')) {
126             my @values = $cfg->val('DISABLE','LIST');
127             $self->{DISABLE}{$_} = 1 for(@values);
128             }
129              
130             if($cfg->SectionExists('OSNAMES')) {
131             for my $param ($cfg->Parameters('OSNAMES')) {
132             $self->{OSNAMES}{lc $param} ||= lc $cfg->val('OSNAMES',$param);
133             }
134             }
135              
136             if($cfg->SectionExists('MAPPINGS')) {
137             for my $param ($cfg->Parameters('MAPPINGS')) {
138             $MAPPINGS{$param} = [ split(',', $cfg->val('MAPPINGS',$param), 2) ];
139             }
140             }
141              
142             eval {
143             $self->{metabase} = CPAN::Testers::Metabase::AWS->new(
144             bucket => $self->{aws_bucket},
145             namespace => $self->{aws_namespace},
146             );
147             $self->{librarian} = $self->{metabase}->public_librarian;
148             };
149              
150             # if we require remote access, we need the librarian
151             unless($hash{localonly}) {
152             return unless($self->{metabase} && $self->{librarian});
153             }
154              
155             # reports are now stored in a compressed format
156             $self->{serializer} = Data::FlexSerializer->new(
157             detect_compression => 1,
158             detect_json => 1,
159             output_format => 'json'
160             );
161             $self->{serializer2} = Data::FlexSerializer->new(
162             detect_compression => 1,
163             detect_sereal => 1,
164             output_format => 'sereal'
165             );
166              
167             return $self;
168             }
169              
170             sub DESTROY {
171             my $self = shift;
172              
173             $self->save_perl_versions();
174             }
175              
176             #----------------------------------------------------------------------------
177             # Public Methods
178              
179             sub generate {
180             my $self = shift;
181             my $nonstop = shift || 0;
182             my $maxdate = shift;
183             my ($to,@reports);
184              
185             $self->{reparse} = 0;
186              
187             $self->_log("START GENERATE nonstop=$nonstop\n");
188              
189             do {
190             my $start = localtime(time);
191             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
192              
193             if($maxdate) {
194             $to = $maxdate;
195             } else {
196             $to = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
197             }
198              
199             $self->_log("DATES maxdate=$maxdate, to=$to \n");
200              
201             my $data = $self->get_next_dates($to);
202            
203             $self->_consume_reports( $to, $data );
204              
205             $nonstop = 0 if($self->{processed} == 0);
206             $nonstop = 0 if($self->{stopfile} && -f $self->{stopfile});
207             $nonstop = 0 if($maxdate && $maxdate le $to);
208              
209             $self->load_uploads() if($nonstop);
210             $self->load_authors() if($nonstop);
211              
212             $self->_log("CHECK nonstop=$nonstop\n");
213             } while($nonstop);
214             $self->_log("STOP GENERATE nonstop=$nonstop\n");
215             }
216              
217             sub regenerate {
218             my ($self,$hash) = @_;
219              
220             $self->{reparse} = 0;
221              
222             my $maxdate = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
223              
224             $self->_log("START REGENERATE\n");
225              
226             my @data;
227             if($hash->{file}) {
228             my $fh = IO::File->new($hash->{file},'r') or die "Cannot open file [$hash->{file}]: $!\n";
229             while(<$fh>) {
230             s/\s+$//;
231             my ($fval,$tval) = split(/,/,$_,2);
232             my %data;
233             $data{gstart} = $fval if($fval =~ /^\w+-\w+-\w+-\w+-\w+$/);
234             $data{dstart} = $fval if($fval =~ /^\d+-\d+-\d+T\d+:\d+:\d+Z$/);
235             $data{gend} = $tval if($tval =~ /^\w+-\w+-\w+-\w+-\w+$/);
236             $data{dend} = $tval if($tval =~ /^\d+-\d+-\d+T\d+:\d+:\d+Z$/);
237             push @data, \%data;
238             }
239             $fh->close;
240             } else {
241             push @data, { gstart => $hash->{gstart}, gend => $hash->{gend},
242             dstart => $hash->{dstart}, dend => $hash->{dend} };
243             }
244              
245             $self->_consume_reports( $maxdate, \@data );
246              
247             $self->_log("STOP REGENERATE\n");
248             }
249              
250             sub rebuild {
251             my ($self,$hash) = @_;
252             $self->_log("START REBUILD\n");
253              
254             my $start = localtime(time);
255             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
256              
257             $self->{reparse} = 1;
258             $self->{localonly} = $hash->{localonly} ? 1 : 0;
259             $self->{check} = $hash->{check} ? 1 : 0;
260              
261              
262             # selection choices:
263             # 1) from guid [to guid]
264             # 2) from date [to date]
265              
266             $hash->{dstart} = $self->_get_createdate( $hash->{gstart}, $hash->{dstart} );
267             $hash->{dend} = $self->_get_createdate( $hash->{gend}, $hash->{dend} );
268              
269             my @where;
270             push @where, "updated >= '$hash->{dstart}'" if($hash->{dstart});
271             push @where, "updated <= '$hash->{dend}'" if($hash->{dend});
272            
273             my $sql = 'SELECT * FROM metabase' .
274             (@where ? ' WHERE ' . join(' AND ',@where) : '') .
275             ' ORDER BY updated ASC';
276              
277             $self->_log("START sql=[$sql]\n");
278              
279             # $self->{CPANSTATS}->do_query("DELETE FROM cpanstats WHERE id >= $start AND id <= $end");
280              
281             my $iterator = $self->{METABASE}->iterator('hash',$sql);
282             while(my $row = $iterator->()) {
283             $self->_log("GUID [$row->{guid}]");
284             $self->{processed}++;
285              
286             my $report = $self->load_fact(undef,0,$row);
287              
288             unless($report) {
289             $self->_log(" ... no report\n");
290             warn "No report returned [$row->{id},$row->{guid}]\n";
291             next;
292             }
293              
294             $self->{report}{id} = $row->{id};
295             $self->{report}{guid} = $row->{guid};
296             $self->{report}{metabase} = $self->{facts};
297              
298             # corrupt cached report?
299             if($self->reparse_report()) { # true if invalid report
300             $self->_log(".. cannot parse metabase cache report\n");
301             warn "Cannot parse cached report [$row->{id},$row->{guid}]\n";
302             next;
303             }
304              
305             if($self->store_report()) { $self->_log(".. cpanstats stored\n") }
306             else { $self->_log(".. cpanstats not stored\n") }
307             if($self->cache_update()) { $self->_log(".. metabase stored\n") }
308             else { $self->_log(".. bad metabase cache data\n") }
309              
310             $self->{stored}++;
311             $self->{cached}++;
312             }
313              
314             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
315             my $stop = localtime(time);
316             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
317              
318             $self->commit();
319             $self->_log("STOP REBUILD\n");
320             }
321              
322             sub parse {
323             my ($self,$hash) = @_;
324             $self->_log("START PARSE\n");
325              
326             my @guids = $self->_get_guid_list($hash->{guid},$hash->{file});
327             return unless(@guids);
328              
329             $self->{force} ||= 0;
330              
331             for my $guid (@guids) {
332             $self->_log("GUID [$guid]");
333              
334             my ($report,$stored);
335             unless($hash->{force}) {
336             $report = $self->load_fact($guid,1);
337             $stored = $self->retrieve_report($guid);
338             }
339              
340             if($report && $stored) {
341             $self->_log(".. report already stored and cached\n");
342             next;
343             }
344              
345             $report = $self->get_fact($guid);
346              
347             unless($report) {
348             $self->_log(".. report not found [$guid]\n");
349             next;
350             }
351            
352             $self->{report}{guid} = $guid;
353             $hash->{report} = $report;
354             if($self->parse_report(%$hash)) { # true if invalid report
355             $self->_log(".. cannot parse report [$guid]\n");
356             next;
357             }
358              
359             if($self->store_report()) { $self->_log(".. stored"); }
360             else { $self->_log(".. already stored"); }
361              
362             if($self->cache_report()) { $self->_log(".. cached\n"); }
363             else { $self->_log(".. FAIL: bad cache data\n"); }
364             }
365              
366             $self->commit();
367             $self->_log("STOP PARSE\n");
368             return 1;
369             }
370              
371             sub reparse {
372             my ($self,$hash) = @_;
373             $self->_log("START REPARSE\n");
374              
375             my @guids = $self->_get_guid_list($hash->{guid},$hash->{file});
376             return unless(@guids);
377              
378             $self->{reparse} = $self->{force} ? 0 : 1;
379             $self->{localonly} = $hash->{localonly} ? 1 : 0;
380             $self->{check} = $hash->{check} ? 1 : 0;
381              
382             for my $guid (@guids) {
383             $self->_log("GUID [$guid]");
384              
385             my $report;
386             $report = $self->load_fact($guid) unless($hash->{force});
387              
388             if($report) {
389             $self->{report}{metabase} = $report;
390             $self->{report}{guid} = $guid;
391             $hash->{report} = $report;
392             if($self->reparse_report(%$hash)) { # true if invalid report
393             $self->_log(".. cannot parse report [$guid]\n");
394             return 0;
395             }
396             } else {
397             $report = $self->get_fact($guid) unless($report || $hash->{localonly});
398              
399             unless($report) {
400             if($self->{localonly}) {
401             $self->_log(".. report not available locally [$guid]\n");
402             return 0;
403             }
404             $self->_log(".. report not found [$guid]\n");
405             return 0;
406             }
407            
408             $self->{report}{guid} = $guid;
409             $hash->{report} = $report;
410             if($self->parse_report(%$hash)) { # true if invalid report
411             $self->_log(".. cannot parse report [$guid]\n");
412             return 0;
413             }
414             }
415              
416             if($self->store_report()) { $self->_log(".. stored"); }
417             else {
418             if($self->{time} gt $self->{report}{updated}) {
419             $self->_log(".. FAIL: older than requested [$self->{time}]\n");
420             return 0;
421             }
422            
423             $self->_log(".. already stored");
424             }
425             if($self->cache_report()) { $self->_log(".. cached\n"); }
426             else { $self->_log(".. FAIL: bad cache data\n"); }
427             }
428              
429             $self->commit();
430             $self->_log("STOP REPARSE\n");
431             return 1;
432             }
433              
434             sub tail {
435             my ($self,$hash) = @_;
436             return unless($hash->{file});
437              
438             $self->_log("START TAIL\n");
439              
440             my $guids = $self->get_tail_guids();
441             my $fh = IO::File->new($hash->{file},'a+') or die "Cannot read file [$hash->{file}]: $!";
442             print $fh "$_\n" for(@$guids);
443             $fh->close;
444              
445             $self->_log("STOP TAIL\n");
446             }
447              
448             #----------------------------------------------------------------------------
449             # Internal Methods
450              
451             sub commit {
452             my $self = shift;
453             for(qw(CPANSTATS)) {
454             next unless($self->{$_});
455             $self->{$_}->do_commit;
456             }
457             }
458              
459             sub get_tail_guids {
460             my $self = shift;
461             my $guids;
462              
463             eval {
464             # $guids = $self->{librarian}->search(
465             # 'core.type' => 'CPAN-Testers-Report',
466             # 'core.update_time' => { ">", 0 },
467             # '-desc' => 'core.update_time',
468             # '-limit' => $self->{poll_limit},
469             # );
470             $guids = $self->{librarian}->search(
471             '-where' => [
472             '-and' =>
473             [ '-eq' => 'core.type' => 'CPAN-Testers-Report' ],
474             [ '-ge' => 'core.update_time' => 0 ]
475             ],
476             '-order' => [ '-desc' => 'core.update_time' ],
477             '-limit' => $self->{poll_limit},
478             );
479             };
480              
481             $self->_log(" ... Metabase Tail Failed [$@]\n") if($@);
482             $self->_log("Retrieved ".($guids ? scalar(@$guids) : 0)." guids\n");
483              
484             return $guids;
485             }
486              
487             sub get_next_dates {
488             my ($self,$to) = @_;
489             my (@data,$from);
490              
491             my $time = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
492              
493             $self->_log("DATES to=$to, time=$time\n");
494              
495             # note that because Amazon's SimpleDB can return odd entries out of sync, we have to look at previous entries
496             # to ensure we are starting from the right point. Also ignore date/times in the future.
497             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE updated <= ? ORDER BY updated DESC LIMIT 10',$time);
498             for my $row (@rows) {
499             if($from) {
500             my $diff = abs( _date_diff($from,$row->[0]) ); # just interested in the difference
501             $self->_log("get_next_dates from=[$from], updated=[$row->[0]], diff=$diff, DIFF=$DIFF\n");
502             next if($diff < $DIFF);
503             }
504              
505             $from = $row->[0];
506             }
507              
508             $from ||= '1999-01-01T00:00:00Z';
509             if($from gt $to) {
510             my $xx = $from;
511             $from = $to;
512             $to = $xx;
513             }
514              
515             $self->_log("NEXT from=[$from], to=[$to]\n");
516              
517             while($from lt $to) {
518             my @from = $from =~ /(\d+)\-(\d+)\-(\d+)T(\d+):(\d+):(\d+)/;
519             my $dt = DateTime->new(
520             year => $from[0], month => $from[1], day => $from[2],
521             hour => $from[3], minute => $from[4], second => $from[5],
522             );
523             $dt->add( DateTime::Duration->new( minutes => $MINS ) );
524             my $split = sprintf "%sT%sZ", $dt->ymd, $dt->hms;
525             if($split lt $to) {
526             push @data, { dstart => $from, dend => $split };
527             } else {
528             push @data, { dstart => $from, dend => $to };
529             }
530              
531             $from = $split;
532             }
533              
534             return \@data;
535             }
536              
537             sub get_next_guids {
538             my ($self,$start,$end) = @_;
539             my ($guids);
540              
541             $self->{time} ||= 0;
542             $self->{last} ||= 0;
543             $start ||= 0;
544              
545             $self->_log("PRE time=[$self->{time}], last=[$self->{last}], start=[".($start||'')."], end=[".($end||'')."]\n");
546              
547             if($start) {
548             $self->{time} = $start;
549             $self->{time_to} = $end || '';
550             } else {
551             my $time = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
552              
553             # note that because Amazon's SimpleDB can return odd entries out of sync, we have to look at previous entries
554             # to ensure we are starting from the right point. Also ignore date/times in the future.
555             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE updated <= ? ORDER BY updated DESC LIMIT 10',$time);
556             for my $row (@rows) {
557             if($self->{time}) {
558             my $diff = abs( _date_diff($self->{time},$row->[0]) ); # just interested in the difference
559             next if($diff < $DIFF);
560             }
561              
562             $self->{time} = $row->[0];
563             }
564              
565             $self->{time} ||= '1999-01-01T00:00:00Z';
566             if($self->{last} ge $self->{time}) {
567             my @ts = $self->{last} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
568             $ts[1]--;
569             my $ts = timelocal(reverse @ts);
570             @ts = localtime($ts + $self->{offset}); # increment the offset for next time
571             $self->{time} = sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ", $ts[5]+1900,$ts[4]+1,$ts[3], $ts[2],$ts[1],$ts[0];
572             }
573             }
574              
575             $self->_log("START time=[$self->{time}], last=[$self->{last}]\n");
576             $self->{last} = $self->{time};
577              
578             eval {
579             # if($self->{time_to}) {
580             # $guids = $self->{librarian}->search(
581             # 'core.type' => 'CPAN-Testers-Report',
582             # 'core.update_time' => { -and => { ">=" => $self->{time}, "<=" => $self->{time_to} } },
583             # '-asc' => 'core.update_time',
584             # '-limit' => $self->{poll_limit},
585             # );
586             # } else {
587             $guids = $self->{librarian}->search(
588             '-where' => [
589             '-and' =>
590             [ '-eq' => 'core.type' => 'CPAN-Testers-Report' ],
591             [ '-ge' => 'core.update_time' => $self->{time} ]
592             ],
593             '-order' => [ '-asc' => 'core.update_time' ],
594             '-limit' => $self->{poll_limit},
595             );
596             # }
597             };
598              
599             $self->_log(" ... Metabase Search Failed [$@]\n") if($@);
600             $self->_log("Retrieved ".($guids ? scalar(@$guids) : 0)." guids\n");
601             return $guids;
602             }
603              
604             sub retrieve_reports {
605             my ($self,$guids,$start) = @_;
606              
607             if($guids) {
608             for my $guid (@$guids) {
609             $self->_log("GUID [$guid]");
610             $self->{processed}++;
611             $self->{msg} = '';
612              
613             if(my $report = $self->get_fact($guid)) {
614             $self->{report}{guid} = $guid;
615             next if($self->parse_report(report => $report)); # true if invalid report
616              
617             if($self->store_report()) {
618             $self->{msg} .= ".. stored";
619             $self->{stored}++;
620              
621             } else {
622             if($self->{time} gt $self->{report}{updated}) {
623             $self->_log(".. FAIL: older than requested [$self->{time}]\n");
624             next;
625             }
626             $self->{msg} .= ".. already stored";
627             }
628             if($self->cache_report()) { $self->_log(".. cached\n"); $self->{cached}++; }
629             else { $self->_log(".. bad cache data\n"); }
630             } else {
631             $self->_log(".. FAIL\n");
632             }
633             }
634             }
635              
636             $self->commit();
637             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
638             my $stop = localtime(time);
639             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
640              
641             # only email invalid reports during the generate process
642             $self->_send_email() if($self->{invalid});
643             }
644              
645             sub already_saved {
646             my ($self,$guid) = @_;
647             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE guid=?',$guid);
648             return $rows[0]->[0] if(@rows);
649             return 0;
650             }
651              
652             sub load_fact {
653             my ($self,$guid,$check,$row) = @_;
654              
655             if(!$row && $guid) {
656             my @rows = $self->{METABASE}->get_query('hash','SELECT report,fact FROM metabase WHERE guid=?',$guid);
657             $row = $rows[0] if(@rows);
658             }
659              
660             if($row) {
661             if($row->{fact}) {
662             $self->{fact} = $self->{serializer2}->deserialize($row->{fact});
663             $self->{facts} = $self->dereference_report($self->{fact});
664             return $self->{facts};
665             }
666            
667             if($row->{report}) {
668             $self->{facts} = $self->{serializer}->deserialize($row->{report});
669             return $self->{facts};
670             }
671             }
672              
673             $self->_log(" ... no report [guid=$guid]\n") unless($check);
674             return;
675             }
676              
677             sub get_fact {
678             my ($self,$guid) = @_;
679             my $fact;
680             #print STDERR "guid=$guid\n";
681             eval { $fact = $self->{librarian}->extract( $guid ) };
682              
683             if($fact) {
684             $self->{fact} = $fact;
685             return $fact;
686             }
687              
688             $self->_log(" ... no report [guid=$guid] [$@]\n");
689             return;
690             }
691              
692             sub dereference_report {
693             my ($self,$report) = @_;
694             my %facts;
695              
696             my @facts = $report->facts();
697             for my $fact (@facts) {
698             my $name = ref $fact;
699             $facts{$name} = $fact->as_struct;
700             $facts{$name}{content} = decode_json($facts{$name}{content});
701             }
702              
703             return \%facts;
704             }
705              
706             sub parse_report {
707             my ($self,%hash) = @_;
708             my $options = $hash{options};
709             my $report = $hash{report};
710             my $guid = $self->{report}{guid};
711             my $invalid;
712              
713             $self->{report}{created} = $report->{metadata}{core}{creation_time};
714             $self->{report}{updated} = $report->{metadata}{core}{update_time};
715              
716             unless(ref($report) eq 'CPAN::Testers::Report') {
717             $self->{msg} .= ".. ref [" . ref($report) . "]";
718             return 1;
719             }
720              
721             my @facts = $report->facts();
722             for my $fact (@facts) {
723             if(ref $fact eq 'CPAN::Testers::Fact::TestSummary') {
724             $self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'} = $fact->as_struct;
725             $self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'}{content} = decode_json($self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'}{content});
726              
727             $self->{report}{state} = lc $fact->{content}{grade};
728             $self->{report}{platform} = $fact->{content}{archname};
729             $self->{report}{osname} = $self->_osname($fact->{content}{osname});
730             $self->{report}{osvers} = $fact->{content}{osversion};
731             $self->{report}{perl} = $fact->{content}{perl_version};
732             #$self->{report}{created} = $fact->{metadata}{core}{creation_time};
733             #$self->{report}{updated} = $fact->{metadata}{core}{update_time};
734              
735             my $dist = Metabase::Resource->new( $fact->resource );
736             $self->{report}{dist} = $dist->metadata->{dist_name};
737             $self->{report}{version} = $dist->metadata->{dist_version};
738             $self->{report}{resource} = $dist->metadata->{resource};
739              
740             # some distros are a pain!
741             if($self->{report}{version} eq '' && $MAPPINGS{$self->{report}{dist}}) {
742             $self->{report}{version} = $MAPPINGS{$self->{report}{dist}}->[1];
743             $self->{report}{dist} = $MAPPINGS{$self->{report}{dist}}->[0];
744             } elsif($self->{report}{version} eq '') {
745             $self->{report}{version} = 0;
746             }
747              
748             $self->{report}{from} = $self->_get_tester( $fact->creator->resource );
749              
750             # alternative API
751             #my $profile = $fact->creator->user;
752             #$self->{report}{from} = $profile->{email};
753             #$self->{report}{from} =~ s/'/''/g; #'
754             #$self->{report}{dist} = $fact->resource->dist_name;
755             #$self->{report}{version} = $fact->resource->dist_version;
756              
757             } elsif(ref $fact eq 'CPAN::Testers::Fact::LegacyReport') {
758             $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'} = $fact->as_struct;
759             $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content} = decode_json($self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content});
760             $invalid = 'missing textreport' if(length $fact->{content}{textreport} < 10); # what is the smallest report?
761              
762             $self->{report}{perl} = $fact->{content}{perl_version};
763             }
764             }
765              
766             if($invalid) {
767             push @{$self->{invalid}}, {msg => $invalid, guid => $guid};
768             return 1;
769             }
770              
771             # fixes from metabase formatting
772             $self->{report}{perl} =~ s/^v//; # no leading 'v'
773             $self->_check_arch_os();
774              
775             if($self->{report}{created}) {
776             my @created = $self->{report}{created} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/; # 2010-02-23T20:33:52Z
777             $self->{report}{postdate} = sprintf "%04d%02d", $created[0], $created[1];
778             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[0], $created[1], $created[2], $created[3], $created[4];
779             } else {
780             my @created = localtime(time);
781             $self->{report}{postdate} = sprintf "%04d%02d", $created[5]+1900, $created[4]+1;
782             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[5]+1900, $created[4]+1, $created[3], $created[2], $created[1];
783             }
784              
785             $self->{msg} .= ".. time [$self->{report}{created}][$self->{report}{updated}]";
786              
787             $self->{report}{type} = 2;
788             if($self->{DISABLE} && $self->{DISABLE}{$self->{report}{from}}) {
789             $self->{report}{state} .= ':invalid';
790             $self->{report}{type} = 3;
791             } elsif($self->{report}{response} && $self->{report}{response} =~ m!/perl6/!) {
792             # $self->{report}{type} = 6;
793             return 1;
794             }
795              
796             #print STDERR "\n====\nreport=".Dumper($self->{report});
797              
798             return 1 unless($self->_valid_field($guid, 'dist' => $self->{report}{dist}) || ($options && $options->{exclude}{dist}));
799             return 1 unless($self->_valid_field($guid, 'version' => $self->{report}{version}) || ($options && $options->{exclude}{version}));
800             return 1 unless($self->_valid_field($guid, 'from' => $self->{report}{from}) || ($options && $options->{exclude}{from}));
801             return 1 unless($self->_valid_field($guid, 'perl' => $self->{report}{perl}) || ($options && $options->{exclude}{perl}));
802             return 1 unless($self->_valid_field($guid, 'platform' => $self->{report}{platform}) || ($options && $options->{exclude}{platform}));
803             return 1 unless($self->_valid_field($guid, 'osname' => $self->{report}{osname}) || ($options && $options->{exclude}{osname}));
804             return 1 unless($self->_valid_field($guid, 'osvers' => $self->{report}{osvers}) || ($options && $options->{exclude}{osname}));
805              
806             return 0
807             }
808              
809             sub reparse_report {
810             my ($self,%hash) = @_;
811             my $fact = 'CPAN::Testers::Fact::TestSummary';
812             my $options = $hash{options};
813              
814             $self->{report}{metabase}{$fact}{content} = encode_json($self->{report}{metabase}{$fact}{content});
815             my $report = CPAN::Testers::Fact::TestSummary->from_struct( $self->{report}{metabase}{$fact} );
816             my $guid = $self->{report}{guid};
817              
818             $self->{report}{state} = lc $report->{content}{grade};
819             $self->{report}{platform} = $report->{content}{archname};
820             $self->{report}{osname} = $self->_osname($report->{content}{osname});
821             $self->{report}{osvers} = $report->{content}{osversion};
822             $self->{report}{perl} = $report->{content}{perl_version};
823             $self->{report}{created} = $report->{metadata}{core}{creation_time};
824              
825             my $dist = Metabase::Resource->new( $report->{metadata}{core}{resource} );
826             $self->{report}{dist} = $dist->metadata->{dist_name};
827             $self->{report}{version} = $dist->metadata->{dist_version};
828             $self->{report}{resource} = $dist->metadata->{resource};
829              
830             $self->{report}{from} = $self->_get_tester( $report->{metadata}{core}{creator}{resource} );
831              
832             if($self->{report}{created}) {
833             my @created = $self->{report}{created} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/; # 2010-02-23T20:33:52Z
834             $self->{report}{postdate} = sprintf "%04d%02d", $created[0], $created[1];
835             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[0], $created[1], $created[2], $created[3], $created[4];
836             } else {
837             my @created = localtime(time);
838             $self->{report}{postdate} = sprintf "%04d%02d", $created[5]+1900, $created[4]+1;
839             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[5]+1900, $created[4]+1, $created[3], $created[2], $created[1];
840             }
841              
842             $self->{report}{type} = 2;
843             if($self->{DISABLE} && $self->{DISABLE}{$self->{report}{from}}) {
844             $self->{report}{state} .= ':invalid';
845             $self->{report}{type} = 3;
846             } elsif($self->{report}{response} && $self->{report}{response} =~ m!/perl6/!) {
847             # $self->{report}{type} = 6;
848             return 1;
849             }
850              
851             return 1 unless($self->_valid_field($guid, 'dist' => $self->{report}{dist}) || ($options && $options->{exclude}{dist}));
852             return 1 unless($self->_valid_field($guid, 'version' => $self->{report}{version}) || ($options && $options->{exclude}{version}));
853             return 1 unless($self->_valid_field($guid, 'from' => $self->{report}{from}) || ($options && $options->{exclude}{from}));
854             return 1 unless($self->_valid_field($guid, 'perl' => $self->{report}{perl}) || ($options && $options->{exclude}{perl}));
855             return 1 unless($self->_valid_field($guid, 'platform' => $self->{report}{platform}) || ($options && $options->{exclude}{platform}));
856             return 1 unless($self->_valid_field($guid, 'osname' => $self->{report}{osname}) || ($options && $options->{exclude}{osname}));
857             return 1 unless($self->_valid_field($guid, 'osvers' => $self->{report}{osvers}) || ($options && $options->{exclude}{osname}));
858              
859             return 0;
860             }
861              
862             sub retrieve_report {
863             my $self = shift;
864             my $guid = shift or return;
865              
866             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT * FROM cpanstats WHERE guid=?',$guid);
867             return $rows[0] if(@rows);
868             return;
869             }
870              
871             sub store_report {
872             my $self = shift;
873             my @fields = qw(guid state postdate from dist version platform perl osname osvers fulldate type uploadid);
874              
875             my %fields = map {$_ => $self->{report}{$_}} @fields;
876             $fields{$_} ||= 0 for(qw(type uploadid));
877             $fields{$_} ||= '0' for(qw(perl));
878             $fields{$_} ||= '' for(@fields);
879             $fields{uploadid} ||= $self->{upload}{$fields{dist}}{$fields{version}};
880              
881             my @values = map {$fields{$_}} @fields;
882              
883             my %SQL = (
884             'SELECT' => {
885             CPANSTATS => 'SELECT id FROM cpanstats WHERE guid=?',
886             RELEASE => 'SELECT id FROM release_data WHERE guid=?',
887             },
888             'INSERT' => {
889             CPANSTATS => 'INSERT INTO cpanstats (guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type,uploadid) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)',
890             RELEASE => 'INSERT INTO release_data (id,guid,dist,version,oncpan,distmat,perlmat,patched,pass,fail,na,unknown,uploadid) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)',
891             PASSES => 'INSERT IGNORE passreports SET platform=?, osname=?, perl=?, dist=?, postdate=?',
892             },
893             'UPDATE' => {
894             CPANSTATS => 'UPDATE cpanstats SET state=?,postdate=?,tester=?,dist=?,version=?,platform=?,perl=?,osname=?,osvers=?,fulldate=?,type=?,uploadid=? WHERE guid=?',
895             RELEASE => 'UPDATE release_data SET id=?,dist=?,version=?,oncpan=?,distmat=?,perlmat=?,patched=?,pass=?,fail=?,na=?,unknown=?,uploadid=? WHERE guid=?',
896             },
897             );
898              
899             # update the mysql database
900             my @rows = $self->{CPANSTATS}->get_query('array',$SQL{SELECT}{CPANSTATS},$values[0]);
901             if(@rows) {
902             if($self->{reparse}) {
903             my ($guid,@update) = @values;
904             if($self->{check}) {
905             $self->_log( "CHECK: $SQL{UPDATE}{CPANSTATS},[" . join(',',@update,$guid) . "]\n" );
906             } else {
907             $self->{CPANSTATS}->do_query($SQL{UPDATE}{CPANSTATS},@update,$guid);
908             }
909             } else {
910             $self->{report}{id} = $rows[0]->[0];
911             return 0;
912             }
913             } else {
914             if($self->{check}) {
915             $self->_log( "CHECK: $SQL{INSERT}{CPANSTATS},[" . join(',',@values) . "]\n" );
916             } else {
917             $self->{report}{id} = $self->{CPANSTATS}->id_query($SQL{INSERT}{CPANSTATS},@values);
918             }
919             }
920              
921             # in check mode, assume the rest happens
922             return 1 if($self->{check});
923              
924             # perl version components
925             my ($perl,$patch,$devel) = $self->_get_perl_version($fields{perl});
926              
927             # only valid perl5 reports
928             if($self->{report}{type} == 2) {
929             $fields{id} = $self->{report}{id};
930              
931             # push page requests
932             # - note we only update the author if this is the *latest* version of the distribution
933             my $author = $self->{report}{pauseid} || $self->_get_author($fields{uploadid},$fields{dist},$fields{version});
934             $self->{CPANSTATS}->do_query("INSERT INTO page_requests (type,name,weight,id) VALUES ('author',?,1,?)",$author,$fields{id}) if($author);
935             $self->{CPANSTATS}->do_query("INSERT INTO page_requests (type,name,weight,id) VALUES ('distro',?,1,?)",$fields{dist},$fields{id});
936              
937             my @rows = $self->{CPANSTATS}->get_query('array',$SQL{SELECT}{RELEASE},$fields{guid});
938             #print STDERR "# select release $SQL{SELECT}{RELEASE},$fields{guid}\n";
939             if(@rows) {
940             if($self->{reparse}) {
941             $self->{CPANSTATS}->do_query($SQL{UPDATE}{RELEASE},
942             $fields{id}, # id,
943             $fields{dist},$fields{version}, # dist, version
944              
945             $self->_oncpan($fields{uploadid},$fields{dist},$fields{version}) ? 1 : 2,
946              
947             $fields{version} =~ /_/ ? 2 : 1,
948             $devel ? 2 : 1,
949             $patch ? 2 : 1,
950              
951             $fields{state} eq 'pass' ? 1 : 0,
952             $fields{state} eq 'fail' ? 1 : 0,
953             $fields{state} eq 'na' ? 1 : 0,
954             $fields{state} eq 'unknown' ? 1 : 0,
955              
956             $fields{uploadid},
957              
958             $fields{guid}); # guid
959             }
960             } else {
961             #print STDERR "# insert release $SQL{INSERT}{RELEASE},$fields[0],$fields[1]\n";
962             $self->{CPANSTATS}->do_query($SQL{INSERT}{RELEASE},
963             $fields{id},$fields{guid}, # id, guid
964             $fields{dist},$fields{version}, # dist, version
965              
966             $self->_oncpan($fields{uploadid},$fields{dist},$fields{version}) ? 1 : 2,
967              
968             $fields{version} =~ /_/ ? 2 : 1,
969             $devel ? 2 : 1,
970             $patch ? 2 : 1,
971              
972             $fields{state} eq 'pass' ? 1 : 0,
973             $fields{state} eq 'fail' ? 1 : 0,
974             $fields{state} eq 'na' ? 1 : 0,
975             $fields{state} eq 'unknown' ? 1 : 0,
976              
977             $fields{uploadid});
978             }
979             }
980              
981             if($fields{state} eq 'pass') {
982             $fields{perl} =~ s/\s.*//; # only need to know the main release
983             $self->{CPANSTATS}->do_query($SQL{INSERT}{PASSES},
984             $fields{platform},
985             $fields{osname},
986             $fields{perl},
987             $fields{dist},
988             $fields{postdate});
989             }
990              
991             if((++$self->{stat_count} % 500) == 0) {
992             $self->commit;
993             }
994              
995             return 1;
996             }
997              
998             sub cache_report {
999             my $self = shift;
1000             return 0 unless($self->{report}{guid} && $self->{report}{metabase});
1001              
1002             # in check mode, assume the rest happens
1003             return 1 if($self->{check});
1004             return 1 if($self->{localonly});
1005              
1006             my ($json,$data,$fact);
1007              
1008             eval { $json = encode_json($self->{report}{metabase}); };
1009             eval { $data = $self->{serializer}->serialize("$json"); };
1010             eval { $data = $self->{serializer}->serialize( $self->{report}{metabase} ); } if($@);
1011             eval { $fact = $self->{serializer2}->serialize($self->{fact}); };
1012              
1013             $data ||= '';
1014             $fact ||= '';
1015              
1016             $self->{METABASE}->do_query('INSERT IGNORE INTO metabase (guid,id,updated,report,fact) VALUES (?,?,?,?,?)',
1017             $self->{report}{guid},$self->{report}{id},$self->{report}{updated},$data,$fact);
1018              
1019             if((++$self->{meta_count} % 500) == 0) {
1020             $self->{METABASE}->do_commit;
1021             }
1022              
1023             return 1;
1024             }
1025              
1026             sub cache_update {
1027             my $self = shift;
1028             return 0 unless($self->{report}{guid} && $self->{report}{id});
1029              
1030             # in check mode, assume the rest happens
1031             return 1 if($self->{check});
1032             return 1 if($self->{localonly});
1033              
1034             $self->{METABASE}->do_query('UPDATE metabase SET id=? WHERE guid=?',$self->{report}{id},$self->{report}{guid});
1035              
1036             if((++$self->{meta_count} % 500) == 0) {
1037             $self->{METABASE}->do_commit;
1038             }
1039              
1040             return 1;
1041             }
1042              
1043             #----------------------------------------------------------------------------
1044             # Internal Cache Methods
1045              
1046             sub load_uploads {
1047             my $self = shift;
1048              
1049             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT uploadid,dist,version,type FROM uploads');
1050             for my $row (@rows) {
1051             $self->{oncpan}{$row->{uploadid}} = $row->{type};
1052             $self->{upload}{$row->{dist}}{$row->{version}} = $row->{uploadid};
1053             }
1054             }
1055              
1056             sub load_authors {
1057             my $self = shift;
1058              
1059             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT author,dist,version,uploadid FROM ixlatest');
1060             for my $row (@rows) {
1061             $self->{author}{$row->{dist}}{$row->{version}} = $row->{author};
1062             $self->{author2}{$row->{uploadid}} = $row->{author};
1063             }
1064             }
1065              
1066             sub load_perl_versions {
1067             my $self = shift;
1068              
1069             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT * FROM perl_version');
1070             for my $row (@rows) {
1071             $self->{perls}{$row->{version}} = {
1072             perl => $row->{perl},
1073             patch => $row->{patch},
1074             devel => $row->{devel},
1075             saved => 1
1076             };
1077             }
1078             }
1079              
1080             sub save_perl_versions {
1081             my $self = shift;
1082              
1083             for my $vers (keys %{ $self->{perls} }) {
1084             next if($self->{perls}{$vers}{saved});
1085             $self->{CPANSTATS}->do_query("INSERT INTO perl_version (version,perl,patch,devel) VALUES (?,?,?,?)",
1086             $vers, $self->{perls}{$vers}{perl}, $self->{perls}{$vers}{patch}, $self->{perls}{$vers}{devel});
1087             }
1088             }
1089              
1090             #----------------------------------------------------------------------------
1091             # Private Methods
1092              
1093             sub _consume_reports {
1094             my ($self,$maxdate,$dataset) = @_;
1095              
1096             for my $data (@$dataset) {
1097             my $start = $self->_get_createdate( $data->{gstart}, $data->{dstart} );
1098             my $end = $self->_get_createdate( $data->{gend}, $data->{dend} );
1099              
1100             unless($start && $end) {
1101             $start ||= '';
1102             $end ||= '';
1103             $self->_log("BAD DATES: start=$start, end=$end [missing dates]\n");
1104             next;
1105             }
1106             if($start ge $end) {
1107             $self->_log("BAD DATES: start=$start, end=$end [end before start]\n");
1108             next;
1109             }
1110             # if($end gt $maxdate) {
1111             # $self->_log("BAD DATES: start=$start, end=$end [exceeds $maxdate]\n");
1112             # next;
1113             # }
1114              
1115             $self->_log("LOOP: start=$start, end=$end\n");
1116              
1117             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
1118              
1119             # what guids do we already have?
1120             my $sql = 'SELECT guid FROM metabase WHERE updated >= ? AND updated <= ? ORDER BY updated asc';
1121             my @guids = $self->{METABASE}->get_query('hash',$sql,$data->{dstart},$data->{dend});
1122             my %guids = map {$_->{guid} => 1} @guids;
1123              
1124             # note that because Amazon's SimpleDB can return odd entries out of
1125             # sync, we have to look at previous entries to ensure we are starting
1126             # from the right point
1127             my ($update,$prev,$last) = ($start,$start,$start);
1128             my @times = ();
1129              
1130             my $prior = [ 0, 0 ];
1131             my $saved = 0;
1132             while($update lt $end) {
1133             $self->_log("UPDATE: update=$update, end=$end, saved=$saved, guids=".(scalar(@guids))."\n");
1134              
1135             # get list of guids from last update date
1136             my $guids = $self->get_next_guids($update,$end);
1137             last unless($guids);
1138              
1139             @guids = grep { !$guids{$_} } @$guids;
1140             last unless(@guids);
1141             last if($prior->[0] eq $guids[0] && $prior->[1] eq $guids[-1]); # prevent an endless loop
1142             $prior = [ $guids[0], $guids[-1] ];
1143              
1144             $self->_log("UPDATE: todo guids=".(scalar(@guids))."\n");
1145              
1146             my $current = $update;
1147             for my $guid (@guids) {
1148             # don't process too far
1149             shift @times if(@times > 9); # one off
1150             push @times, [ $current, (_date_diff($end,$current) <= 0 ? 0 : 1) ]; # one on ... max 10
1151              
1152             my $times = 0;
1153             $times += $_->[1] for(@times);
1154             last if($times == 10); # stop if all greater than end
1155              
1156             # okay process
1157             $self->_log("GUID [$guid]");
1158              
1159             $self->{processed}++;
1160              
1161             if(my $time = $self->already_saved($guid)) {
1162             $self->_log(".. already saved [$time]\n");
1163             $current = $time;
1164             $saved++;
1165             next;
1166             }
1167              
1168             if(my $report = $self->get_fact($guid)) {
1169             $current = $report->{metadata}{core}{update_time};
1170             $self->{report}{guid} = $guid;
1171             next if($self->parse_report(report => $report)); # true if invalid report
1172              
1173             if($self->store_report()) { $self->_log(".. stored"); $self->{stored}++; }
1174             else { $self->_log(".. already stored"); }
1175             if($self->cache_report()) { $self->_log(".. cached\n"); $self->{cached}++; }
1176             else { $self->_log(".. bad cache data\n"); }
1177             } else {
1178             $self->_log(".. FAIL\n");
1179             }
1180             }
1181              
1182             $update = $times[0]->[0];
1183              
1184             $self->commit();
1185             }
1186              
1187             $self->commit();
1188             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
1189             my $stop = localtime(time);
1190             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
1191             }
1192              
1193             # only email invalid reports during the generate process
1194             $self->_send_email() if($self->{invalid});
1195             }
1196              
1197             sub _get_perl_version {
1198             my $self = shift;
1199             my $vers = shift;
1200              
1201             unless($self->{perls}{$vers}) {
1202             my $patch = $vers =~ /^5.(7|9|[1-9][13579])/ ? 1 : 0, # odd numbers now mark development releases
1203             my $devel = $vers =~ /(RC\d+|patch)/ ? 1 : 0,
1204             my ($perl) = $vers =~ /(5\.\d+(?:\.\d+)?)/;
1205              
1206             $self->{perls}{$vers} = {
1207             perl => $perl,
1208             patch => $patch,
1209             devel => $devel,
1210             saved => 0
1211             };
1212             }
1213              
1214             return $self->{perls}{$vers}{perl}, $self->{perls}{$vers}{patch}, $self->{perls}{$vers}{devel};
1215             }
1216              
1217             sub _get_guid_list {
1218             my ($self,$guid,$file) = @_;
1219             my (@ids,@guids);
1220              
1221             # we're only parsing one id
1222             if($guid) {
1223             if($guid =~ /^\d+$/) { push @ids, $guid }
1224             else { push @guids, $guid }
1225             } elsif($file) {
1226             my $fh = IO::File->new($file,'r') or die "Cannot read file [$file]: $!";
1227             while(<$fh>) {
1228             chomp;
1229             my ($num) = (m/^([\da-z-]+)/i);
1230             if($num =~ /^\d+$/) { push @ids, $num }
1231             else { push @guids, $num }
1232             }
1233             $fh->close;
1234             } else {
1235             return;
1236             }
1237              
1238             # turn ids into guids
1239             if(@ids) {
1240             my @rows = $self->{CPANSTATS}->get_query('array','SELECT guid FROM cpanstats WHERE id IN ('.join(',',@ids).')');
1241             push @guids, $_->[0] for(@rows);
1242             }
1243              
1244             my %guids = map {$_ => 1} @guids;
1245             my @list = keys %guids;
1246             return @list;
1247             }
1248              
1249             sub _get_createdate {
1250             my ($self,$guid,$date) = @_;
1251              
1252             return unless($guid || $date);
1253             if($guid) {
1254             my @rows = $self->{METABASE}->get_query('hash','SELECT updated FROM metabase WHERE guid=?',$guid);
1255             $date = $rows[0]->{updated} if(@rows);
1256             }
1257              
1258             return unless($date && $date =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
1259             return $date;
1260             }
1261              
1262             sub _get_tester {
1263             my ($self,$creator) = @_;
1264             return $testers{$creator} if($testers{$creator});
1265              
1266             my $profile = Metabase::Resource->new( $creator );
1267             return $creator unless($profile);
1268              
1269             my $user;
1270             eval { $user = $self->{librarian}->extract( $profile->guid ) };
1271             return $creator unless($user);
1272              
1273             my ($name,@emails);
1274             for my $fact ($user->facts()) {
1275             if(ref $fact eq 'Metabase::User::EmailAddress') {
1276             push @emails, $fact->{content};
1277             } elsif(ref $fact eq 'Metabase::User::FullName') {
1278             $name = encode_entities($fact->{content});
1279             }
1280             }
1281              
1282             $name ||= 'NONAME'; # shouldn't happen, but allows for checks later
1283              
1284             for my $em (@emails) {
1285             $self->{METABASE}->do_query('INSERT INTO testers_email (resource,fullname,email) VALUES (?,?,?)',$creator,$name,$em);
1286             }
1287              
1288             $testers{$creator} = @emails ? $emails[0] : $creator;
1289             $testers{$creator} =~ s/\'/''/g if($testers{$creator});
1290             return $testers{$creator};
1291             }
1292              
1293             sub _get_author {
1294             my ($self,$upid,$dist,$vers) = @_;
1295              
1296             my $author = $self->{author2}{$upid} || '';
1297             $author ||= $self->{author}{$dist}{$vers} || '';
1298              
1299             return $author;
1300             }
1301              
1302             sub _valid_field {
1303             my ($self,$id,$name,$value) = @_;
1304             return 1 if(defined $value);
1305             $self->_log(" . [$id] ... missing field: $name\n");
1306             return 0;
1307             }
1308              
1309             sub _get_lastid {
1310             my $self = shift;
1311              
1312             my @rows = $self->{METABASE}->get_query('array',"SELECT MAX(id) FROM metabase");
1313             return 0 unless(@rows);
1314             return $rows[0]->[0] || 0;
1315             }
1316              
1317             sub _oncpan {
1318             my ($self,$upid,$dist,$vers) = @_;
1319            
1320             $upid ||= $self->{upload}{$dist}{$vers};
1321             my $type = $self->{oncpan}{$upid};
1322              
1323             return 1 unless($type); # assume it's a new release
1324             return 0 if($type eq 'backpan'); # on backpan only
1325             return 1; # on cpan or new upload
1326             }
1327              
1328             sub _osname {
1329             my $self = shift;
1330             my $name = shift || return '';
1331              
1332             my $lname = lc $name;
1333             my $uname = uc $name;
1334             $self->{OSNAMES}{$lname} ||= do {
1335             $self->{CPANSTATS}->do_query(qq{INSERT INTO osname (osname,ostitle) VALUES ('$name','$uname')});
1336             $uname;
1337             };
1338              
1339             return $self->{OSNAMES}{$lname};
1340             }
1341              
1342             sub _check_arch_os {
1343             my $self = shift;
1344              
1345             my $text = $self->_platform_to_osname($self->{report}{platform});
1346             #print STDERR "_check: text=$text\n";
1347             #print STDERR "_check: platform=$self->{report}{platform}\n";
1348             #print STDERR "_check: osname=$self->{report}{osname}\n";
1349             return if($text && $self->{report}{osname} && lc $text eq lc $self->{report}{osname});
1350              
1351             #print STDERR "_check: metabase=".Dumper($self->{report}{metabase})."\n";
1352             my $textreport = $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content}{textreport};
1353             $textreport =~ s/\\n/\n/g; # newlines may be escaped
1354              
1355             # create a fake mail, as CTC::Article parses a mail like text block
1356             my $mail = <<EMAIL;
1357             From: fake\@example.com
1358             To: fake\@example.com
1359             Subject: PASS Fake-0.01
1360             Date: 01-01-2010 01:01:01 Z
1361              
1362             $textreport
1363             EMAIL
1364             my $object = CPAN::Testers::Common::Article->new( $mail ) or return;
1365             $object->parse_report();
1366              
1367             $self->{report}{osname} = $object->osname;
1368             $self->{report}{platform} = $object->archname;
1369             }
1370              
1371             sub _platform_to_osname {
1372             my $self = shift;
1373             my $arch = shift || return '';
1374              
1375             $OSNAMES = join('|',keys %{$self->{OSNAMES}}) if(keys %{$self->{OSNAMES}});
1376              
1377             return $1 if($arch =~ /($OSNAMES)/i);
1378              
1379             for my $rx (keys %{ $self->{OSNAMES} }) {
1380             return $self->{OSNAMES}{$rx} if($arch =~ /$rx/i);
1381             }
1382              
1383             return '';
1384             }
1385              
1386             sub _send_email {
1387             my $self = shift;
1388             my $t = localtime;
1389             my $DATE = $t->strftime("%a, %d %b %Y %H:%M:%S +0000");
1390             $DATE =~ s/\s+$//;
1391             my $INVALID = join("\n",@{$self->{invalid}});
1392             $self->_log("INVALID:\n$INVALID\n");
1393              
1394             for my $admin (@{$self->{admins}}) {
1395             my $cmd = qq!| $HOW $admin!;
1396              
1397             my $body = $HEAD . $BODY;
1398             $body =~ s/FROM/$FROM/g;
1399             $body =~ s/EMAIL/$admin/g;
1400             $body =~ s/DATE/$DATE/g;
1401             $body =~ s/INVALID/$INVALID/g;
1402              
1403             if(my $fh = IO::File->new($cmd)) {
1404             print $fh $body;
1405             $fh->close;
1406             $self->_log(".. MAIL SEND - SUCCESS - $admin\n");
1407             } else {
1408             $self->_log(".. MAIL SEND - FAILED - $admin\n");
1409             }
1410             }
1411             }
1412              
1413             sub _date_diff {
1414             my ($date1,$date2) = @_;
1415              
1416             my (@dt1) = $date1 =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
1417             my (@dt2) = $date2 =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
1418              
1419             return -1 unless(@dt1 && @dt2);
1420              
1421             my $dt1 = DateTime->new( year => $dt1[0], month => $dt1[1], day => $dt1[2], hour => $dt1[3], minute => $dt1[4], second => $dt1[5], time_zone => 'UTC' )->epoch;
1422             my $dt2 = DateTime->new( year => $dt2[0], month => $dt2[1], day => $dt2[2], hour => $dt2[3], minute => $dt2[4], second => $dt2[5], time_zone => 'UTC' )->epoch;
1423              
1424             return $dt2 - $dt1;
1425             }
1426              
1427             sub _log {
1428             my $self = shift;
1429             my $log = $self->{logfile} or return;
1430             mkpath(dirname($log)) unless(-f $log);
1431             my $fh = IO::File->new($log,'a+') or die "Cannot append to log file [$log]: $!\n";
1432             print $fh $self->{msg} if($self->{msg});
1433             print $fh @_;
1434             $fh->close;
1435             $self->{msg} = '';
1436             }
1437              
1438             1;
1439              
1440             __END__
1441              
1442             =head1 NAME
1443              
1444             CPAN::Testers::Data::Generator - Download and summarize CPAN Testers data
1445              
1446             =head1 SYNOPSIS
1447              
1448             % cpanstats
1449             # ... wait patiently, very patiently
1450             # ... then use the cpanstats MySQL database
1451              
1452             =head1 DESCRIPTION
1453              
1454             This distribution was originally written by Leon Brocard to download and
1455             summarize CPAN Testers data. However, all of the original code has been
1456             rewritten to use the CPAN Testers Statistics database generation code. This
1457             now means that all the CPAN Testers sites including the Reports site, the
1458             Statistics site and the CPAN Dependencies site, can use the same database.
1459              
1460             This module retrieves and parses reports from the Metabase, generating or
1461             updating entries in the cpanstats database, which extracts specific metadata
1462             from the reports. The information in the cpanstats database is then presented
1463             via CPAN::Testers::WWW::Reports on the CPAN Testers Reports website.
1464              
1465             A good example query from the cpanstats database for Acme-Colour would be:
1466              
1467             SELECT version, status, count(*) FROM cpanstats WHERE
1468             dist = "Acme-Colour" group by version, state;
1469              
1470             To create a database from scratch can take several days, as there are now over
1471             24 million submitted reports. As such updating from a known copy of the
1472             database is much more advisable. If you don't want to generate the database
1473             yourself, you can obtain a feed using CPAN::Testers::WWW::Report::Query::Reports.
1474              
1475             With over 24 million reports in the database, if you do plan to run this
1476             software to generate the databases it is recommended you utilise a high-end
1477             processor machine. Even with a reasonable processor it can take over a week!
1478              
1479             =head1 DATABASE SCHEMA
1480              
1481             The cpanstats database schema is very straightforward, one main table with
1482             several index tables to speed up searches. The main table is as below:
1483              
1484             CREATE TABLE `cpanstats` (
1485              
1486             `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
1487             `guid` char(36) NOT NULL DEFAULT '',
1488             `state` varchar(32) DEFAULT NULL,
1489             `postdate` varchar(8) DEFAULT NULL,
1490             `tester` varchar(255) DEFAULT NULL,
1491             `dist` varchar(255) DEFAULT NULL,
1492             `version` varchar(255) DEFAULT NULL,
1493             `platform` varchar(255) DEFAULT NULL,
1494             `perl` varchar(255) DEFAULT NULL,
1495             `osname` varchar(255) DEFAULT NULL,
1496             `osvers` varchar(255) DEFAULT NULL,
1497             `fulldate` varchar(32) DEFAULT NULL,
1498             `type` int(2) DEFAULT '0',
1499            
1500             PRIMARY KEY (`id`),
1501             KEY `guid` (`guid`),
1502             KEY `distvers` (`dist`,`version`),
1503             KEY `tester` (`tester`),
1504             KEY `state` (`state`),
1505             KEY `postdate` (`postdate`)
1506            
1507             )
1508              
1509             It should be noted that 'postdate' refers to the YYYYMM formatted date, whereas
1510             the 'fulldate' field refers to the YYYYMMDDhhmm formatted date and time.
1511              
1512             The metabase database schema is again very straightforward, and consists of one
1513             main table, as below:
1514              
1515             CREATE TABLE `metabase` (
1516            
1517             `guid` char(36) NOT NULL,
1518             `id` int(10) unsigned NOT NULL,
1519             `updated` varchar(32) DEFAULT NULL,
1520             `report` longblob NOT NULL,
1521             `fact` longblob NOT NULL,
1522            
1523             PRIMARY KEY (`guid`),
1524             KEY `id` (`id`),
1525             KEY `updated` (`updated`)
1526            
1527             )
1528              
1529             The id field is a reference to the cpanstats.id field.
1530              
1531             The report field is JSON encoded, and is a cached version of the facts of a
1532             report, while the fact field is the full report fact, and associated child
1533             facts, Sereal encoded. Both are extracted from the returned fact from
1534             Metabase::Librarian.
1535              
1536             See F<examples/cpanstats-createdb> for the full list of tables used.
1537              
1538             =head1 SIGNIFICANT CHANGES
1539              
1540             =head2 v0.31 CHANGES
1541              
1542             With the release of v0.31, a number of changes to the codebase were made as
1543             a further move towards CPAN Testers 2.0. The first change is the name for this
1544             distribution. Now titled 'CPAN-Testers-Data-Generator', this now fits more
1545             appropriately within the CPAN-Testers namespace on CPAN.
1546              
1547             The second significant change is to now reference a MySQL cpanstats database.
1548             The SQLite version is still updated as before, as a number of other websites
1549             and toolsets still rely on that database file format. However, in order to make
1550             the CPAN Testers Reports website more dynamic, an SQLite database is not really
1551             appropriate for a high demand website.
1552              
1553             The database creation code is now available as a standalone program, in the
1554             examples directory, and all the database communication is now handled by the
1555             new distribution CPAN-Testers-Common-DBUtils.
1556              
1557             =head2 v0.41 CHANGES
1558              
1559             In the next stage of development of CPAN Testers 2.0, the id field used within
1560             the database schema above for the cpanstats table no longer matches the NNTP
1561             ID value, although the id in the articles does still reference the NNTP ID, at
1562             least for the reports submitted prior to the switch to the Metabase in 2010.
1563              
1564             In order to correctly reference the id in the articles table, you will need to
1565             use the function guid_to_nntp() with CPAN::Testers::Common::Utils, using the
1566             new guid field in the cpanstats table.
1567              
1568             As of this release the cpanstats id field is a unique auto incrementing field.
1569              
1570             The next release of this distribution will be focused on generation of stats
1571             using the Metabase storage API.
1572              
1573             =head2 v1.00 CHANGES
1574              
1575             Moved to Metabase API. The change to a definite major version number hopefully
1576             indicates that this is a major interface change. All previous NNTP access has
1577             been dropped and is no longer relavent. All report updates are now fed from
1578             the Metabase API.
1579              
1580             =head1 INTERFACE
1581              
1582             =head2 The Constructor
1583              
1584             =over
1585              
1586             =item * new
1587              
1588             Instatiates the object CPAN::Testers::Data::Generator. Accepts a hash containing
1589             values to prepare the object. These are described as:
1590              
1591             my $obj = CPAN::Testers::Data::Generator->new(
1592             logfile => './here/logfile',
1593             config => './here/config.ini'
1594             );
1595              
1596             Where 'logfile' is the location to write log messages. Log messages are only
1597             written if a logfile entry is specified, and will always append to any existing
1598             file. The 'config' should contain the path to the configuration file, used
1599             to define the database access and general operation settings.
1600              
1601             =back
1602              
1603             =head2 Public Methods
1604              
1605             =over
1606              
1607             =item * generate
1608              
1609             Starting from the last cached report, retrieves all the more recent reports
1610             from the Metabase Report Submission server, parsing each and recording each
1611             report in both the cpanstats database and the metabase cache database.
1612              
1613             =item * regenerate
1614              
1615             For a given date range, retrieves all the reports from the Metabase Report
1616             Submission server, parsing each and recording each report in both the cpanstats
1617             database and the metabase cache database.
1618              
1619             Note that as only 2500 can be returned at any one time due to Amazon SimpleDB
1620             restrictions, this method will only process the guids returned from a given
1621             start data, up to a maxiumu of 2500 guids.
1622              
1623             This method will return the guid of the last report processed.
1624              
1625             =item * rebuild
1626              
1627             In the event that the cpanstats database needs regenerating, either in part or
1628             for the whole database, this method allow you to do so. You may supply
1629             parameters as to the 'start' and 'end' values (inclusive), where all records
1630             are assumed by default. Records are rebuilt using the local metabase cache
1631             database.
1632              
1633             =item * reparse
1634              
1635             Rather than a complete rebuild the option to selective reparse selected entries
1636             is useful if there are reports which were previously unable to correctly supply
1637             a particular field, which now has supporting parsing code within the codebase.
1638              
1639             In addition there is the option to exclude fields from parsing checks, where
1640             they may be corrupted, and can be later amended using the 'cpanstats-update'
1641             tool.
1642              
1643             =item * parse
1644              
1645             Unlike reparse, parse is used to parse just missing reports. As such if a
1646             report has already been stored and cached, it won't be processed again, unless
1647             the 'force' option is used.
1648              
1649             In addition, as per reparse, there is the option to exclude fields from parsing
1650             checks, where they may be corrupted, and can be later amended using the
1651             'cpanstats-update' tool.
1652              
1653             =item * tail
1654              
1655             Write to a file, the list of GUIDs returned from a tail request.
1656              
1657             =back
1658              
1659             =head2 Private Methods
1660              
1661             =over
1662              
1663             =item * commit
1664              
1665             To speed up the transaction process, a commit is performed every 500 inserts.
1666             This method is used as part of the clean up process to ensure all transactions
1667             are completed.
1668              
1669             =item * get_tail_guids
1670              
1671             Get the list of GUIDs as would be seen for a tail log.
1672              
1673             =item * get_next_dates
1674              
1675             Get the list of dates to use in the next cycle of report retrieval.
1676              
1677             =item * get_next_guids
1678              
1679             Get the list of GUIDs for the reports that have been submitted since the last
1680             cached report.
1681              
1682             =item * retrieve_reports
1683              
1684             Abstracted loop of requesting GUIDs, then parsing, storing and caching each
1685             report as appropriate.
1686              
1687             =item * already_saved
1688              
1689             Given a guid, determines whether it has already been saved in the local
1690             metabase cache.
1691              
1692             =item * load_fact
1693              
1694             Get a specific report fact for a given GUID, from the local database.
1695              
1696             =item * get_fact
1697              
1698             Get a specific report fact for a given GUID, from the Metabase.
1699              
1700             =item * dereference_report
1701              
1702             When you retrieve the parent report fact from the database, you'll need to
1703             dereference it to ensure the child elements contain the child facts in the
1704             correct format for processing.
1705              
1706             =item * parse_report
1707              
1708             Parses a report extracting the metadata required for the cpanstats database.
1709              
1710             =item * reparse_report
1711              
1712             Parses a report (from a local metabase cache) extracting the metadata required
1713             for the stats database.
1714              
1715             =item * retrieve_report
1716              
1717             Given a guid will attempt to return the report metadata from the cpanstats
1718             database.
1719              
1720             =item * store_report
1721              
1722             Inserts the components of a parsed report into the cpanstats database.
1723              
1724             =item * cache_report
1725              
1726             Inserts a serialised report into a local metabase cache database.
1727              
1728             =item * cache_update
1729              
1730             For the current report will update the local metabase cache with the id used
1731             within the cpanstats database.
1732              
1733             =back
1734              
1735             =head2 Very Private methods
1736              
1737             The following modules load information enmasse to avoid DB connection hogging
1738             and IO blocking. Thus improving performance.
1739              
1740             =over 4
1741              
1742             =item * load_uploads
1743              
1744             Loads the upload information.
1745              
1746             =item * load_authors
1747              
1748             Loads information regarding each author's distribution.
1749              
1750             =item * load_perl_versions
1751              
1752             Loads all the known Perl versions.
1753              
1754             =item * save_perl_versions
1755              
1756             Saves any new Perl versions
1757              
1758             =back
1759              
1760             =head1 HISTORY
1761              
1762             The CPAN Testers was conceived back in May 1998 by Graham Barr and Chris
1763             Nandor as a way to provide multi-platform testing for modules. Today there
1764             are over 40 million tester reports and more than 100 testers each month
1765             giving valuable feedback for users and authors alike.
1766              
1767             =head1 BECOME A TESTER
1768              
1769             Whether you have a common platform or a very unusual one, you can help by
1770             testing modules you install and submitting reports. There are plenty of
1771             module authors who could use test reports and helpful feedback on their
1772             modules and distributions.
1773              
1774             If you'd like to get involved, please take a look at the CPAN Testers Wiki,
1775             where you can learn how to install and configure one of the recommended
1776             smoke tools.
1777              
1778             For further help and advice, please subscribe to the the CPAN Testers
1779             discussion mailing list.
1780              
1781             CPAN Testers Wiki
1782             - http://wiki.cpantesters.org
1783             CPAN Testers Discuss mailing list
1784             - http://lists.cpan.org/showlist.cgi?name=cpan-testers-discuss
1785              
1786             =head1 BUCKETS
1787              
1788             beta6 - 2014-01-21
1789             beta7 - 2014-11-12
1790              
1791             =head1 BUGS, PATCHES & FIXES
1792              
1793             There are no known bugs at the time of this release. However, if you spot a
1794             bug or are experiencing difficulties, that is not explained within the POD
1795             documentation, please send bug reports and patches to the RT Queue (see below).
1796              
1797             Fixes are dependent upon their severity and my availability. Should a fix not
1798             be forthcoming, please feel free to (politely) remind me.
1799              
1800             RT Queue -
1801             http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
1802              
1803             =head1 SEE ALSO
1804              
1805             L<CPAN::Testers::Report>,
1806             L<Metabase>,
1807             L<Metabase::Fact>,
1808             L<CPAN::Testers::Fact::LegacyReport>,
1809             L<CPAN::Testers::Fact::TestSummary>,
1810             L<CPAN::Testers::Metabase::AWS>
1811              
1812             L<CPAN::Testers::WWW::Statistics>
1813              
1814             F<http://www.cpantesters.org/>,
1815             F<http://stats.cpantesters.org/>,
1816             F<http://wiki.cpantesters.org/>
1817              
1818             =head1 AUTHOR
1819              
1820             It should be noted that the original code for this distribution began life
1821             under another name. The original distribution generated data for the original
1822             CPAN Testers website. However, in 2008 the code was reworked to generate data
1823             in the format for the statistics data analysis, which in turn was reworked to
1824             drive the redesign of the all the CPAN Testers websites. To reflect the code
1825             changes, a new name was given to the distribution.
1826              
1827             =head2 CPAN-WWW-Testers-Generator
1828              
1829             Original author: Leon Brocard <acme@astray.com> (C) 2002-2008
1830             Current maintainer: Barbie <barbie@cpan.org> (C) 2008-2010
1831              
1832             =head2 CPAN-Testers-Data-Generator
1833              
1834             Original author: Barbie <barbie@cpan.org> (C) 2008-2015
1835              
1836             =head1 LICENSE
1837              
1838             This module is free software; you can redistribute it and/or
1839             modify it under the Artistic License 2.0.