File Coverage

blib/lib/Combine/DataBase.pm
Criterion Covered Total %
statement 15 132 11.3
branch 0 44 0.0
condition 0 21 0.0
subroutine 5 10 50.0
pod 0 5 0.0
total 20 212 9.4


line stmt bran cond sub pod time code
1             # Copyright (c) 2004, 2005 Anders Ardö
2             ## $Id: DataBase.pm 326 2011-05-27 07:44:58Z it-aar $
3              
4             #
5             # See the file LICENCE included in the distribution.
6              
7             package Combine::DataBase;
8              
9 1     1   460 use strict;
  1         2  
  1         36  
10 1     1   482 use Combine::MySQLhdb;
  1         3  
  1         111  
11 1     1   8 use Digest::MD5;
  1         2  
  1         44  
12 1     1   6 use Encode qw(encode_utf8);
  1         3  
  1         1323  
13              
14             sub new {
15 0     0 0   my ($class, $xwi, $sv, $loghandle) = @_;
16 0 0         $xwi = new Combine::XWI unless ref $xwi;
17 0           my $self = {};
18 0           $self->{'xwi'} = $xwi;
19 0           $self->{'databasehandle'} = $sv;
20 0           $self->{'loghandle'} = $loghandle;
21 0           bless $self, $class;
22 0           return $self;
23             }
24              
25             #uses table recordurl with columns: recordid, urlid, lastchecked PRIMARY KEY (urlid), key recordid
26             # recordid and urlid start at 1 !!
27              
28             sub delete {
29 0     0 0   my ($self) = @_;
30 0           my $xwi = $self->{'xwi'};
31 0 0         return undef unless ref $xwi;
32            
33 0           my $urlid = $xwi->urlid;
34 0           my ($recordid, $md5) = $self->{databasehandle}->selectrow_array(
35             qq{SELECT recordid,md5 FROM recordurl WHERE urlid=$urlid;}); #Only one
36 0 0         return if !defined($recordid);
37 0           $self->{'loghandle'}->say("DataBase::delete $urlid, $recordid, $md5;");
38             #LOCK recordurl - needed?
39 0           $self->{databasehandle}->prepare(qq{LOCK TABLES recordurl WRITE;})->execute();
40             #delete URL from recordurl
41 0           $self->{databasehandle}->prepare(qq{DELETE FROM recordurl WHERE urlid=?;})->execute($urlid);
42 0           my ($ant) = $self->{databasehandle}->selectrow_array(
43             qq{SELECT recordid FROM recordurl WHERE recordid=$recordid LIMIT 1;});
44             #UNLOCK recordurl - needed?
45 0           $self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute();
46 0 0 0       if ( !defined($ant) || ($ant == 0) ) { Combine::MySQLhdb::DeleteKey($recordid,$md5); } #Should handle del's of non-existing recs
  0            
47              
48 0           $xwi->nofollow("true"); #?? check??
49             }
50              
51             sub insert {
52 0     0 0   my ($self) = @_;
53 0           my $xwi = $self->{'xwi'};
54 0 0         return undef unless ref $xwi;
55              
56 0           my $urlid = $xwi->urlid;
57             ## my $md5 = $xwi->md5;
58              
59 0           my $md5D = new Digest::MD5;
60 0           $md5D->reset;
61 0 0         if ( length($xwi->text) > 0 ) {
62 0           my $text = ${$xwi->text} . $xwi->title;
  0            
63 0           $text =~ s/[\s\n\r]+//g;
64 0           $md5D->add(encode_utf8($text)); #use only visible text without whitespace
65             } else {
66 0           $md5D->add($xwi->url);
67 0           $md5D->add($xwi->type());
68             }
69 0           $_ = $md5D->hexdigest;
70 0           tr/a-z/A-Z/;
71 0           $xwi->md5($_);
72 0           my $md5 = $_;
73              
74 0           $self->{'loghandle'}->say("DataBase::insert $urlid, $md5;");
75              
76             #actions according to the following truth table, based on presence in recordurl
77             #urlid: there is a document in the database for this url
78             #recordid: there is a document in the database with the same MD5 as the new page
79             #
80             #            recordid                      | ! recordid
81             #   urlid    if same md5                   | delete(urlid_recordid);
82             #              update(lastcheck)           | update(urlid); insertRec
83             #            else delete(urlid_recordid);  |
84             #              add(urlid)                  |
85             # -----------------------------------------------------------------------------
86             # ! urlid    add(urlid)                    | add(urlid); insertRec
87              
88 0           my $existurlid = 0;
89 0           my $existrecordid = 0;
90 0           my $oldmd5='';
91             #LOCK recordurl
92 0           $self->{databasehandle}->prepare(qq{LOCK TABLES recordurl WRITE;})->execute();
93 0           ($existurlid,$oldmd5) = $self->{databasehandle}->selectrow_array(
94             qq{SELECT urlid,md5 FROM recordurl WHERE urlid=$urlid;});
95 0           ($existrecordid) = $self->{databasehandle}->selectrow_array(
96             qq{SELECT recordid FROM recordurl WHERE md5='$md5';});
97              
98 0 0         if (!defined($existrecordid)) { $existrecordid = 0; }
  0            
99 0 0         if (!defined($existurlid)) { $existurlid = 0; }
  0            
100 0 0         if (!defined($oldmd5)) { $oldmd5 = ''; }
  0            
101             #Log not locked $self->{'loghandle'}->say("DataBase:: $urlid, $md5; $existrecordid; $existurlid; $oldmd5;");
102              
103             #CASE 1: There are documents for both the URL and the MD5 and they have the same md5
104 0 0 0       if ( ($existrecordid && $existurlid) && ($md5 eq $oldmd5) ) {
    0 0        
    0 0        
    0 0        
    0 0        
      0        
105             # updateLastCheck
106 0           $self->{databasehandle}->prepare(
107             qq{UPDATE recordurl SET lastchecked=NOW() WHERE urlid=?;})->execute($urlid);
108 0           $self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
109 0           $self->{'loghandle'}->say("DataBase:: case 1: $existrecordid; $existurlid; $oldmd5;");
110              
111             #CASE 2
112             } elsif ( $existrecordid && $existurlid ) {
113             #eg } elsif ( ($existrecordid && $existurlid) && ($md5 ne $oldmd5) ) {
114             #There are documents for both the URL and the MD5 and they have different md5
115             # deleteOld
116 0           my $oldrecordid = 0;
117 0           ($oldrecordid) = $self->{databasehandle}->selectrow_array(
118             qq{SELECT recordid FROM recordurl WHERE urlid=$urlid;});
119             #delete URL from recordurl
120 0           $self->{databasehandle}->prepare(qq{DELETE FROM recordurl WHERE urlid=?;})->execute($urlid);
121 0           $self->{databasehandle}->prepare(
122             qq{INSERT INTO recordurl SET urlid=?, recordid=?, md5=?, lastchecked=NOW();})->execute($urlid, $existrecordid, $md5);
123 0           $self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
124 0           my ($ant) = $self->{databasehandle}->selectrow_array(
125             qq{SELECT recordid FROM recordurl WHERE recordid=$oldrecordid LIMIT 1;}); #Outside LOCK?
126 0 0         if ( ! defined($ant) ) { $ant = 0; }
  0            
127 0           $self->{'loghandle'}->say("DataBase::DelURL case 2: $oldrecordid; $ant;; $existrecordid; $existurlid; $oldmd5;");
128 0 0         if ( $ant == 0 ) { Combine::MySQLhdb::DeleteKey($oldrecordid, $oldmd5); }
  0            
129              
130             #CASE 3
131             } elsif ( $existrecordid && ! $existurlid ) {
132             # addUrlId
133 0           $self->{databasehandle}->prepare(
134             qq{INSERT INTO recordurl SET urlid=?, recordid=?, md5=?, lastchecked=NOW();})->execute($urlid, $existrecordid, $md5);
135 0           $self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
136 0           $self->{'loghandle'}->say("DataBase:: case 3: $existrecordid; $existurlid; $oldmd5;");
137              
138             #CASE 4
139             } elsif ( ! $existrecordid && $existurlid ) {
140             # deleteOld
141 0           my $oldrecordid = 0;
142 0           ($oldrecordid) = $self->{databasehandle}->selectrow_array(
143             qq{SELECT recordid FROM recordurl WHERE urlid=$urlid;});
144              
145             # delete($self, $urlid, $oldrecordid); #Problem with LOCK!!! -> new subroutine
146             #delete URL from recordurl
147 0           $self->{databasehandle}->prepare(qq{DELETE FROM recordurl WHERE urlid=?;})->execute($urlid);
148             #ASSIGN NEW RECORDID done with auto_increment in SQL
149 0           $self->{databasehandle}->prepare(
150             qq{INSERT INTO recordurl SET urlid=?, md5=?, lastchecked=NOW();})->execute($urlid,$md5);
151 0           $self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
152 0           my ($ant) = $self->{databasehandle}->selectrow_array(
153             qq{SELECT recordid FROM recordurl WHERE recordid=$oldrecordid LIMIT 1;});
154 0 0         if ( ! defined($ant) ) { $ant = 0; }
  0            
155 0           $self->{'loghandle'}->say("DataBase::DelURL $oldrecordid; $ant;");
156 0 0         if ( $ant == 0 ) { Combine::MySQLhdb::DeleteKey($oldrecordid, $oldmd5); }
  0            
157 0           my ($recordid) = $self->{databasehandle}->selectrow_array(
158             qq{SELECT recordid FROM recordurl WHERE urlid=$urlid;});
159 0           $xwi->recordid($recordid);
160 0           $self->{'loghandle'}->say("DataBase::Write $recordid case 4: $existrecordid; $existurlid; $oldmd5;");
161 0           Combine::MySQLhdb::Write($xwi);
162              
163             #CASE 5
164             } elsif ( ! $existrecordid && ! $existurlid ) {
165             #ASSIGN NEW RECORDID done with auto_increment in SQL
166 0           $self->{databasehandle}->prepare(
167             qq{INSERT INTO recordurl SET urlid=?, md5=?, lastchecked=NOW();})->execute($urlid,$md5);
168 0           $self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
169 0           my ($recordid) = $self->{databasehandle}->selectrow_array(
170             qq{SELECT recordid FROM recordurl WHERE urlid=$urlid;});
171 0           $xwi->recordid($recordid);
172 0           $self->{'loghandle'}->say("DataBase::Write $recordid case 5: $existrecordid; $existurlid; $oldmd5;");
173 0           Combine::MySQLhdb::Write($xwi);
174             }
175              
176             #Should not happen
177             else {
178 0           $self->{'loghandle'}->say("DataBase::ERR $existrecordid; $existurlid; $oldmd5;");
179 0           print "ERR DataBase impossible case\n";
180             }
181              
182 0           $xwi->nofollow("false"); # was set to true by delete...???
183             # my ($follow,$add,$replaced) = &COMB::Policy::url_accept($url,@urls);???
184             }
185              
186             sub newLinks {
187 0     0 0   my ($self) = @_;
188 0           my $xwi = $self->{'xwi'};
189 0 0         return undef unless ref $xwi;
190 0           my $recordid = $xwi->recordid; #SANITY CHECK?
191 0           $self->{databasehandle}->prepare(
192             qq{INSERT IGNORE INTO newlinks SELECT urlid,netlocid FROM links WHERE recordid=?;})->execute($recordid);
193             }
194              
195             sub newRedirect {
196 0     0 0   my ($self) = @_;
197 0           my $xwi = $self->{'xwi'};
198 0 0         return undef unless ref $xwi;
199 1     1   4 use Combine::selurl;
  1         3  
  1         219  
200 0           my ($u, $netlocid, $urlid, $urlstr);
201             # my $tl=$xwi->location; my $tb=$xwi->base; print "NL: $tl, $tb\n";
202 0 0         if ( $u = Combine::selurl->new_abs($xwi->location, $xwi->base) ) {
203 0           $urlstr = $u->normalise();
204             # print "NL: $urlstr\n";
205 0           my $lsth = $self->{databasehandle}->prepare(qq{SELECT netlocid,urlid FROM urls WHERE urlstr=?;});
206 0           $lsth->execute($urlstr);
207 0           ($netlocid,$urlid) = $lsth->fetchrow_array;
208 0 0         if ( !defined($urlid) ) {
209 0           my $netlocstr = $u->authority;
210 0           my $path_query = $u->path_query;
211             # print "NL: $netlocstr, $path_query\n";
212 0           $self->{databasehandle}->prepare(qq{INSERT IGNORE INTO netlocs SET netlocstr=?;})->execute($netlocstr);
213 0           ($netlocid) = $self->{databasehandle}->selectrow_array(qq{SELECT netlocid FROM netlocs WHERE netlocstr='$netlocstr';});
214 0           $self->{databasehandle}->prepare(qq{INSERT IGNORE INTO urls SET urlstr=?, netlocid=?, path=?;})->execute($urlstr,$netlocid,$path_query);
215 0           $lsth->execute($urlstr);
216 0           ($netlocid,$urlid) = $lsth->fetchrow_array;
217             }
218             # print "NL INS: $urlid,$netlocid\n";
219             #test if undefined
220 0           $self->{databasehandle}->prepare(
221             qq{INSERT IGNORE INTO newlinks SET urlid=?, netlocid=?;})->execute($urlid,$netlocid);
222             }
223             }
224              
225             1;