File Coverage

blib/lib/MogileFS/Worker/Delete.pm
Criterion Covered Total %
statement 18 190 9.4
branch 0 80 0.0
condition 0 18 0.0
subroutine 6 19 31.5
pod 0 8 0.0
total 24 315 7.6


line stmt bran cond sub pod time code
1             package MogileFS::Worker::Delete;
2             # deletes files
3              
4 21     21   137 use strict;
  21         166  
  21         1080  
5 21     21   134 use base 'MogileFS::Worker';
  21         53  
  21         3986  
6 21     21   138 use MogileFS::Util qw(error);
  21         52  
  21         1456  
7 21     21   159 use MogileFS::Server;
  21         49  
  21         543  
8              
9             # we select 1000 but only do a random 100 of them, to allow
10             # for stateless parallelism
11 21     21   209 use constant LIMIT => 1000;
  21         49  
  21         1380  
12 21     21   111 use constant PER_BATCH => 100;
  21         40  
  21         57236  
13              
14             sub new {
15 0     0 0   my ($class, $psock) = @_;
16 0           my $self = fields::new($class);
17 0           $self->SUPER::new($psock);
18              
19 0           return $self;
20             }
21              
22 0     0 0   sub watchdog_timeout { 120 }
23              
24             sub work {
25 0     0 0   my $self = shift;
26              
27 0           my $sleep_for = 0; # we sleep longer and longer until we hit max_sleep
28 0           my $sleep_max = 5; # max sleep when there's nothing to do.
29            
30 0           my $old_queue_check = 0; # next time to check the old queue.
31 0           my $old_queue_backoff = 0; # backoff index
32              
33 0           while (1) {
34 0           $self->send_to_parent("worker_bored 50 delete");
35 0           $self->read_from_parent(1);
36 0 0         next unless $self->validate_dbh;
37              
38             # call our workers, and have them do things
39             # RETVAL = 0; I think I am done working for now
40             # RETVAL = 1; I have more work to do
41 0           my $lock = 'mgfs:tempfiles';
42             # This isn't something we need to wait for: just need to ensure one is.
43 0           my $tempres;
44 0 0         if (Mgd::get_store()->get_lock($lock, 0)) {
45 0           $tempres = $self->process_tempfiles;
46 0           Mgd::get_store()->release_lock($lock);
47             }
48 0           my $delres;
49 0 0         if (time() > $old_queue_check) {
50 0           $self->reenqueue_delayed_deletes;
51 0           $delres = $self->process_deletes;
52             # if we did no work, crawl the backoff.
53 0 0         if ($delres) {
54 0           $old_queue_backoff = 0;
55 0           $old_queue_check = 0;
56             } else {
57 0 0         $old_queue_check = time() + $old_queue_backoff
58             if $old_queue_backoff > 360;
59 0 0         $old_queue_backoff++ unless $old_queue_backoff > 1800;
60             }
61             }
62              
63 0           my $delres2 = $self->process_deletes2;
64              
65             # unless someone did some work, let's sleep
66 0 0 0       unless ($tempres || $delres || $delres2) {
67 0 0         $sleep_for++ if $sleep_for < $sleep_max;
68 0           sleep $sleep_for;
69             } else {
70 0           $sleep_for = 0;
71             }
72             }
73              
74             }
75              
76             # deletes a given DevFID from the storage device
77             # returns true on success, false on failure
78             sub delete_devfid {
79 0     0 0   my ($self, $dfid) = @_;
80              
81             # send delete request
82 0 0         error("Sending delete for " . $dfid->url) if $Mgd::DEBUG >= 2;
83              
84 0           my $res;
85 0     0     $dfid->device->host->http("DELETE", $dfid->uri_path, undef, sub { ($res) = @_ });
  0            
86 0     0     Danga::Socket->SetPostLoopCallback(sub { !defined $res });
  0            
87 0           Danga::Socket->EventLoop;
88              
89 0           my $httpcode = $res->code;
90              
91             # effectively means all went well
92 0 0 0       return 1 if (($httpcode >= 200 && $httpcode <= 299) || $httpcode == 404);
      0        
93              
94 0           my $status = $res->status_line;
95 0           error("Error: unlink failure: " . $dfid->url . ": HTTP code $status");
96 0           return 0;
97             }
98              
99             sub process_tempfiles {
100 0     0 0   my $self = shift;
101             # also clean the tempfile table
102             #mysql> select * from tempfile where createtime < unix_timestamp() - 86400 limit 50;
103             #+--------+------------+---------+------+---------+--------+
104             #| fid | createtime | classid | dmid | dkey | devids |
105             #+--------+------------+---------+------+---------+--------+
106             #| 3253 | 1149451058 | 1 | 1 | file574 | 1,2 |
107             #| 4559 | 1149451156 | 1 | 1 | file83 | 1,2 |
108             #| 11024 | 1149451697 | 1 | 1 | file836 | 2,1 |
109             #| 19885 | 1149454542 | 1 | 1 | file531 | 1,2 |
110              
111             # BUT NOTE:
112             # the fids might exist on one of the devices in devids column if we assigned them those,
113             # they wrote some to one of them, then they died or for whatever reason didn't create_close
114             # to use, so we shouldn't delete from tempfile before going on a hunt of the missing fid.
115             # perhaps we should just add to the file_on table for both devids, and let the regular delete
116             # process discover via 404 that they're not there.
117             # so we should:
118             # select fid, devids from tempfile where createtime < unix_timestamp() - 86400
119             # add file_on rows for both of those,
120             # add fid to fids_to_delete table,
121             # delete from tempfile where fid=?
122              
123              
124             # dig up some temporary files to purge
125 0           my $sto = Mgd::get_store();
126 0   0       my $too_old = int($ENV{T_TEMPFILE_TOO_OLD} || 3600);
127 0           my $tempfiles = $sto->old_tempfiles($too_old);
128 0 0 0       return 0 unless $tempfiles && @$tempfiles;
129              
130             # insert the right rows into file_on and file_to_delete and remove the
131             # now expunged (or soon to be) rows from tempfile
132 0           my (@devfids, @fidids);
133 0           foreach my $row (@$tempfiles) {
134              
135             # If FID is still loadable, we've arrived here due to a bug or race
136             # condition elsewhere. Remove the tempfile row but don't delete the
137             # file!
138 0           my $fidid = $row->[0];
139 0           my $fid = MogileFS::FID->new($fidid);
140 0 0         if ($fid->exists) {
141 0           $sto->delete_tempfile_row($fidid);
142 0           next;
143             }
144 0           push @fidids, $fidid;
145              
146             # sanity check the string column.
147 0           my $devids = $row->[1];
148 0 0         unless ($devids =~ /^(\d+)(,\d+)*$/) {
149 0           $devids = "";
150             }
151              
152 0           foreach my $devid (split /,/, $devids) {
153 0           push @devfids, MogileFS::DevFID->new($devid, $row->[0]);
154             }
155             }
156              
157             # We might've done no work due to discovering the tempfiles are real.
158 0 0         return 0 unless @fidids;
159              
160 0           $sto->mass_insert_file_on(@devfids);
161 0           $sto->enqueue_fids_to_delete2(@fidids);
162 0           $sto->dbh->do("DELETE FROM tempfile WHERE fid IN (" . join(',', @fidids) . ")");
163 0           return 1;
164             }
165              
166             # new style delete queueing. I'm not putting a lot of effort into commonizing
167             # code between the old one and the new one. Feel free to send a patch!
168             sub process_deletes2 {
169 0     0 0   my $self = shift;
170              
171 0           my $sto = Mgd::get_store();
172              
173 0           my $queue_todo = $self->queue_todo('delete');
174 0 0         unless (@$queue_todo) {
175             # No work.
176 0           return 0;
177             }
178              
179 0           while (my $todo = shift @$queue_todo) {
180 0           $self->still_alive;
181              
182             # load all the devids related to this fid, and delete.
183 0           my $fid = MogileFS::FID->new($todo->{fid});
184 0           my $fidid = $fid->id;
185              
186             # if it's currently being replicated, wait for replication to finish
187             # before deleting to avoid stale files
188 0 0         if (! $sto->should_begin_replicating_fidid($fidid)) {
189 0           $sto->reschedule_file_to_delete2_relative($fidid, 1);
190 0           next;
191             }
192              
193 0           $sto->delete_fidid_enqueued($fidid);
194              
195 0           my @devids = $fid->devids;
196 0           my %devids = map { $_ => 1 } @devids;
  0            
197              
198              
199 0           for my $devid (@devids) {
200 0 0         my $dev = $devid ? Mgd::device_factory()->get_by_id($devid) : undef;
201 0 0 0       error("deleting fid $fidid, on devid ".($devid || 'NULL')."...") if $Mgd::DEBUG >= 2;
202 0 0         unless ($dev) {
203 0           next;
204             }
205 0 0         if ($dev->dstate->is_perm_dead) {
206 0           $sto->remove_fidid_from_devid($fidid, $devid);
207 0           delete $devids{$devid};
208 0           next;
209             }
210             # devid is observed down/readonly: delay for at least
211             # 10 minutes.
212 0 0         unless ($dev->observed_writeable) {
213 0           $sto->reschedule_file_to_delete2_relative($fidid,
214             60 * (10 + $todo->{failcount}));
215 0           next;
216             }
217             # devid is marked readonly/down/etc: delay for
218             # at least 1 hour.
219 0 0         unless ($dev->can_delete_from) {
220 0           $sto->reschedule_file_to_delete2_relative($fidid,
221             60 * 60 * (1 + $todo->{failcount}));
222 0           next;
223             }
224              
225 0           my $dfid = MogileFS::DevFID->new($dev, $fidid);
226 0           my $path = $dfid->url;
227              
228             # dormando: "There are cases where url can return undefined,
229             # Mogile appears to try to replicate to bogus devices
230             # sometimes?"
231 0 0         unless ($path) {
232 0           error("in deleter, url(devid=$devid, fid=$fidid) returned nothing");
233 0           next;
234             }
235              
236 0 0         if ($self->delete_devfid($dfid)) {
237             # effectively means all went well
238 0           $sto->remove_fidid_from_devid($fidid, $devid);
239 0           delete $devids{$devid};
240             } else {
241             # remote file system error? connect failure? retry in 30min
242 0           $sto->reschedule_file_to_delete2_relative($fidid,
243             60 * 30 * (1 + $todo->{failcount}));
244 0           next;
245             }
246             }
247              
248             # fid has no pants.
249 0 0         unless (keys %devids) {
250 0           $sto->delete_fid_from_file_to_delete2($fidid);
251             }
252 0           $sto->note_done_replicating($fidid);
253             }
254              
255             # did work.
256 0           return 1;
257             }
258              
259             sub process_deletes {
260 0     0 0   my $self = shift;
261              
262 0           my $sto = Mgd::get_store();
263 0           my $dbh = $sto->dbh;
264              
265 0           my $delmap = $dbh->selectall_arrayref("SELECT fd.fid, fo.devid ".
266             "FROM file_to_delete fd ".
267             "LEFT JOIN file_on fo ON fd.fid=fo.fid ".
268             "LIMIT " . LIMIT);
269 0 0         my $count = $delmap ? scalar @$delmap : 0;
270 0 0         return 0 unless $count;
271              
272 0           my $done = 0;
273 0           foreach my $dm (List::Util::shuffle(@$delmap)) {
274 0 0         last if ++$done > PER_BATCH;
275              
276 0           $self->still_alive;
277 0           my ($fid, $devid) = @$dm;
278 0 0 0       error("deleting fid $fid, on devid ".($devid || 'NULL')."...") if $Mgd::DEBUG >= 2;
279              
280             my $done_with_fid = sub {
281 0     0     my $reason = shift;
282 0           $dbh->do("DELETE FROM file_to_delete WHERE fid=?", undef, $fid);
283 0           $sto->condthrow("Failure to delete from file_to_delete for fid=$fid");
284 0           };
285              
286             my $done_with_devid = sub {
287 0     0     my $reason = shift;
288 0           $dbh->do("DELETE FROM file_on WHERE fid=? AND devid=?",
289             undef, $fid, $devid);
290 0           $sto->condthrow("Failure to delete from file_on for $fid/$devid");
291 0 0         die "Failed to delete from file_on: " . $dbh->errstr if $dbh->err;
292 0           };
293              
294             my $reschedule_fid = sub {
295 0     0     my ($secs, $reason) = (int(shift), shift);
296 0           $sto->insert_ignore("INTO file_to_delete_later (fid, delafter) ".
297             "VALUES (?,".$sto->unix_timestamp."+$secs)", undef,
298             $fid);
299 0 0         error("delete of fid $fid rescheduled: $reason") if $Mgd::DEBUG >= 2;
300 0           $done_with_fid->("rescheduled");
301 0           };
302              
303             # Cases:
304             # devid is null: doesn't exist anywhere anymore, we're done with this fid.
305             # devid is observed down/readonly: delay for 10 minutes
306             # devid is marked readonly: delay for 2 hours
307             # devid is marked dead or doesn't exist: consider it deleted on this devid.
308              
309             # CASE: devid is null, which means we're done deleting all instances.
310 0 0         unless (defined $devid) {
311 0           $done_with_fid->("no_more_locations");
312 0           next;
313             }
314              
315             # CASE: devid is marked dead or doesn't exist: consider it deleted on this devid.
316             # (Note: we're tolerant of '0' as a devid, due to old buggy version which
317             # would sometimes put that in there)
318 0 0         my $dev = $devid ? Mgd::device_factory()->get_by_id($devid) : undef;
319 0 0         unless ($dev) {
320 0           $done_with_devid->("devid_doesnt_exist");
321 0           next;
322             }
323 0 0         if ($dev->dstate->is_perm_dead) {
324 0           $done_with_devid->("devid_marked_dead");
325 0           next;
326             }
327              
328             # CASE: devid is observed down/readonly: delay for 10 minutes
329 0 0         unless ($dev->observed_writeable) {
330 0           $reschedule_fid->(60 * 10, "not_observed_writeable");
331 0           next;
332             }
333              
334             # CASE: devid is marked readonly/down/etc: delay for 2 hours
335 0 0         unless ($dev->can_delete_from) {
336 0           $reschedule_fid->(60 * 60 * 2, "devid_marked_not_alive");
337 0           next;
338             }
339              
340 0           my $dfid = MogileFS::DevFID->new($dev, $fid);
341 0           my $path = $dfid->url;
342              
343             # dormando: "There are cases where url can return undefined,
344             # Mogile appears to try to replicate to bogus devices
345             # sometimes?"
346 0 0         unless ($path) {
347 0           error("in deleter, url(devid=$devid, fid=$fid) returned nothing");
348 0           next;
349             }
350              
351 0 0         if ($self->delete_devfid($dfid)) {
352 0           $done_with_devid->("deleted");
353             } else {
354             # remote file system error? connect failure? retry in 30min
355 0           $reschedule_fid->(60 * 30, "http_failure");
356             }
357             }
358              
359             # as far as we know, we have more work to do
360 0           return 1;
361             }
362              
363             sub reenqueue_delayed_deletes {
364 0     0 0   my $self = shift;
365              
366 0           my $sto = Mgd::get_store();
367 0           my $dbh = $sto->dbh;
368              
369 0 0         my @fidids = $sto->fids_to_delete_again
370             or return;
371              
372 0           $sto->enqueue_fids_to_delete(@fidids);
373              
374 0           $dbh->do("DELETE FROM file_to_delete_later WHERE fid IN (" .
375             join(",", @fidids) . ")");
376 0           $sto->condthrow("reenqueue file_to_delete_later delete");
377             }
378              
379             1;
380              
381             # Local Variables:
382             # mode: perl
383             # c-basic-indent: 4
384             # indent-tabs-mode: nil
385             # End: