File Coverage

blib/lib/WWW/Link/Repair.pm
Criterion    Covered  Total      %
statement         46     64   71.8
branch            15     34   44.1
condition          7     15   46.6
subroutine         6      9   66.6
pod                2      3   66.6
total             76    125   60.8


line stmt bran cond sub pod time code
1             package WWW::Link::Repair;
2             $REVISION=q$Revision: 1.11 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ );
3              
4             =head1 NAME
5              
6             WWW::Link::Repair - repair links in files.
7              
8             =head1 SYNOPSIS
9              
10             use WWW::Link::Repair::Substitutor;
11             use WWW::Link::Repair;
12              
13             $linksubs = WWW::Link::Repair::Substitutor::gen_substitutor(
14             "http://bounce.com/" ,
15             "http://bing.bong/",
16             0, 0,
17             );
18             $filehand = WWW::Link::Repair::Substitutor::gen_simple_file_handler ($linksubs);
19              
20             &$filehand("fix-this-file");
21              
22             use CDB_File::BiIndex;
23             $::index = new CDB_File::BiIndex "page-index-file", "link-index-file";
24              
               # illustrative mapping from URLs on this server to editable files
               $url_to_file = sub { my $u = shift; $u =~ s,^http://bounce\.com/,/var/www/, ? $u : undef };
25             WWW::Link::Repair::infostructure("http://bounce.com/", $::index, $url_to_file, $filehand, 1);
26              
27              
28             =head1 DESCRIPTION
29              
30             This module provides functions for repairing links in files, either by
               recursing through a directory tree and fixing every file found, or by
               using a link index to locate just the files which refer to a given URL.
31              
32             =cut
33              
34             our ($fakeit);
35             our ($verbose);
36             our ($no_warn);
37              
38             $fakeit = 0 unless defined $fakeit;
39             $verbose = 0 unless defined $verbose;
40             $no_warn = 0 unless defined $no_warn;
41              
42 2     2   3184 use File::Copy;
  2         11127  
  2         166  
43 2     2   17 use Carp;
  2         4  
  2         174  
                use File::Find;   # provides File::Find::find, used by directory() below
44 2     2   10 use strict;
  2         5  
  2         317  
45              
46             =head2 directory(file handler, directory ... )
47              
48             This function recurses through each given directory argument.  For
49             each file it finds, it calls the file handler function and adds the
50             handler's return value to a running total, which it returns.
51              
52             The B<file handler> should be a function which can be called on a
53             filename and will update the links in that file, returning the number
54             of changes made.
61             =cut
62              
63             sub directory {
64 0     0 1 0 my $handler=shift;
65 0         0 our ($fixed);
66 0         0 local $fixed;
67 0         0 $fixed=0;
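                # skip directories; apply the handler to every plain file that
                # File::Find visits, accumulating the number of fixes it reports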
68 0 0   0   0 my $sub = sub {-d && return 0; $fixed += &$handler($File::Find::name)};
  0         0  
  0         0  
69 0         0 File::Find::find($sub, @_);
70 0         0 return $fixed;
71             }
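                For example, a sketch only: the directory paths are illustrative and
                the substitutor arguments are copied from the SYNOPSIS.

                    use WWW::Link::Repair;
                    use WWW::Link::Repair::Substitutor;

                    my $substitutor = WWW::Link::Repair::Substitutor::gen_substitutor(
                        "http://bounce.com/", "http://bing.bong/", 0, 0,
                    );
                    my $handler =
                        WWW::Link::Repair::Substitutor::gen_simple_file_handler($substitutor);

                    # walk two example document trees, fixing links in every file found
                    my $fixed = WWW::Link::Repair::directory(
                        $handler, "/var/www/docs", "/var/www/mirror");
                    print "made $fixed fixes\n";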
72              
73             =head2 infostructure(oldurl, index object, url to file mapper, file handler, recursive)
74              
75             This function will use a previously built index to update all of the
76             files referenced from that index that need updating.
77              
78             B<oldurl> is the base URL which should be iterated from.  It must
79             exist within the B<index>, which will be treated as a BiIndex.
80              
81             The B<url to file mapper> should be a function which converts a URL
82             into the name of the file holding that resource, or returns undef if
83             there is no such file.
84              
85             The B<file handler> should be a function which can be called on a
86             filename and will update that file with the new URL.
87              
88             B<recursive> controls whether to attempt to replace all links
89             below B<oldurl>.  In this case the index is iterated beyond the first
90             link, over all links which begin with the first link.
90             =cut
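                A minimal end-to-end sketch.  It assumes a $handler built as in the
                previous example, index file names as in the SYNOPSIS, and a purely
                illustrative URL-to-file mapping:

                    use CDB_File::BiIndex;

                    my $index = new CDB_File::BiIndex "page-index-file", "link-index-file";

                    # illustrative: URLs on our server map onto files under /var/www
                    my $url_to_file = sub {
                        my $url = shift;
                        $url =~ s,^http://bounce\.com/,/var/www/, ? $url : undef;
                    };

                    # last argument true: also fix links below http://bounce.com/
                    my $fixed = WWW::Link::Repair::infostructure(
                        "http://bounce.com/", $index, $url_to_file, $handler, 1);
                    print "made $fixed fixes\n";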
91              
92 2     2   9 use vars qw($infostrucbase $filebase);
  2         4  
  2         1796  
93              
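                # _link_fix($url_to_file, $file_handler, \@editlist)
                # Internal helper: map each URL in @editlist to a filename with
                # &$url_to_file, skip it (with a message) if there is no such file,
                # otherwise run &$file_handler on that file.  Returns the number of
                # fixes made.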
94             sub _link_fix($$$) {
95 4     4   9 my $url_to_file=shift;
96 4         10 my $file_handler=shift;
97 4         6 my $editlist=shift;
98 4         9 my $fixed=0;
99 4         26 PAGE: foreach my $member (@$editlist) {
100 4 50       19 print STDERR "going to convert $member to file\n"
101             if $WWW::Link::Repair::verbose & 32;
102 4         16 my $file = &$url_to_file($member);
103 4 50       113 defined $file or do {
104 0         0 print STDERR "No filename for $member. Skipping\n";
105 0         0 next PAGE;
106             };
107 4 50       14 print STDERR "file is $file\n" if $WWW::Link::Repair::verbose & 32;
108 4         22 $fixed+=&$file_handler($file);
109             }
110 4         12 return $fixed;
111             }
112              
113              
114             sub infostructure ($$$$;$) {
115 3     3 1 60192 my ($oldurl, $index, $url_to_file, $file_handler, $recursive, $junk)=@_;
116              
117 3 50       49 defined $file_handler or
118             croak "missing argument to infostructure(\$\$\$\$;\$)";
119 3 50       116 $oldurl =~ m/^[a-z][a-z0-9-]*:/ or
120             croak "first argument to infostructure() must be a url not $oldurl";
121 3 50 33     141 ref $index and $index->can("lookup_second") or
122             croak "second argument to infostructure() must be a biindex not $index";
123 3 50       43 (ref $url_to_file) =~ m/CODE/ or
124             croak "third argument to infostructure() must be a CODE ref not $url_to_file";
125 3 50       24 (ref $file_handler) =~ m/CODE/ or
126             croak "fourth argument to infostructure() must be a CODE ref not $file_handler";
127 3 50       22 defined $junk and croak "extra argument to infostructure(\$\$\$\$;\$)";
128              
129 3         40 my $key=$index->second_set_iterate($oldurl);
130 3         143 my $fixed=0;
131              
132 3 50       14 if (defined $key) {
133 3 100       15 if ($recursive) { #we should substitute all links below this
134 2   100     136 while (defined $key and $key =~ m/^$oldurl/) {
135 3         31 my $editlist=$index->lookup_second($key);
136 3         82 $fixed += _link_fix($url_to_file,$file_handler,$editlist);
137 3         19 $key = $index->second_next();
138             }
139             } else { #just warn if there are any links below this.
140 1 50 33     60 if ( $key =~ m/^$oldurl/ and not $key eq $oldurl ) {
141 0         0 warn "There were no files with exactly that link to edit.\n";
142             # nothing matches the exact URL; fall through to the warnings below
144             } else {
145 1         8 my $editlist=$index->lookup_second($oldurl);
146 1         38 $fixed += _link_fix($url_to_file,$file_handler,$editlist);
147 1         21 $key=$index->second_next();
148             }
149 1 50 33     52 warn "Ignoring URLs starting with your URL such as $key.\n"
150             if defined $key and $key =~ m/^$oldurl/;
151             }
152             } else {
153 0         0 warn "There were no files with exactly that link to edit.\n";
154 0 0       0 print STDERR "key was beyond all keys in index\n"
155             if $WWW::Link::Repair::verbose & 16;
156             }
157              
158 3 50 33     36 $fixed or carp "didn't make any substitutions for $oldurl" unless $no_warn;
159             #FIXME repair the infostructure index..
160 3         10 return $fixed;
161             }
162              
163             # =head2 map_url_to_editable
164              
165             # Given any url, get us something we can edit in order to change the
166             # resource referenced by that url. Or not, if we can't. In the case
167             # that we can't, return undef.
168              
169             # The aim of this function is to return something which is not tainted.
170              
171             # N.B. This will accept any filename whatsoever which is within the
172             # infostructure; it is possible that that includes more than you wish
173             # to let people edit.
174              
175             # For this function to work the two variables:
176              
177             # $WWW::Link::Repair::filebase
178             # $WWW::Link::Repair::infostrucbase
179              
180             # must be defined appropriately
181              
182             # =cut
183              
184             # # sub{}
185              
186             # # @conversions = [
187             # # { regexp => 'http:://stuff..../'
188             # # changeurlfunc => sub {
189              
190             # # }
191              
192             # # ]
193              
194             # sub map_url_to_editable ($) {
195             # my $save=$_;
196             # $_=shift;
197             # print STDERR "trying to map $_ to editable object\n"
198             # if $WWW::Link::Repair::verbose & 64;
199              
200             # unless (m/^$infostrucbase/) {
201             # my $print=$_;
202             # $_=$save;
203             # croak "can't deal with url '$print' not in our infostructure"; #taint??
204             # }
205             # die 'config variable $WWW::Link::Repair::infostrucbase must be defined'
206             # unless defined $infostrucbase;
207             # s/^$infostrucbase//;
208              
209             # # Now we clean up the filename. For This we assume unix semantics.
210             # # These have been around for long enough that any sensible operating
211             # # system could have simply copied them.
212              
213             # s,/./,,g;
214              
215             # #now chop away down references..
216              
217             # # substitute a downchange (dirname/) followed by an upchange ( /../ )
218             # # for nothing.
219             # 1 while s,([^.]|(.[^.])|(..?))/+..($|/),,g ;
220              
221             # # clean up multiple slashes
222              
223             # s,//,/,g;
224              
225             # # delete leading slash
226              
227             # s,^/,,g;
228              
229              
230             # if (m,(^|/)..($|/),) {
231             # $_=$save;
232             # croak "upreferences (/../) put that outside our infostructure";
233             # }
234              
235             # #what are the properties of the filename we can return..
236             # #any string which doesn't contain /.. (and refuse /.
237              
238             # #now we untaint and do a check..
239              
240             # $_ =~ m,( (?: # directory name; xxx/ or filename; xxx
241             # (?: # some filename ....
242             # (?:[^./][^/]+) #a filename with no dot
243             # |(?:.[^./][^/]+) #a filename starting with .
244             # |(?:..[^./][^/]+) #a filename starting with .. why bother?
245             # )
246             # (?:/|$) # separator to next directory name or end of filename
247             # ) +
248             # ),x; #we set $1 to the whole qualified filename.
249              
250             # my $fixable = $1;
251             # $_=$save;
252             # return undef unless defined $fixable;
253             # die 'config variable $WWW::Link::Repair::filebase must be defined'
254             # unless defined $filebase;
255             # #FIXME: filebase can contain a / so this can end up with //. do we care?
256             # return $filebase . '/' . $fixable; #filebase should be an internal variable
257             # }
258              
259              
260             =head2 check_url(arrayref of urls)
261              
262             Check that every URL in the given array reference is fully qualified (of the form scheme://host/...); die if any is not.
263              
264             =cut
265              
266              
267             sub check_url ($) {
268 0     0 0   my $fixable=shift;
269 0           FIXABLE: foreach (@$fixable) {
270 0 0         m,^[A-Za-z][A-Za-z0-9+.-]*://[^/]+/, or die "unqualified URL in database $_"; # require scheme://host/...
271             }
272             }
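                A minimal usage sketch (the URL list is illustrative):

                    # dies unless every URL in the list looks like scheme://host/...
                    WWW::Link::Repair::check_url( [ "http://bing.bong/fixed.html" ] );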
273              
274             1; #why are we all afraid of require? Why do we give in??