File Coverage

blib/lib/WWW/Link/Repair/Substitutor.pm
Criterion Covered Total %
statement 96 111 86.4
branch 40 76 52.6
condition 14 17 82.3
subroutine 8 8 100.0
pod 2 2 100.0
total 160 214 74.7


line stmt bran cond sub pod time code
1             package WWW::Link::Repair::Substitutor;
2             $REVISION=q$Revision: 1.12 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ );
3              
4             =head1 NAME
5              
6             WWW::Link::Repair::Substitutor - repair links by text substitution
7              
8             =head1 SYNOPSIS
9              
10             use WWW::Link::Repair::Substitutor;
11             $dirsubs = WWW::Link::Repair::Substitutor::gen_substitutor
12             ( "http://bounce.bounce.com/frodo/dogo" ,
13             "http://thing.thong/ding/dong",
14             1, 0, ); #directory substitution don't replace subsidiary links
15             &$dirsubs ($line_from_file)
16              
17             =head1 DESCRIPTION
18              
19             A module for substituting one link in a file for another.
20              
21             This link repairer works by going through a file line by line and
22             doing a substitute on each line. It will substitute absolute links
23             all of the time, including within the text of the HTML page. This is
24             useful because it means that things like instructions to people about
25             what to do with URLs will be corrected.
26              
27             =head1 SUBSTITUTORS
28              
29             A substituter is a function which substitutes one url for another in a
30             string. Typically it would be fed a file a line at a time and would
31             substitute it directly. It works on it's argument directly.
32              
33             The two urls should be provided in absolute form.
34              
35             =head2 FILE HANDLERS
36              
37             A file handler goes through files calling a substitutor as needed.
38              
39             =head2 gen_directory_substitutor
40              
41             B: I think the logic around here is more than a little dubious
42              
43             =cut
44              
45 3     3   2247 use Carp;
  3         6  
  3         222  
46 3     3   841 use File::Copy;
  3         5257  
  3         150  
47 3     3   57 use strict;
  3         7  
  3         120  
48 3     3   2913 use URI;
  3         16809  
  3         2304  
49              
50             our ($verbose);
51             $verbose=0 unless defined $verbose;
52              
53             =head2 gen_substitutor
54              
55             This function was previously an exported interface and currently
56             remains visible. I think it's interface is likely to change though.
57             Preferably use generate_file_substitutor as an entry point instead.
58              
59             This function generates a function which can be called either on a
60             complete line of text from a file or on a URL and which will update
61             the URL based on the URLs it has been given
62              
63             If the third argument is true then the function will return a
64             substitutor which works on all of the links below a given url and
65             substitutes them all together. Thus if we change
66              
67             http://fred.jim/eating/
68              
69             to
70              
71             http://roger.jemima/food/eating-out/
72              
73             we also change
74              
75              
76             http://fred.jim/eating/hotels.html
77              
78             to
79              
80             http://roger.jemima/food/eating-out/hotels.html
81              
82             This function should handle fragments correctly. This means that we
83             should allow fragments to be substituted to and from normal links, but
84             also when we fix a url to a url all of the internal fragments should
85             follow. Fragments are not relative links. Cases
86              
87             =over 4
88              
89             =item 1
90              
91             substitution of fragment for fragment
92              
93             =item 2
94              
95             substitution of link for link
96              
97             =item 3
98              
99             substitution of link to fragment
100              
101             =item 4
102              
103             substitution of fragment to link
104              
105             =item 5
106              
107             substitution of url base for url base with all relative links
108              
109             =back
110              
111             Note that right now it isn't possible to substitute a tree under a
112             fragment. There is no such thing as a sub-fragment defined in the
113             standards.
114              
115             If we stubstitute a link to a fragment then we should not substitute
116             fragments under that link. that would loose information. Rather we
117             should issue a warning. Maybe there should be an option that lets this
118             happen.
119              
120             =cut
121              
122             #substitution of link for link - including tree mode (never has fragments)
123             #match url without fragment - fragment remains
124              
125             #substitution of fragment for fragment
126             #match whole link
127              
128             #substitution of link to link with fragment
129             #substitute whole link
130             #SHOULD MATCH BUT WARN IF WE FIND AN EXISTING FRAGMENT
131             #will fail to match and leave
132              
133             #substitution of link with fragment to link
134             #match the whole link
135              
136              
137             #substitution of url base for url base with all relative links
138              
139              
140              
141             sub gen_substitutor ($$;$$) {
142 11     11 1 91 my ($orig_url,$new_url,$tree_mode,$baseuri) = @_;
143 11         21 my ($new_has_fragment) = my ($orig_has_fragment ) =0;
144 11 100       43 $orig_url =~ m/#/ and $orig_has_fragment =1;
145 11 100       36 $new_url =~ m/#/ and $new_has_fragment =1;
146              
147 11 100 100     106 ( $orig_has_fragment or $new_has_fragment ) and $tree_mode
      100        
148             and die "can't do tree mode substitution with fragments";
149              
150 10 0       28 print STDERR
    50          
151             "Generating substitutor from $orig_url to $new_url \n",
152             (defined $baseuri ? "using base $baseuri\n" : "" ) if ($verbose & 32);
153              
154 10 50 66     46 defined $baseuri and ( not $baseuri =~ m/^[a-z][a-z0-9]*:/ )
155             and croak "baseuri must be absolute URI, not $baseuri";
156              
157 10         21 my $orig_rel;
158             my $new_rel;
159 10 100       26 defined $baseuri and do {
160 1         7 my $orig_uri=URI->new($orig_url);
161 1         13179 my $new_uri=URI->new($new_url);
162 1         57 $orig_rel=$orig_uri->rel($baseuri);
163 1         558 $new_rel=$new_uri->rel($baseuri);
164             };
165              
166 10         244 my $perlcode = <<'EOF';
167             sub {
168             my $substitutions=0;
169             EOF
170              
171 10 50       30 $perlcode .= <<'EOF' if ($verbose & 16);
172             print STDERR "Subs in : $_[0]\n";
173             EOF
174              
175 10 100       26 $perlcode .= <<'EOF' if ($baseuri);
176             EOF
177              
178 10         16 my $restart = <<'EOF';
179             $substitutions += $_[0] =~ s,( (?:^) #the start of a line
180             |(?:[^A-Za-z0-9]) #or a boundary character..
181             )
182             EOF
183              
184             # $remiddle terminates the url to be replaced... three possibilities
185             #
186             # 1) we are replacing a tree of URLs where the base URL is terminated with a /
187             # => what happens after doesn't matter.. '
188             # 2) we are replacing a tree of URLs where the base URL is unterminated..
189             # => end of the string must be end of the URL or '/' must follow
190             # 3) we only replace the exact URL
191             # => end of the string must be end of the URL
192              
193 10         25 my $remiddle = '';
194              
195 10 100 100     81 unless ($orig_url=~ m,/$, and $tree_mode) {
196 8         13 $remiddle .= <<'EOF';
197              
198             (?=(
199             EOF
200              
201             #end_of_uri - ends at a fragment unless the first url is a fragment
202              
203 8         13 my $end_of_uri;
204             CASE: {
205 8 100       10 $tree_mode && do {
  8         18  
206 2         4 $end_of_uri = <<'EOF' ;
207             ([#"'/>]) #" either end or end of section
208             EOF
209 2         4 last CASE;
210             };
211 6 100 66     25 ( not $orig_has_fragment and not $new_has_fragment ) && do {
212 4         7 $end_of_uri = <<'EOF' ;
213             ([#"'>]) #" this checks for the end of the url..
214             EOF
215 4         9 last CASE;
216             };
217 2         2 do {
218 2         3 $end_of_uri = <<'EOF' ;
219             (["'>]) #" this checks for the end of the url..
220             EOF
221 2         2 last CASE;
222             };
223 0         0 die "not reached";
224             }
225              
226 8         12 $remiddle .= $end_of_uri;
227              
228 8 100       25 $remiddle .= <<'EOF' unless $orig_url=~ m,/$,;
229             |(\s)
230             |($)
231             EOF
232 8         25 $remiddle .= <<'EOF';
233             )
234             )
235             EOF
236              
237             }
238              
239              
240 10         18 $remiddle .= ' ,$1' ;
241              
242 10         21 my $reend = ",gxo;\n";
243 10         24 my $relreend = ",gx;\n";
244              
245             #FIXME: url quoting into regex??
246              
247 10         29 $perlcode .= $restart . $orig_url . $remiddle . $new_url . $reend;
248 10 100       28 if ($baseuri) {
249 1         7 $perlcode .= $restart . $orig_rel . $remiddle . $new_rel . $relreend;
250             }
251              
252 10 50       35 $perlcode .= <<'EOF' if ($verbose & 16);
253             print STDERR "Gives : $_[0]\n";
254             EOF
255              
256 10         18 $perlcode .= <<'EOF';
257             return $substitutions;
258             }
259             EOF
260 10 50       24 print STDERR "creating substitutor function as follows\n",$perlcode, "\n"
261             if ($verbose & 32);
262 10         2143 my $returnme=(eval $perlcode);
263 10 50       34 if ($@) {
264 0         0 chomp $@; # to get line no in message
265 0         0 die "sub creation failed: $@";
266             }
267 10         36 return $returnme;
268             }
269              
270             =head2 gen_file_substitutor(, , [args...])
271              
272             This function returns a function which will act on a text file or
273             other file which can be treated as a text file and will carry out URL
274             substitutions within it.
275              
276             The returned code reference should be called with a filename as an
277             argument, it will then replace all occurrences of original url with
278             new url.
279              
280             There are various options to this which can be set by putting various
281             key value pairs in the call.
282              
283             fakeit - set to create a function which actually does nothing
284              
285             tree_mode - set to true to substitute also URLs which are "beneath"
286             original url
287              
288             keep_orig - set to false to inhibit creation of backup files
289              
290             relative - substitute also relative relative URLs which are equivalent
291             to original url (requires file_to_url)
292              
293             file_to_url - provide a function which can translate a given filename
294             to a URL, so we can work out relative URLs for the current
295             file.
296              
297              
298             so a call like
299              
300             $subs=gen_file_substitutor
301             ("http://www.example.com/friendstuff/old",
302             "http://www.example.com/friendstuff/new",
303             relative => 1, tree_mode => 1;
304             file_to_url =>
305             sub { my $ret=shift;
306             $ret =~ return s,/var/www/me,http://www.example.com/mystuff,;
307             return $ret});
308              
309             &$subs("/var/www/me/index.html");
310             &$subs("/var/www/me/friends.html");
311              
312             should allow you to fix your web pages if your friend renames a whole
313             directory.
314              
315             =head1 BUGS
316              
317             One problem with directory substitutors is treatment of the two different urls
318              
319             http://fred.jim/eating/
320              
321             and
322              
323             http://fred.jim/eating
324              
325             Most of the time, the latter of the pair is really just a mistaken
326             reference to the earlier. This is B always true. What is more,
327             where it is true, a user of LinkController will usually have changed
328             to the correct version. For this reason, if gen_directory_substitutor
329             is passed the first form of a url, it will not substitute the second.
330             If passed the second, it will substitute the first.
331              
332             We have to be fed whole URLs at a time. If a url is split between two
333             different chunks then we may not handle it correctly. Always feeding
334             in a complete line protects us from this because a URL cannot contain
335             an unencoded line break.
336              
337             =cut
338              
339              
340 3     3   30 use vars qw($tmpdir $tmpref $tmpname $keeporig_default);
  3         6  
  3         2655  
341             $tmpdir="/tmp/";
342             $tmpref="link_repair";
343             $tmpname="$tmpdir$tmpref" . "repair.$$";
344             $keeporig_default=1;
345              
346             sub gen_file_substitutor ($$;%) {
347 5     5 1 28630 my $origurl=shift;
348 5         32 my $finalurl=shift;
349 5         39 my %settings=@_;
350              
351 5         23 my $tree_mode=$settings{tree_mode};
352 5         18 my $relative=$settings{relative};
353 5         10 my $file_to_url=$settings{file_to_url};
354 5         11 my $keeporig=$settings{keeporig};
355 5         11 my $fakeit=$settings{fakeit};
356              
357 5 50       28 $keeporig=$keeporig_default unless defined $keeporig;
358              
359 5 50       26 print STDERR "generating a file substitutor\n" if $verbose;
360 5 50       18 $verbose & 32 and do {
361 0         0 print STDERR <
362             From: $origurl
363             To: $finalurl
364             Settings:-
365             EOF
366 0 0       0 print "keeporig: " . ( $keeporig ? $keeporig : "undef" )
    0          
    0          
    0          
367             . " relative: " . ( $relative ? $relative : "undef" )
368             . " file_to_url: " . ( $file_to_url ? $file_to_url : "undef" )
369             . " fakeit: " . ( $fakeit ? $fakeit : "undef" ) . " \n";
370             };
371              
372 5         9 my $subs;
373 5 50       23 if ($relative) {
374 0 0       0 $file_to_url
375             or die "relative substitution needs a file_to_url translator"
376             } else {
377 5         73 $subs = gen_substitutor($origurl,$finalurl,$tree_mode);
378             }
379              
380             return sub () {
381 6     6   27 my $filename=shift;
382              
383 6 50 50     26 print STDERR "file handler called for $filename\n" if $verbose && 8;
384              
385 6 50       17 if ($relative) {
386 0         0 my $baseuri=&$file_to_url($filename);
387 0 0       0 print STDERR "URI for file $filename is $baseuri\n" if $verbose;
388 0         0 $subs = gen_substitutor($origurl,$finalurl,$tree_mode, $baseuri);
389             }
390              
391 6         14 my $fixed=0;
392              
393 6 50       15 die "file handler called with undefined values" unless defined $filename;
394 6 50       162 -d $filename && return 0;
395 6 50       84 -f $filename or do {warn "can't fix special file $filename"; return 0};
  0         0  
  0         0  
396              
397 6 50       33 if ($fakeit) {
398 0         0 print STDERR "pretending to edit $filename\n";
399 0 0       0 -W $filename or warn "file $filename can't be edited";
400             } else {
401             open (FIXFILE, "<$filename")
402 6 50       252 or do { die "can't access $filename"; return 0};
  0         0  
  0         0  
403 6 50       610 open (TMPFILE, ">$tmpname") or die "can't use tempfile $tmpname";
404 6         163 while () {
405 145         4568 $fixed += &$subs( $_);
406 145         669 print TMPFILE $_;
407             }
408 6         358 close TMPFILE;
409 6         52 close FIXFILE;
410             #FIXME edit failure?? LOGME
411 6 50       20 print STDERR "Changed links in file $filename\n"
412             if $WWW::Link::Repair::verbose & 16;
413             #I think this is the key bit of the program which needs to be SUID
414             #and could even be separated out for more security.. <
415 6 50       634 rename($filename, $filename . ".orig") if $keeporig;
416 6         71 copy($tmpname, $filename);
417             #EOSU
418 6         2797 unlink $tmpname; #assuming we used it..
419             }
420 6         32 return $fixed;
421             }
422 5         61 }
423              
424              
425              
426             1;
427              
428