File Coverage

blib/lib/Combine/selurl.pm
Criterion Covered Total %
statement 79 157 50.3
branch 20 70 28.5
condition 6 32 18.7
subroutine 10 13 76.9
pod 0 7 0.0
total 115 279 41.2


line stmt bran cond sub pod time code
1             package Combine::selurl;
2              
3 5     5   1469 use strict;
  5         10  
  5         245  
4              
5             our $AUTOLOAD;
6              
7 5     5   6006 use URI;
  5         40520  
  5         171  
8             #use URI::Escape;
9             #use URI::http;
10             #use URI::https;
11             #use URI::ftp;
12              
13 5     5   48 use Carp;
  5         10  
  5         430  
14              
15 5     5   604 use Combine::Config;
  5         15  
  5         9813  
16              
17             1; # package return thang
18              
19             ##########################################################
20             sub AUTOLOAD
21             {
22 143     143   2484 my $whatever = $AUTOLOAD;
23 143         555 $whatever =~ s/^Combine::selurl\:\://;
24              
25 143 50       471 return if $whatever eq 'DESTROY';
26              
27 143         181 my($self) = shift;
28              
29 143         178 my $uri = $self->{'URI'};
30 143         561 return $uri->$whatever(@_);
31             }
32              
33              
34             sub new
35             {
36 17     17 0 236 my($class, $uristr, $scheme, %opt) = @_;
37              
38 17         23 my $self = {};
39              
40 17 100       60 if($opt{'sloppy'})
41             {
42             # If the original input has no scheme (like 'www.dtu.dk/some/path'
43             # or even just 'www.dtu.dk'), and does not start with //, we'll
44             # prepend http:// in the assumption the first thing is a hostname.
45              
46 1 50       9 if( $uristr !~ /^\w+:/ )
47             {
48 0 0       0 if($uristr =~ m|^//|)
49             {
50 0         0 $uristr = 'http:' . $uristr;
51             } else {
52 0         0 $uristr = 'http://' . $uristr;
53             }
54             }
55             }
56              
57             # Whether sloppy is set or not, we will provide URI->new with a
58             # default method to avoid a crash on host() overloading
59 17 50       34 $scheme = 'http' unless $scheme;
60              
61              
62 17         84 $self->{'URI'} = new URI($uristr, $scheme);
63             # quickly catch if we can handle this scheme at all; otherwise
64             # URI will cause trouble when we try to use host()
65              
66 17 50       11945 return undef unless $self->{'URI'}->scheme() =~ /^http$|^https$|^ftp$/;
67              
68 17         416 $self->{'orguri'} = $uristr;
69              
70 17         30 bless $self, $class;
71 17 50       55 return undef unless $self->{'URI'}->host();
72 17 50       431 return undef unless $self->_init();
73              
74 17         51 return $self;
75             }
76              
77              
78             # We need new_abs to fulfill the pledge of inheritance from URI.
79             sub new_abs
80             {
81 39     39 0 1125 my($class, $uristr, $base, %opt) = @_;
82              
83             # Does NOT accept sloppy option!
84              
85 39         60 my $self = {};
86             # $self->{'URI'} = new_abs URI($uristr, $base);
87 39         117 $self->{'URI'} = URI->new_abs($uristr, $base);
88 39         18588 $self->{'orguri'} = $uristr;
89              
90 39         71 bless $self, $class;
91 39         77 $self->_init();
92 39         107 return $self;
93             }
94              
95              
96             sub _init
97             {
98             # common instance initialiser for new*
99 56     56   68 my $self = shift;
100 56         82 my $uri = $self->{'URI'};
101              
102             # Set some elements we're probably going to use anyway
103              
104 56         124 my $h = $uri->host();
105 56         1152 $self->{'orghost'} = $h; # without default port
106 56         140 $self->{'orgport'} = $uri->port(); # even default port
107             # TODO: may want to use host:port as id
108 56   33     1285 $self->{'preferredhost'} = $Combine::Config::serverbyalias{$h} || $h;
109 56         95 $self->{'normalised'} = '';
110 56         72 $self->{'dirtycgi'} = 0; # Flag if cgi normalisation detected duplication
111 56         135 $self->{'invalidreason'} = '';
112              
113 56         245 $self->{'normlevel'} = {'prefhost' => 1,
114             'canonical' => 1,
115             'cgidup' => 2, # 1=keys, 2=key+val
116             'cgisessid' => 1,
117             'fragment' => 1,
118             'colapsedots' => 1
119             };
120              
121 56         169 $self->{'validlevel'} = {'checkallow' => 1,
122             'checkexclude' => 1,
123             'checklength' => 1,
124             'checkcgidup' => 0 # needn't be set if cgidup norm is set
125             };
126 56         108 return 1;
127             }
128              
129              
130             sub normlevel
131             {
132             # Get/set normalisation options
133 0     0 0 0 my($self, %newnorm) = @_;
134             # TODO: check option validity
135              
136 0         0 my %oldnorm = %{$self->{'normlevel'}};
  0         0  
137              
138 0 0       0 if(%newnorm)
139             {
140             # I assume Perl assigns values left-to-right... otherwise...
141 0         0 my %tmp = (%oldnorm, %newnorm);
142 0         0 $self->{'normlevel'} = \%tmp;
143 0         0 $self->{'normalised'} = ''; # clean cache
144 0         0 $self->{'dirtycgi'} = 0;
145             # do not reset invalidreason
146             }
147              
148 0         0 return (%oldnorm);
149             }
150              
151              
152             sub validlevel
153             {
154             # Get/set validation barriers
155 0     0 0 0 my($self, %newlevel) = @_;
156             # TODO: check option validity
157              
158 0         0 my %oldlevel = %{$self->{'validlevel'}};
  0         0  
159              
160 0 0       0 if(%newlevel)
161             {
162             # I assume Perl assigns values left-to-right... otherwise...
163 0         0 my %tmp = (%oldlevel, %newlevel);
164 0         0 $self->{'validlevel'} = \%tmp;
165 0         0 $self->{'normalised'} = ''; # clean cache
166 0         0 $self->{'dirtycgi'} = 0;
167 0         0 $self->{'invalidreason'} = '';
168             }
169              
170 0         0 return (%oldlevel);
171             }
172              
173              
174             sub validate
175             {
176 1     1 0 8 my $self = shift;
177              
178             #Check if global init done
179             # if (!defined($selurl_init_done)) {
180             # if(!selurl_init())
181             # {
182             # die("Could not initialise selurl global config data");
183             # }
184             # $selurl_init_done=1;
185             # }
186              
187             #####
188             # 1: normalise. Always.
189 1         5 my $norm = $self->normalise();
190              
191 1         2 my %validlevel = %{$self->{'validlevel'}};
  1         5  
192              
193              
194             #####
195             # 2 test length
196 1 0 33     10 if($validlevel{'checklength'} and length($norm) > Combine::Config::Get('maxUrlLength') )
197             {
198 0         0 $self->{'invalidreason'} = 'length: ' . length($norm) . ' > ' . Combine::Config::Get('maxUrlLength');
199 0         0 return 0;
200             }
201              
202              
203             #####
204             # 3 test allow
205              
206             # TODO: do we need a host:port comparison for host entries?
207              
208 0 0       0 if($validlevel{'checkallow'})
209             {
210 0         0 my $allow = 0;
211 0         0 foreach my $rule (@Combine::Config::allow)
212             {
213 0         0 my($hostind, $patt, $orgpatt) = @{$rule};
  0         0  
214 0 0 0     0 if($hostind eq 'H' and $self->{'preferredhost'} =~ $patt)
    0 0        
215             {
216 0         0 $allow = 1;
217 0         0 last;
218             } elsif($hostind ne 'H' and $norm =~ $patt) {
219 0         0 $allow = 1;
220 0         0 last;
221             }
222             }
223 0 0       0 if(!$allow)
224             {
225 0         0 $self->{'invalidreason'} = "allow: nomatch";
226 0         0 return 0;
227             }
228             }
229              
230              
231             #####
232             # 4 test exclude
233 0 0       0 if($validlevel{'checkexclude'})
234             {
235 0         0 my $exclude = 0;
236 0         0 my $havocpatt;
237 0         0 foreach my $rule (@Combine::Config::exclude)
238             {
239 0         0 my($hostind, $patt, $orgpatt) = @{$rule};
  0         0  
240 0 0 0     0 if($hostind eq 'H' and $self->{'preferredhost'} =~ $patt)
    0 0        
241             {
242 0         0 $exclude = 1;
243 0         0 $havocpatt = $orgpatt;
244 0         0 last;
245             } elsif($hostind ne 'H' and $norm =~ $patt) {
246 0         0 $exclude = 1;
247 0         0 $havocpatt = $orgpatt;
248 0         0 last;
249             }
250             }
251 0 0       0 if($exclude)
252             {
253 0         0 $self->{'invalidreason'} = "exclude: $havocpatt";
254 0         0 return 0;
255             }
256              
257             }
258              
259             #####
260             # 5 test CGI repetition sanity
261 0 0       0 if($validlevel{'checkcgidup'})
262             {
263             # Hmmm.... TODO: how to combine this with norm:cgidup settings?
264             }
265              
266 0         0 return 1;
267             }
268              
269              
270             sub normalise
271             {
272 56     56 0 245 my($self, %opt) = @_;
273              
274             # If cached, no action unless force option is set
275 56 50 33     164 return $self->{'normalised'} if $self->{'normalised'} && !$opt{'force'};
276              
277 56         54 my %level = %{$self->{'normlevel'}};
  56         260  
278 56         106 my $newuri = $self->{'URI'};
279              
280             # 1: goodbye fragment. Buglet: if you set a '' fragment, URI appends # to
281             # all URIs.
282 56 50       202 $newuri->fragment(undef) if $level{'fragment'};
283              
284             # 2: Set preferred server.
285 56 50       516 $newuri->host($self->{'preferredhost'}) if $level{'prefhost'};
286              
287             # TODO: first canonical
288              
289             # 3: perform URI->canonical: remove default port dep. on method,
290             # lowercase host, unnecessary % escape removal
291             #
292             # We want to do this before CGI normalisation, because there
293             # is a slight chance that canonical() changes the CGI string.
294             # This implies we need to reconstruct a new URI from the
295             # canonical string.
296              
297 56 50       8852 if($level{'canonical'})
298             {
299 56         165 $self->{'URI'} = $self->{'URI'}->canonical();
300 56         4859 $newuri = $self->{'URI'};
301             }
302              
303             # clean CGI repetition (groovy)
304 56         164 my $q = $newuri->query();
305 56         421 my $newq;
306             # TODO: add sessid to cleanquery based on cgisessid option
307 56 50 33     1385 if($q and $level{'cgidup'} == 1)
    50 33        
308             {
309 0         0 $newq = cleanquery($q, 'unique' => 'keys', 'cleansessions' => $level{'cgisessid'});
310 0         0 $newuri->query($newq);
311             } elsif($q and $level{'cgidup'} == 2) {
312 0         0 $newq = cleanquery($q, 'unique' => 'kvpairs', 'cleansessions' => $level{'cgisessid'});
313 0         0 $newuri->query($newq);
314             }
315              
316 56 50 33     121 $self->{'dirtycgi'} = 1 if $newq and $newq ne $q;
317              
318 56 50       104 if($level{'colapsedots'})
319             {
320             #remove a '.' if last in host
321 56         128 my $host = $newuri->host;
322 56 100       1604 if ( $host =~ s|\.$|| ) { $newuri->host($host); }
  6         16  
323             #remove '%20' (space) if in host
324 56 100       1660 if ( $host =~ s|\s+||g ) { $newuri->host($host); }
  4         10  
325             #collapsing ./, ../ in the path
326             # You can also have the abs() method ignore excess ".." segments in the
327             # relative URI by setting $URI::ABS_REMOTE_LEADING_DOTS to a TRUE value.
328 56         362 my $path = $newuri->path;
329 56         1295 while ( $path =~ s%[^/]+/\.\./?|/\./|//|/\.$|/\.\.$%/% ) { }
330 56         300 $newuri->path($path);
331             }
332              
333             #more expensive normalization like adding trailing '/' if validated by database entries
334              
335             # We'll only keep the string, not the normalised URI object
336 56         1812 $self->{'normalised'} = $newuri->as_string();
337 56         339 return $self->{'normalised'};
338             }
339              
340             sub cleanquery
341             {
342             # Class method.
343             # check and clean a query string from repetitive keys or key/value
344             # pairs. Maintain the order in which elements are specified.
345              
346 0     0 0   my($query, %opt) = @_;
347              
348             # which criterion to use for determining repetition: 'keys' or
349             # 'kvpairs' (default)
350 0   0       my $unique = $opt{'unique'} || 'kvpairs';
351              
352             # Which keys are session ids and must be omitted entirely.
353             # Very experimental. Might go to configuration.
354             # Case sensitive!
355            
356 0           my %sessid;
357 0 0         if($opt{'cleansessions'} == 1)
358             {
359 0           my $c = Combine::Config::Get('url');
360             # %sessid = %{${%{$c}}{'sessionids'}}; #Problems with Ubuntu 9.10
361 0           %sessid = %{$c->{'sessionids'}};
  0            
362             }
363              
364             #OLD
365             # if($opt{'sessionid'})
366             # {
367             # foreach my $s (split(',', $opt{'sessionid'}))
368             # {
369             # $sessid{$s} = 1;
370             # }
371             # }
372             #/OLD
373              
374 0           my $cleaned= '';
375 0           my @kvpairs = split('\&', $query);
376 0           my(@keys, @values, %seen);
377 0           foreach my $item (@kvpairs)
378             {
379 0           my($k, undef, $v) = $item =~ /(\w+)(=(.*))?/;
380 0 0         next if defined($sessid{$k});
381 0 0         if($unique eq 'kvpairs')
382             {
383 0 0         $cleaned .= '&' . $item unless $seen{$item};
384 0           $seen{$item}++;
385             } else { # 'keys'
386 0 0         $cleaned .= '&' . $item unless $seen{$k};
387 0           $seen{$k}++;
388             }
389             }
390              
391 0           return substr($cleaned,1);
392             }
393              
394              
395             =pod
396              
397             =head1 NAME
398              
399             selurl - Normalise and validate URIs for harvesting
400              
401             =head1 INTRODUCTION
402              
403             Selurl selects and normalises URIs on basis of both general practice
404             (hostname lowercasing, portnumber substsitution etc.) and
405             Combine-specific handling (aplpying config_allow, config_exclude,
406             config_serveralias and other relevant config settings).
407              
408             The Config settings catered for currently are:
409              
410             maxUrlLength - the maximum length of an unnormalised URL
411             allow - Perl regular to identify allowed URLs
412             exclude - Perl regular expressions to exclude URLs from harvesting
413             serveralias - Aliases of server names
414             sessionids - List sessionid markers to be removed
415              
416             A selurl object can hold a single URL and has methods to obtain its
417             subparts as defined in URI.pm, plus some methods to normalise and
418             validate it in Combine context.
419              
420              
421             =head1 BUGS
422              
423             Currently, the only schemes supported are http, https and ftp. Others
424             may or may not work correctly. For one thing, we assume the scheme has
425             an internet hostname/port.
426              
427             clone() will only return a copy of the real URI object, not a new
428             selurl.
429              
430             URI URI-escapes the strings fed into it by new() once. Existing
431             percent signs in the input are left untouched, which implicates that:
432              
433             (a) there is no risk of double-encoding; and
434              
435             (b) if the original contained an inadvertent sequence that could
436             be interpreted as an escape sequence, uri_unescape will not
437             render the original input (e.g. url_with_%66_in_it goes whoop)
438             If you know that the original has not yet been escaped and wish to
439             safeguard potential percent signs, you'll have to escape them (and
440             only them) once before you offer it to new().
441              
442             A problem with URI is, that its object is not a hash we can
443             piggyback our data on, so I had to resort to AUTOLOAD to emulate
444             inheritance. I find this ugly, but well, this *is* Perl, so what'd
445             you expect?
446              
447             =cut
448