File Coverage

blib/lib/WWW/Link_Controller/URL.pm
Criterion Covered Total %
statement 63 81 77.7
branch 26 54 48.1
condition 1 3 33.3
subroutine 9 9 100.0
pod 3 5 60.0
total 102 152 67.1


line stmt bran cond sub pod time code
1             =head2 link_url
2              
3             C looks at a candidate.
4              
5             =cut
6              
7             package WWW::Link_Controller::URL;
8             $REVISION=q$Revision: 1.8 $ ; $VERSION = sprintf ( "%d.%02d", $REVISION =~ /(\d+).(\d+)/ );
9 1     1   1716 use URI;
  1         7931  
  1         27  
10 1     1   8 use Carp;
  1         2  
  1         83  
11 1     1   17 use strict;
  1         2  
  1         47  
12 1     1   6 use warnings;
  1         2  
  1         1336  
13              
14             our $verbose;
15             our $no_warn;
16              
17             #charset definitions
18              
19             sub RESERVED () { '[;/?:@&=+$]';}
20             sub ALPHA () { '[A-Za-z]'; }
21             sub ALPHA_NUM () { '[A-Za-z0-9]'; }
22             sub SCHEME_CHAR () { '[A-Za-z0-9+.-]'; }
23             sub MARK () { "[-_.!~*'()]"; }
24             sub UNRESERVED () { '(?:' . ALPHA_NUM . '|' . MARK . ')' };
25             sub ESCAPE () { '[%]'; }
26             sub SCHEME () { '(?:(?i)[a-z][a-z0-9+.-]*)';}
27             sub ABSURI () { SCHEME . ':' . '/';}
28              
29             sub CONTROL () { '[\x00-\x1F\x7F]'; }
30              
31             #N.B. % and # normally included here. We don't include % since it is
32             #allowed as the escape character. and we don't include # since within
33             #LinkController we consider the fragment as part of the URL in
34             #contradiction with the standards, since we may be interested to check
35             #that the exact fragment of the resource exists
36             sub DELIMS () { '[<>"]'; }
37             sub UNWISE () { '[{}|\^[]`]'; }
38             sub EXCLUDED () { '(?:'. CONTROL .'|'. DELIMS .'|'. UNWISE .'|'.' '.'|'.'#'.')' };
39             sub URIC () { '(?:' . RESERVED . '|' . UNRESERVED . '|' . ESCAPE . ')' };
40              
41             #sub AUTHORITY () { '(?:(?i)[a-z][a-z0-9+.-]*)';}
42             #sub NET_PATH () { '//' . AUTHORITY . '/' . ABS_PATH; }
43              
44              
45             =head2 verify_url
46              
47             verify url checks that a url is a valid and possible uri in the terms
48             of RFC2396
49              
50             =cut
51              
52             sub verify_url ($) {
53 17     17 1 91 my $url=shift;
54 17         18 my $control=CONTROL;
55             # we don't print it out directly.. maybe we shouldn't even print out
56             # the warning.
57 17 50       60 do { carp "url $url contains control characters" unless $no_warn;
  1 100       3  
58 1         2 return undef; } if $url =~ m/$control/;
59 16         17 my $exclude=EXCLUDED;
60 16         17 my $ex;
61 16 50       187 do { carp "url $url contains excluded character: $ex" unless $no_warn;
  2 100       5  
62 2         5 return undef; } if ($ex) = $url =~ m/($exclude)/;
63              
64             #try to identify invalid schemes. The problem here is that it's possible
65             #to have a : elsewhere in a URL so we have to be very careful.
66              
67 14         18 my $scheme=$url;
68              
69             #chop off anything which is definitely not the scheme.. this gets rid of
70             #the second part of any paths etc. This protects us against relative urls
71             #which have a : in them (N.B.
72              
73 14         30 $scheme =~ s,[#/].*,, ;
74              
75             #now keep the bit preceeding the :
76              
77 14         46 ($scheme) = $scheme =~ m/^([^:]*):/;
78              
79 14 100       34 if ( defined $scheme ) {
80 13         13 my $scheme_re= '^' . ALPHA .'('. ALPHA_NUM ."|". SCHEME_CHAR .')*$' ;
81 13 50       69 do { carp "url $url has illegal scheme: $scheme" unless $no_warn;
  3 100       6  
82 3         8 return undef; } unless $scheme =~ m/$scheme_re/;
83             }
84              
85 11         27 return 1;
86             }
87              
88             =head2 untaint_url
89              
90             Used in our CGI bin programs, untaint_url takes a scalar and returns
91             it untainted if and only if it's contains only valid url characters
92             and it is a valid url according to verify_url.
93              
94             A fundamental assumption in using this function is that your software
95             can handle B which looks like a valid URL, even if it isn't
96             a valid url. E.g. C.
97              
98             =cut
99              
100             sub untaint_url {
101 2     2 1 14 my $url=shift;
102 2         3 my $re='^'. URIC .'+$';
103 2         44 my ($ret)= $url =~ m/($re)/;
104 2 100       6 defined $ret or do {
105             # $url =~ y/[A-Za-z0-9]/_/c;# clean url so we can print it out
106 1 50       4 warn "bad url passed to url_untaint" unless $no_warn;
107 1         2 return undef;
108             };
109 1 50       3 return undef unless verify_url($ret);
110 1         4 return $ret;
111             }
112              
113             =head2 verify_fragment
114              
115             Fragments have fairly free syntax but RFC 2396 says clearly they
116             should conform to the same character set as URIs. Unfortunately, it
117             seems that many people put spaces in their fragments in contradiction
118             with the RFC since it works in HTML in practice.
119              
120             We choose not to accept those and people should be able to change over?
121              
122             If it turns out, as it probably will, that there is a real need for
123             spaces in cross references to other people's documents which can't be
124             fixed then maybe we will have to reconsider.
125              
126             =cut
127              
128             sub verify_fragment ($) {
129 2     2 1 3 my $fragment=shift;
130 2 50       8 defined $fragment or return undef;
131 0         0 my $control=CONTROL;
132             # we don't print it out directly.. maybe we shouldn't even print out
133             # the warning.
134 0 0       0 do { carp "url $fragment contains control characters" unless $no_warn;
  0 0       0  
135 0         0 return undef; } if $fragment =~ m/$control/;
136 0         0 my $exclude=EXCLUDED;
137 0         0 my $ex;
138 0 0       0 do { carp "url $fragment contains excluded character: $ex" unless $no_warn;
  0 0       0  
139 0         0 return undef; } if ($ex) = $fragment =~ m/($exclude)/;
140              
141 0         0 return 1;
142             }
143              
144             sub extract_fragment {
145 2     2 0 3 my $link=shift;
146 2         10 my ($url,$fragment)= $link =~ m/([^#]*)(?:#(.*))?/;
147 2 50       6 $::verbose & 16 and do {
148 0 0       0 print STDERR "URL is $url and fragment is $fragment\n"
149             if defined $fragment;
150 0 0       0 print STDERR "URL is $url no fragment\n" unless defined $fragment;
151             };
152 2         5 return $url,$fragment;
153             }
154              
155             sub fixup_link_url ($$) {
156 2     2 0 21 my $link=shift;
157 2         4 my $base=shift;
158 2 50       6 croak "usage link_url(,)" unless defined $link;
159              
160 2         7 my ($url,$fragment)=extract_fragment($link);
161              
162 2 50       5 unless (verify_url($url)) {
163 0 0       0 warn "dropping url: $url" unless $no_warn;
164 0         0 return undef;
165             };
166              
167 2 50       5 unless (verify_fragment($fragment)) {
168 2 50       5 warn "dropping illegal fragment: $fragment for url $url" if defined $fragment;
169 2         3 $fragment=undef;
170             };
171              
172             $url =~ m,^(?:ftp|gopher|http|https|ldap|rsync|telnet):(?:[^/]|.[^/]),
173 2 50       8 and do {
174 0         0 warn "ERROR: ignoring relative url with scheme $url";
175 0         0 return undef;
176             };
177              
178 2         13 my $urlo=URI->new($url);
179 2         9055 my $aurlo=$urlo->abs($base);
180 2         553 my $ret_url;
181 2 100       7 if ( URI::eq($urlo,$aurlo) ) {
182 1         322 $ret_url = $url;
183             } else {
184 1         10 $ret_url=$aurlo->as_string();
185             }
186              
187             $ret_url =~ m,^(?:ftp|gopher|http|https|ldap|rsync|telnet):(?:[^/]|.[^/]),
188 2 50       26 and do {
189 0         0 warn "ERROR: abs(url) $url gave $ret_url";
190 0         0 return undef;
191             };
192              
193 2 50 33     15 print STDERR "fixed up link name $url\n"
194             if $::verbose & 16 and defined $url;
195 2         36 return $ret_url;
196             }
197              
198             99;