File Coverage

blib/lib/URI/Find.pm
Criterion Covered Total %
statement 131 136 96.3
branch 40 56 71.4
condition 20 26 76.9
subroutine 22 23 95.6
pod 11 13 84.6
total 224 254 88.1


line stmt bran cond sub pod time code
1             # Copyright (c) 2000, 2009 Michael G. Schwern. All rights reserved.
2             # This program is free software; you can redistribute it and/or modify
3             # it under the same terms as Perl itself.
4              
5             package URI::Find;
6              
7             require 5.006;
8              
9 6     6   123039 use strict;
  6         16  
  6         340  
10 6     6   33 use base qw(Exporter);
  6         11  
  6         792  
11 6     6   48 use vars qw($VERSION @EXPORT);
  6         9  
  6         576  
12              
13             $VERSION = 20140709;
14             @EXPORT = qw(find_uris);
15              
16 6     6   45 use constant YES => (1==1);
  6         11  
  6         852  
17 6     6   32 use constant NO => !YES;
  6         15  
  6         268  
18              
19 6     6   122 use Carp qw(croak);
  6         12  
  6         14148  
20              
21             require URI;
22              
23             my $reserved = q(;/?:@&=+$,[]);
24             my $mark = q(-_.!~*'());
25             my $unreserved = "A-Za-z0-9\Q$mark\E";
26             my $uric = quotemeta($reserved) . '\p{isAlpha}' . $unreserved . "%";
27              
28             # URI scheme pattern without the non-alpha numerics.
29             # Those are extremely uncommon and interfere with the match.
30             my($schemeRe) = qr/[a-zA-Z][a-zA-Z0-9\+]*/;
31             my($uricSet) = $uric; # use new set
32              
33             # Some schemes which URI.pm does not explicitly support.
34             my $extraSchemesRe = qr{^(?:git|svn|ssh|svn\+ssh)$};
35              
36             # We need to avoid picking up 'HTTP::Request::Common' so we have a
37             # subset of uric without a colon ("I have no colon and yet I must poop")
38             my($uricCheat) = __PACKAGE__->uric_set;
39             $uricCheat =~ tr/://d;
40              
41             # Identifying characters accidentally picked up with a URI.
42             my($cruftSet) = q{])\},.'";}; #'#
43              
44              
45             =head1 NAME
46              
47             URI::Find - Find URIs in arbitrary text
48              
49             =head1 SYNOPSIS
50              
51             require URI::Find;
52              
53             my $finder = URI::Find->new(\&callback);
54              
55             $how_many_found = $finder->find(\$text);
56              
57             =head1 DESCRIPTION
58              
59             This module does one thing: Finds URIs and URLs in plain text. It
60             finds them quickly and it finds them B (or what URI.pm considers
61             a URI to be.) It only finds URIs which include a scheme (http:// or
62             the like), for something a bit less strict have a look at
63             L.
64              
65             For a command-line interface, L is provided.
66              
67             =head2 Public Methods
68              
69             =over 4
70              
71             =item B
72              
73             my $finder = URI::Find->new(\&callback);
74              
75             Creates a new URI::Find object.
76              
77             &callback is a function which is called on each URI found. It is
78             passed two arguments, the first is a URI object representing the URI
79             found. The second is the original text of the URI found. The return
80             value of the callback will replace the original URI in the text.
81              
82             =cut
83              
84             sub new {
85 208 50   208 1 514939 @_ == 2 || __PACKAGE__->badinvo;
86 208         636 my($proto, $callback) = @_;
87 208   33     1624 my($class) = ref $proto || $proto;
88 208         1328 my $self = bless {}, $class;
89              
90 208         1198 $self->{callback} = $callback;
91              
92 208         854 return $self;
93             }
94              
95             =item B
96              
97             my $how_many_found = $finder->find(\$text);
98              
99             $text is a string to search and possibly modify with your callback.
100              
101             Alternatively, C can be called with a replacement function for
102             the rest of the text:
103              
104             use CGI qw(escapeHTML);
105             # ...
106             my $how_many_found = $finder->find(\$text, \&escapeHTML);
107              
108             will not only call the callback function for every URL found (and
109             perform the replacement instructions therein), but also run the rest
110             of the text through C. This makes it easier to turn
111             plain text which contains URLs into HTML (see example below).
112              
113             =cut
114              
115             sub find {
116 211 50 66 211 1 2727 @_ == 2 || @_ == 3 || __PACKAGE__->badinvo;
117 211         378 my($self, $r_text, $escape_func) = @_;
118              
119             # Might be slower, but it makes the code simpler
120 211   100 424   1842 $escape_func ||= sub { return $_[0] };
  424         1362  
121              
122             # Store the escape func in the object temporarily for use
123             # by other methods.
124 211         607 local $self->{escape_func} = $escape_func;
125              
126 211         1017 $self->{_uris_found} = 0;
127              
128             # Yes, evil. Basically, look for something vaguely resembling a URL,
129             # then hand it off to URI for examination. If it passes, throw
130             # it to a callback and put the result in its place.
131 211         1103 local $SIG{__DIE__} = 'DEFAULT';
132 211         668 my $uri_cand;
133             my $uri;
134              
135 211         1067 my $uriRe = sprintf '(?:%s|%s)', $self->uri_re, $self->schemeless_uri_re;
136              
137 211     3   46471 $$r_text =~ s{ (.*?) (?:(<(?:URL:)?)(.+?)(>)|($uriRe)) | (.+?)$ }{
  3         5599  
  3         96  
  3         47  
138 424         376728 my $replace = '';
139 424 100       1810 if( defined $6 ) {
140 111         322 $replace = $escape_func->($6);
141             }
142             else {
143 313         639 my $maybe_uri = '';
144              
145 313 100       1918 $replace = $escape_func->($1) if length $1;
146              
147 313 100       1508 if( defined $2 ) {
148 60         268 $maybe_uri = $3;
149 60         89 my $is_uri = do { # Don't alter $1...
150 60         169 $maybe_uri =~ s/\s+//g;
151 60         4712 $maybe_uri =~ /^$uriRe/;
152             };
153              
154 60 100       43291 if( $is_uri ) {
155 27         96 $replace .= $escape_func->($2);
156 27         243 $replace .= $self->_uri_filter($maybe_uri);
157 27         90 $replace .= $escape_func->($4);
158             }
159             else {
160             # the whole text inside of the <...> was not a url, but
161             # maybe it has a url (like an HTML link)
162 33         57 my $has_uri = do { # Don't alter $1...
163 33         86 $maybe_uri = $3;
164 33         1154 $maybe_uri =~ /$uriRe/;
165             };
166 33 100       23489 if( $has_uri ) {
167 4         11 my $pre = $2;
168 4         7 my $post = $4;
169 4         6 do { $self->find(\$maybe_uri, $escape_func) };
  4         24  
170 4         12 $replace .= $escape_func->($pre);
171 4         16 $replace .= $maybe_uri; # already escaped by find()
172 4         8 $replace .= $escape_func->($post);
173             }
174             else {
175 29         159 $replace .= $escape_func->($2.$3.$4);
176             }
177             }
178             }
179             else {
180 253         1531 $replace .= $self->_uri_filter($5);
181             }
182             }
183              
184 424         8719 $replace;
185             }gsex;
186              
187 211         9041 return $self->{_uris_found};
188             }
189              
190              
191             sub _uri_filter {
192 280     280   1318 my($self, $orig_match) = @_;
193              
194             # A heuristic. Often you'll see things like:
195             # "I saw this site, http://www.foo.com, and its really neat!"
196             # or "Foo Industries (at http://www.foo.com)"
197             # We want to avoid picking up the trailing paren, period or comma.
198             # Of course, this might wreck a perfectly valid URI, more often than
199             # not it corrects a parse mistake.
200 280         874 $orig_match = $self->decruft($orig_match);
201              
202 280         768 my $replacement = '';
203 280 100       960 if( my $uri = $self->_is_uri(\$orig_match) ) {
204             # It's a URI
205 265         2796 $self->{_uris_found}++;
206 265         1062 $replacement = $self->{callback}->($uri, $orig_match);
207             }
208             else {
209             # False alarm
210 15         59 $replacement = $self->{escape_func}->($orig_match);
211             }
212              
213             # Return recrufted replacement
214 280         1988 return $self->recruft($replacement);
215             }
216              
217              
218             =back
219              
220             =head2 Protected Methods
221              
222             I got a bunch of mail from people asking if I'd add certain features
223             to URI::Find. Most wanted the search to be less restrictive, do more
224             heuristics, etc... Since many of the requests were contradictory, I'm
225             letting people create their own custom subclasses to do what they
226             want.
227              
228             The following are methods internal to URI::Find which a subclass can
229             override to change the way URI::Find acts. They are only to be called
230             B a URI::Find subclass. Users of this module are NOT to use
231             these methods.
232              
233             =over
234              
235             =item B
236              
237             my $uri_re = $self->uri_re;
238              
239             Returns the regex for finding absolute, schemed URIs
240             (http://www.foo.com and such). This, combined with
241             schemeless_uri_re() is what finds candidate URIs.
242              
243             Usually this method does not have to be overridden.
244              
245             =cut
246              
247             sub uri_re {
248 211 50   211 1 1952 @_ == 1 || __PACKAGE__->badinvo;
249 211         648 my($self) = shift;
250 211         1289 return sprintf '%s:[%s][%s#]*', $schemeRe,
251             $uricCheat,
252             $self->uric_set;
253             }
254              
255             =item B
256              
257             my $schemeless_re = $self->schemeless_uri_re;
258              
259             Returns the regex for finding schemeless URIs (www.foo.com and such) and
260             other things which might be URIs. By default this will match nothing
261             (though it used to try to find schemeless URIs which started with C
262             and C).
263              
264             Many people will want to override this method. See L
265             for a subclass does a reasonable job of finding URIs which might be missing
266             the scheme.
267              
268             =cut
269              
270             sub schemeless_uri_re {
271 313 50   313 1 1100 @_ == 1 || __PACKAGE__->badinvo;
272 313         551 my($self) = shift;
273 313         3948 return qr/\b\B/; # match nothing
274             }
275              
276             =item B
277              
278             my $uric_set = $self->uric_set;
279              
280             Returns a set matching the 'uric' set defined in RFC 2396 suitable for
281             putting into a character set ([]) in a regex.
282              
283             You almost never have to override this.
284              
285             =cut
286              
287             sub uric_set {
288 219 50   219 1 973 @_ == 1 || __PACKAGE__->badinvo;
289 219         1929 return $uricSet;
290             }
291              
292             =item B
293              
294             my $cruft_set = $self->cruft_set;
295              
296             Returns a set of characters which are considered garbage. Used by
297             decruft().
298              
299             =cut
300              
301             sub cruft_set {
302 2 50   2 1 16 @_ == 1 || __PACKAGE__->badinvo;
303 2         10 return $cruftSet;
304             }
305              
306             =item B
307              
308             my $uri = $self->decruft($uri);
309              
310             Sometimes garbage characters like periods and parenthesis get
311             accidentally matched along with the URI. In order for the URI to be
312             properly identified, it must sometimes be "decrufted", the garbage
313             characters stripped.
314              
315             This method takes a candidate URI and strips off any cruft it finds.
316              
317             =cut
318              
319             my %balanced_cruft = (
320             '(' => ')',
321             '{' => '}',
322             '[' => ']',
323             '"' => '"',
324             q['] => q['],
325             );
326              
327             sub decruft {
328 280 50   280 1 984 @_ == 2 || __PACKAGE__->badinvo;
329 280         512 my($self, $orig_match) = @_;
330              
331 280         1098 $self->{start_cruft} = '';
332 280         599 $self->{end_cruft} = '';
333              
334 280 100       3391 if( $orig_match =~ s/([\Q$cruftSet\E]+)$// ) {
335             # urls can end with HTML entities if found in HTML so let's put back semicolons
336             # if this looks like the case
337 63         132 my $cruft = $1;
338 63 100 66     275 if( $cruft =~ /^;/ && $orig_match =~ /\&(\#[1-9]\d{1,3}|[a-zA-Z]{2,8})$/) {
339 1         3 $orig_match .= ';';
340 1         4 $cruft =~ s/^;//;
341             }
342              
343 63         514 while( my($open, $close) = each %balanced_cruft ) {
344 315         839 $self->recruft_balanced(\$orig_match, \$cruft, $open, $close);
345             }
346              
347 63 100       224 $self->{end_cruft} = $cruft if $cruft;
348             }
349              
350 280         1366 return $orig_match;
351             }
352              
353              
354             sub recruft_balanced {
355 315     315 0 390 my $self = shift;
356 315         3706 my($orig_match, $cruft, $open, $close) = @_;
357              
358 315         3789 my $open_count = () = $$orig_match =~ m{\Q$open}g;
359 315         3793 my $close_count = () = $$orig_match =~ m{\Q$close}g;
360              
361 315 100 100     3835 if ( $$cruft =~ /\Q$close\E$/ && $open_count == ( $close_count + 1 ) ) {
362 18         45 $$orig_match .= $close;
363 18         208 $$cruft =~ s/\Q$close\E$//;
364             }
365              
366 315         3170 return;
367             }
368              
369              
370             =item B
371              
372             my $uri = $self->recruft($uri);
373              
374             This method puts back the cruft taken off with decruft(). This is necessary
375             because the cruft is destructively removed from the string before invoking
376             the user's callback, so it has to be put back afterwards.
377              
378             =cut
379              
380             #'#
381              
382             sub recruft {
383 280 50   280 1 1004 @_ == 2 || __PACKAGE__->badinvo;
384 280         491 my($self, $uri) = @_;
385              
386 280         1655 return $self->{start_cruft} . $uri . $self->{end_cruft};
387             }
388              
389             =item B
390              
391             my $schemed_uri = $self->schemeless_to_schemed($schemeless_uri);
392              
393             This takes a schemeless URI and returns an absolute, schemed URI. The
394             standard implementation supplies ftp:// for URIs which start with ftp.,
395             and http:// otherwise.
396              
397             =cut
398              
399             sub schemeless_to_schemed {
400 42 50   42 1 305 @_ == 2 || __PACKAGE__->badinvo;
401 42         101 my($self, $uri_cand) = @_;
402              
403 42 100       2549 $uri_cand =~ s|^(
404             or $uri_cand =~ s|^(
405              
406 42         132 return $uri_cand;
407             }
408              
409             =item B
410              
411             $obj->is_schemed($uri);
412              
413             Returns whether or not the given URI is schemed or schemeless. True for
414             schemed, false for schemeless.
415              
416             =cut
417              
418             sub is_schemed {
419 2 50   2 1 1077 @_ == 2 || __PACKAGE__->badinvo;
420 2         5 my($self, $uri) = @_;
421 2         70 return scalar $uri =~ /^
422             }
423              
424             =item I
425              
426             __PACKAGE__->badinvo($extra_levels, $msg)
427              
428             This is used to complain about bogus subroutine/method invocations.
429             The args are optional.
430              
431             =cut
432              
433             sub badinvo {
434 0     0 1 0 my $package = shift;
435 0 0       0 my $level = @_ ? shift : 0;
436 0 0       0 my $msg = @_ ? " (" . shift() . ")" : '';
437 0         0 my $subname = (caller $level + 1)[3];
438 0         0 croak "Bogus invocation of $subname$msg";
439             }
440              
441             =back
442              
443             =head2 Old Functions
444              
445             The old find_uri() function is still around and it works, but its
446             deprecated.
447              
448             =cut
449              
450             # Old interface.
451             sub find_uris (\$&) {
452 64 50   64 0 197543 @_ == 2 || __PACKAGE__->badinvo;
453 64         704 my($r_text, $callback) = @_;
454              
455 64         321 my $self = __PACKAGE__->new($callback);
456 64         327 return $self->find($r_text);
457             }
458              
459              
460             =head1 EXAMPLES
461              
462             Store a list of all URIs (normalized) in the document.
463              
464             my @uris;
465             my $finder = URI::Find->new(sub {
466             my($uri) = shift;
467             push @uris, $uri;
468             });
469             $finder->find(\$text);
470              
471             Print the original URI text found and the normalized representation.
472              
473             my $finder = URI::Find->new(sub {
474             my($uri, $orig_uri) = @_;
475             print "The text '$orig_uri' represents '$uri'\n";
476             return $orig_uri;
477             });
478             $finder->find(\$text);
479              
480             Check each URI in document to see if it exists.
481              
482             use LWP::Simple;
483              
484             my $finder = URI::Find->new(sub {
485             my($uri, $orig_uri) = @_;
486             if( head $uri ) {
487             print "$orig_uri is okay\n";
488             }
489             else {
490             print "$orig_uri cannot be found\n";
491             }
492             return $orig_uri;
493             });
494             $finder->find(\$text);
495              
496              
497             Turn plain text into HTML, with each URI found wrapped in an HTML anchor.
498              
499             use CGI qw(escapeHTML);
500             use URI::Find;
501              
502             my $finder = URI::Find->new(sub {
503             my($uri, $orig_uri) = @_;
504             return qq|$orig_uri|;
505             });
506             $finder->find(\$text, \&escapeHTML);
507             print "
$text
";
508              
509             =cut
510              
511              
512             sub _is_uri {
513 280 50   280   772 @_ == 2 || __PACKAGE__->badinvo;
514 280         534 my($self, $r_uri_cand) = @_;
515              
516 280         485 my $uri = $$r_uri_cand;
517              
518             # Translate schemeless to schemed if necessary.
519 280 100 66     34833 $uri = $self->schemeless_to_schemed($uri) if
520             $uri =~ $self->schemeless_uri_re and
521             $uri !~ /^
522              
523 280         769 eval {
524 280         7611 $uri = URI->new($uri);
525              
526             # Throw out anything with an invalid scheme.
527 280   100     198230 my $has_invalid_scheme = $uri->isa("URI::_foreign") &&
528             $uri->scheme !~ $extraSchemesRe;
529              
530             # Toss out things like http:// but keep file:///
531 280         30761 my $is_empty = $uri =~ m{^$schemeRe://$};
532              
533 280 100 100     3935 undef $uri if $has_invalid_scheme || $is_empty;
534             };
535              
536 280 100 66     8381 if($@ || !defined $uri) { # leave everything untouched, its not a URI.
537 15         131 return NO;
538             }
539             else { # Its a URI.
540 265         2764 return $uri;
541             }
542             }
543              
544              
545             =head1 NOTES
546              
547             Will not find URLs with Internationalized Domain Names or pretty much
548             any non-ascii stuff in them. See
549             L
550              
551              
552             =head1 AUTHOR
553              
554             Michael G Schwern with insight from Uri Gutman,
555             Greg Bacon, Jeff Pinyan, Roderick Schertler and others.
556              
557             Roderick Schertler maintained versions 0.11 to 0.16.
558              
559             Darren Chamberlain wrote urifind.
560              
561              
562             =head1 LICENSE
563              
564             Copyright 2000, 2009-2010 by Michael G Schwern Eschwern@pobox.comE.
565              
566             This program is free software; you can redistribute it and/or
567             modify it under the same terms as Perl itself.
568              
569             See F
570              
571             =head1 SEE ALSO
572              
573             L, L, L, RFC 3986 Appendix C
574              
575             =cut
576              
577             1;