File Coverage

blib/lib/URI/Find.pm
Criterion Covered Total %
statement 134 139 96.4
branch 40 56 71.4
condition 20 26 76.9
subroutine 23 24 95.8
pod 11 13 84.6
total 228 258 88.3


line stmt bran cond sub pod time code
1             package URI::Find;
2              
3             require 5.006;
4              
5 6     6   67870 use strict;
  6         10  
  6         155  
6 6     6   21 use warnings;
  6         8  
  6         170  
7 6     6   28 use base qw(Exporter);
  6         7  
  6         518  
8 6     6   25 use vars qw($VERSION @EXPORT);
  6         5  
  6         440  
9              
10             $VERSION = 20160806;
11             @EXPORT = qw(find_uris);
12              
13 6     6   27 use constant YES => (1==1);
  6         7  
  6         407  
14 6     6   25 use constant NO => !YES;
  6         6  
  6         227  
15              
16 6     6   22 use Carp qw(croak);
  6         7  
  6         8926  
17              
18             require URI;
19              
20             my $reserved = q(;/?:@&=+$,[]);
21             my $mark = q(-_.!~*'());
22             my $unreserved = "A-Za-z0-9\Q$mark\E";
23             my $uric = quotemeta($reserved) . '\p{isAlpha}' . $unreserved . "%";
24              
25             # URI scheme pattern without the non-alpha numerics.
26             # Those are extremely uncommon and interfere with the match.
27             my($schemeRe) = qr/[a-zA-Z][a-zA-Z0-9\+]*/;
28             my($uricSet) = $uric; # use new set
29              
30             # Some schemes which URI.pm does not explicitly support.
31             my $extraSchemesRe = qr{^(?:git|svn|ssh|svn\+ssh)$};
32              
33             # We need to avoid picking up 'HTTP::Request::Common' so we have a
34             # subset of uric without a colon ("I have no colon and yet I must poop")
35             my($uricCheat) = __PACKAGE__->uric_set;
36             $uricCheat =~ tr/://d;
37              
38             # Identifying characters accidentally picked up with a URI.
39             my($cruftSet) = q{])\},.'";}; #'#
40              
41              
42             =head1 NAME
43              
44             URI::Find - Find URIs in arbitrary text
45              
46             =head1 SYNOPSIS
47              
48             require URI::Find;
49              
50             my $finder = URI::Find->new(\&callback);
51              
52             $how_many_found = $finder->find(\$text);
53              
54             =head1 DESCRIPTION
55              
56             This module does one thing: Finds URIs and URLs in plain text. It
57             finds them quickly and it finds them B (or what URI.pm considers
58             a URI to be.) It only finds URIs which include a scheme (http:// or
59             the like), for something a bit less strict have a look at
60             L.
61              
62             For a command-line interface, L is provided.
63              
64             =head2 Public Methods
65              
66             =over 4
67              
68             =item B
69              
70             my $finder = URI::Find->new(\&callback);
71              
72             Creates a new URI::Find object.
73              
74             &callback is a function which is called on each URI found. It is
75             passed two arguments, the first is a URI object representing the URI
76             found. The second is the original text of the URI found. The return
77             value of the callback will replace the original URI in the text.
78              
79             =cut
80              
81             sub new {
82 208 50   208 1 216205 @_ == 2 || __PACKAGE__->badinvo;
83 208         275 my($proto, $callback) = @_;
84 208   33     893 my($class) = ref $proto || $proto;
85 208         392 my $self = bless {}, $class;
86              
87 208         437 $self->{callback} = $callback;
88              
89 208         349 return $self;
90             }
91              
92             =item B
93              
94             my $how_many_found = $finder->find(\$text);
95              
96             $text is a string to search and possibly modify with your callback.
97              
98             Alternatively, C can be called with a replacement function for
99             the rest of the text:
100              
101             use CGI qw(escapeHTML);
102             # ...
103             my $how_many_found = $finder->find(\$text, \&escapeHTML);
104              
105             will not only call the callback function for every URL found (and
106             perform the replacement instructions therein), but also run the rest
107             of the text through C. This makes it easier to turn
108             plain text which contains URLs into HTML (see example below).
109              
110             =cut
111              
112             sub find {
113 211 50 66 211 1 876 @_ == 2 || @_ == 3 || __PACKAGE__->badinvo;
114 211         225 my($self, $r_text, $escape_func) = @_;
115              
116             # Might be slower, but it makes the code simpler
117 211   100 422   879 $escape_func ||= sub { return $_[0] };
  422         1003  
118              
119             # Store the escape func in the object temporarily for use
120             # by other methods.
121 211         409 local $self->{escape_func} = $escape_func;
122              
123 211         307 $self->{_uris_found} = 0;
124              
125             # Yes, evil. Basically, look for something vaguely resembling a URL,
126             # then hand it off to URI for examination. If it passes, throw
127             # it to a callback and put the result in its place.
128 211         768 local $SIG{__DIE__} = 'DEFAULT';
129 211         215 my $uri_cand;
130             my $uri;
131              
132 211         431 my $uriRe = sprintf '(?:%s|%s)', $self->uri_re, $self->schemeless_uri_re;
133              
134 211     3   21812 $$r_text =~ s{ (.*?) (?:(<(?:URL:)?)(.+?)(>)|($uriRe)) | (.+?)$ }{
  3         2231  
  3         27  
  3         39  
135 422         167140 my $replace = '';
136 422 100       1032 if( defined $6 ) {
137 109         194 $replace = $escape_func->($6);
138             }
139             else {
140 313         351 my $maybe_uri = '';
141              
142 313 100       1268 $replace = $escape_func->($1) if length $1;
143              
144 313 100       763 if( defined $2 ) {
145 60         107 $maybe_uri = $3;
146 60         70 my $is_uri = do { # Don't alter $1...
147 60         144 $maybe_uri =~ s/\s+//g;
148 60         2601 $maybe_uri =~ /^$uriRe/;
149             };
150              
151 60 100       16785 if( $is_uri ) {
152 27         78 $replace .= $escape_func->($2);
153 27         96 $replace .= $self->_uri_filter($maybe_uri);
154 27         79 $replace .= $escape_func->($4);
155             }
156             else {
157             # the whole text inside of the <...> was not a url, but
158             # maybe it has a url (like an HTML link)
159 33         37 my $has_uri = do { # Don't alter $1...
160 33         62 $maybe_uri = $3;
161 33         760 $maybe_uri =~ /$uriRe/;
162             };
163 33 100       4228 if( $has_uri ) {
164 4         7 my $pre = $2;
165 4         6 my $post = $4;
166 4         3 do { $self->find(\$maybe_uri, $escape_func) };
  4         18  
167 4         8 $replace .= $escape_func->($pre);
168 4         8 $replace .= $maybe_uri; # already escaped by find()
169 4         7 $replace .= $escape_func->($post);
170             }
171             else {
172 29         132 $replace .= $escape_func->($2.$3.$4);
173             }
174             }
175             }
176             else {
177 253         656 $replace .= $self->_uri_filter($5);
178             }
179             }
180              
181 422         4756 $replace;
182             }gsex;
183              
184 211         3317 return $self->{_uris_found};
185             }
186              
187              
188             sub _uri_filter {
189 280     280   585 my($self, $orig_match) = @_;
190              
191             # A heuristic. Often you'll see things like:
192             # "I saw this site, http://www.foo.com, and its really neat!"
193             # or "Foo Industries (at http://www.foo.com)"
194             # We want to avoid picking up the trailing paren, period or comma.
195             # Of course, this might wreck a perfectly valid URI, more often than
196             # not it corrects a parse mistake.
197 280         563 $orig_match = $self->decruft($orig_match);
198              
199 280         304 my $replacement = '';
200 280 100       670 if( my $uri = $self->_is_uri(\$orig_match) ) {
201             # It's a URI
202 265         1525 $self->{_uris_found}++;
203 265         690 $replacement = $self->{callback}->($uri, $orig_match);
204             }
205             else {
206             # False alarm
207 15         48 $replacement = $self->{escape_func}->($orig_match);
208             }
209              
210             # Return recrufted replacement
211 280         1594 return $self->recruft($replacement);
212             }
213              
214              
215             =back
216              
217             =head2 Protected Methods
218              
219             I got a bunch of mail from people asking if I'd add certain features
220             to URI::Find. Most wanted the search to be less restrictive, do more
221             heuristics, etc... Since many of the requests were contradictory, I'm
222             letting people create their own custom subclasses to do what they
223             want.
224              
225             The following are methods internal to URI::Find which a subclass can
226             override to change the way URI::Find acts. They are only to be called
227             B a URI::Find subclass. Users of this module are NOT to use
228             these methods.
229              
230             =over
231              
232             =item B
233              
234             my $uri_re = $self->uri_re;
235              
236             Returns the regex for finding absolute, schemed URIs
237             (http://www.foo.com and such). This, combined with
238             schemeless_uri_re() is what finds candidate URIs.
239              
240             Usually this method does not have to be overridden.
241              
242             =cut
243              
244             sub uri_re {
245 211 50   211 1 425 @_ == 1 || __PACKAGE__->badinvo;
246 211         239 my($self) = shift;
247 211         456 return sprintf '%s:[%s][%s#]*', $schemeRe,
248             $uricCheat,
249             $self->uric_set;
250             }
251              
252             =item B
253              
254             my $schemeless_re = $self->schemeless_uri_re;
255              
256             Returns the regex for finding schemeless URIs (www.foo.com and such) and
257             other things which might be URIs. By default this will match nothing
258             (though it used to try to find schemeless URIs which started with C
259             and C).
260              
261             Many people will want to override this method. See L
262             for a subclass does a reasonable job of finding URIs which might be missing
263             the scheme.
264              
265             =cut
266              
267             sub schemeless_uri_re {
268 313 50   313 1 724 @_ == 1 || __PACKAGE__->badinvo;
269 313         334 my($self) = shift;
270 313         2501 return qr/\b\B/; # match nothing
271             }
272              
273             =item B
274              
275             my $uric_set = $self->uric_set;
276              
277             Returns a set matching the 'uric' set defined in RFC 2396 suitable for
278             putting into a character set ([]) in a regex.
279              
280             You almost never have to override this.
281              
282             =cut
283              
284             sub uric_set {
285 219 50   219 1 443 @_ == 1 || __PACKAGE__->badinvo;
286 219         1214 return $uricSet;
287             }
288              
289             =item B
290              
291             my $cruft_set = $self->cruft_set;
292              
293             Returns a set of characters which are considered garbage. Used by
294             decruft().
295              
296             =cut
297              
298             sub cruft_set {
299 2 50   2 1 9 @_ == 1 || __PACKAGE__->badinvo;
300 2         7 return $cruftSet;
301             }
302              
303             =item B
304              
305             my $uri = $self->decruft($uri);
306              
307             Sometimes garbage characters like periods and parenthesis get
308             accidentally matched along with the URI. In order for the URI to be
309             properly identified, it must sometimes be "decrufted", the garbage
310             characters stripped.
311              
312             This method takes a candidate URI and strips off any cruft it finds.
313              
314             =cut
315              
316             my %balanced_cruft = (
317             '(' => ')',
318             '{' => '}',
319             '[' => ']',
320             '"' => '"',
321             q['] => q['],
322             );
323              
324             sub decruft {
325 280 50   280 1 635 @_ == 2 || __PACKAGE__->badinvo;
326 280         390 my($self, $orig_match) = @_;
327              
328 280         526 $self->{start_cruft} = '';
329 280         445 $self->{end_cruft} = '';
330              
331 280 100       2203 if( $orig_match =~ s/([\Q$cruftSet\E]+)$// ) {
332             # urls can end with HTML entities if found in HTML so let's put back semicolons
333             # if this looks like the case
334 63         125 my $cruft = $1;
335 63 100 66     219 if( $cruft =~ /^;/ && $orig_match =~ /\&(\#[1-9]\d{1,3}|[a-zA-Z]{2,8})$/) {
336 1         1 $orig_match .= ';';
337 1         4 $cruft =~ s/^;//;
338             }
339              
340 63         239 while( my($open, $close) = each %balanced_cruft ) {
341 315         631 $self->recruft_balanced(\$orig_match, \$cruft, $open, $close);
342             }
343              
344 63 100       180 $self->{end_cruft} = $cruft if $cruft;
345             }
346              
347 280         609 return $orig_match;
348             }
349              
350              
351             sub recruft_balanced {
352 315     315 0 294 my $self = shift;
353 315         335 my($orig_match, $cruft, $open, $close) = @_;
354              
355 315         2560 my $open_count = () = $$orig_match =~ m{\Q$open}g;
356 315         2157 my $close_count = () = $$orig_match =~ m{\Q$close}g;
357              
358 315 100 100     2762 if ( $$cruft =~ /\Q$close\E$/ && $open_count == ( $close_count + 1 ) ) {
359 18         42 $$orig_match .= $close;
360 18         177 $$cruft =~ s/\Q$close\E$//;
361             }
362              
363 315         1190 return;
364             }
365              
366              
367             =item B
368              
369             my $uri = $self->recruft($uri);
370              
371             This method puts back the cruft taken off with decruft(). This is necessary
372             because the cruft is destructively removed from the string before invoking
373             the user's callback, so it has to be put back afterwards.
374              
375             =cut
376              
377             #'#
378              
379             sub recruft {
380 280 50   280 1 689 @_ == 2 || __PACKAGE__->badinvo;
381 280         383 my($self, $uri) = @_;
382              
383 280         1133 return $self->{start_cruft} . $uri . $self->{end_cruft};
384             }
385              
386             =item B
387              
388             my $schemed_uri = $self->schemeless_to_schemed($schemeless_uri);
389              
390             This takes a schemeless URI and returns an absolute, schemed URI. The
391             standard implementation supplies ftp:// for URIs which start with ftp.,
392             and http:// otherwise.
393              
394             =cut
395              
396             sub schemeless_to_schemed {
397 42 50   42 1 106 @_ == 2 || __PACKAGE__->badinvo;
398 42         64 my($self, $uri_cand) = @_;
399              
400 42 100       361 $uri_cand =~ s|^(
401             or $uri_cand =~ s|^(
402              
403 42         108 return $uri_cand;
404             }
405              
406             =item B
407              
408             $obj->is_schemed($uri);
409              
410             Returns whether or not the given URI is schemed or schemeless. True for
411             schemed, false for schemeless.
412              
413             =cut
414              
415             sub is_schemed {
416 2 50   2 1 702 @_ == 2 || __PACKAGE__->badinvo;
417 2         4 my($self, $uri) = @_;
418 2         36 return scalar $uri =~ /^
419             }
420              
421             =item I
422              
423             __PACKAGE__->badinvo($extra_levels, $msg)
424              
425             This is used to complain about bogus subroutine/method invocations.
426             The args are optional.
427              
428             =cut
429              
430             sub badinvo {
431 0     0 1 0 my $package = shift;
432 0 0       0 my $level = @_ ? shift : 0;
433 0 0       0 my $msg = @_ ? " (" . shift() . ")" : '';
434 0         0 my $subname = (caller $level + 1)[3];
435 0         0 croak "Bogus invocation of $subname$msg";
436             }
437              
438             =back
439              
440             =head2 Old Functions
441              
442             The old find_uri() function is still around and it works, but its
443             deprecated.
444              
445             =cut
446              
447             # Old interface.
448             sub find_uris (\$&) {
449 64 50   64 0 132713 @_ == 2 || __PACKAGE__->badinvo;
450 64         151 my($r_text, $callback) = @_;
451              
452 64         1208 my $self = __PACKAGE__->new($callback);
453 64         166 return $self->find($r_text);
454             }
455              
456              
457             =head1 EXAMPLES
458              
459             Store a list of all URIs (normalized) in the document.
460              
461             my @uris;
462             my $finder = URI::Find->new(sub {
463             my($uri) = shift;
464             push @uris, $uri;
465             });
466             $finder->find(\$text);
467              
468             Print the original URI text found and the normalized representation.
469              
470             my $finder = URI::Find->new(sub {
471             my($uri, $orig_uri) = @_;
472             print "The text '$orig_uri' represents '$uri'\n";
473             return $orig_uri;
474             });
475             $finder->find(\$text);
476              
477             Check each URI in document to see if it exists.
478              
479             use LWP::Simple;
480              
481             my $finder = URI::Find->new(sub {
482             my($uri, $orig_uri) = @_;
483             if( head $uri ) {
484             print "$orig_uri is okay\n";
485             }
486             else {
487             print "$orig_uri cannot be found\n";
488             }
489             return $orig_uri;
490             });
491             $finder->find(\$text);
492              
493              
494             Turn plain text into HTML, with each URI found wrapped in an HTML anchor.
495              
496             use CGI qw(escapeHTML);
497             use URI::Find;
498              
499             my $finder = URI::Find->new(sub {
500             my($uri, $orig_uri) = @_;
501             return qq|$orig_uri|;
502             });
503             $finder->find(\$text, \&escapeHTML);
504             print "
$text
";
505              
506             =cut
507              
508              
509             sub _is_uri {
510 280 50   280   620 @_ == 2 || __PACKAGE__->badinvo;
511 280         378 my($self, $r_uri_cand) = @_;
512              
513 280         319 my $uri = $$r_uri_cand;
514              
515             # Translate schemeless to schemed if necessary.
516 280 100 66     642 $uri = $self->schemeless_to_schemed($uri) if
517             $uri =~ $self->schemeless_uri_re and
518             $uri !~ /^
519              
520 280         540 eval {
521 280         1119 $uri = URI->new($uri);
522              
523             # Throw out anything with an invalid scheme.
524 280   100     87351 my $has_invalid_scheme = $uri->isa("URI::_foreign") &&
525             $uri->scheme !~ $extraSchemesRe;
526              
527             # Toss out things like http:// but keep file:///
528 280         9495 my $is_empty = $uri =~ m{^$schemeRe://$};
529              
530 280 100 100     2601 undef $uri if $has_invalid_scheme || $is_empty;
531             };
532              
533 280 100 66     1182 if($@ || !defined $uri) { # leave everything untouched, its not a URI.
534 15         58 return NO;
535             }
536             else { # Its a URI.
537 265         912 return $uri;
538             }
539             }
540              
541              
542             =head1 NOTES
543              
544             Will not find URLs with Internationalized Domain Names or pretty much
545             any non-ascii stuff in them. See
546             L
547              
548              
549             =head1 AUTHOR
550              
551             Michael G Schwern with insight from Uri Gutman,
552             Greg Bacon, Jeff Pinyan, Roderick Schertler and others.
553              
554             Roderick Schertler maintained versions 0.11 to 0.16.
555              
556             Darren Chamberlain wrote urifind.
557              
558              
559             =head1 LICENSE
560              
561             Copyright 2000, 2009-2010, 2014, 2016 by Michael G Schwern Eschwern@pobox.comE.
562              
563             This program is free software; you can redistribute it and/or
564             modify it under the same terms as Perl itself.
565              
566             See F
567              
568             =head1 SEE ALSO
569              
570             L, L, L, RFC 3986 Appendix C
571              
572             =cut
573              
574             1;