File Coverage

blib/lib/URI/Find.pm

Criterion	Covered	Total	%
statement	134	139	96.4
branch	40	56	71.4
condition	20	26	76.9
subroutine	23	24	95.8
pod	11	13	84.6
total	228	258	88.3

line	stmt	bran	cond	sub	pod	time	code
1							package URI::Find;
2
3							require 5.006;
4
5	6			6		67870	use strict;
	6					10
	6					155
6	6			6		21	use warnings;
	6					8
	6					170
7	6			6		28	use base qw(Exporter);
	6					7
	6					518
8	6			6		25	use vars qw($VERSION @EXPORT);
	6					5
	6					440
9
10							$VERSION = 20160806;
11							@EXPORT = qw(find_uris);
12
13	6			6		27	use constant YES => (1==1);
	6					7
	6					407
14	6			6		25	use constant NO => !YES;
	6					6
	6					227
15
16	6			6		22	use Carp qw(croak);
	6					7
	6					8926
17
18							require URI;
19
20							my $reserved = q(;/?:@&=+$,[]);
21							my $mark = q(-_.!~*'());
22							my $unreserved = "A-Za-z0-9\Q$mark\E";
23							my $uric = quotemeta($reserved) . '\p{isAlpha}' . $unreserved . "%";
24
25							# URI scheme pattern without the non-alpha numerics.
26							# Those are extremely uncommon and interfere with the match.
27							my($schemeRe) = qr/[a-zA-Z][a-zA-Z0-9\+]*/;
28							my($uricSet) = $uric; # use new set
29
30							# Some schemes which URI.pm does not explicitly support.
31							my $extraSchemesRe = qr{^(?:git\|svn\|ssh\|svn\+ssh)$};
32
33							# We need to avoid picking up 'HTTP::Request::Common' so we have a
34							# subset of uric without a colon ("I have no colon and yet I must poop")
35							my($uricCheat) = __PACKAGE__->uric_set;
36							$uricCheat =~ tr/://d;
37
38							# Identifying characters accidentally picked up with a URI.
39							my($cruftSet) = q{])\},.'";}; #'#
40
41
42							=head1 NAME
43
44							URI::Find - Find URIs in arbitrary text
45
46							=head1 SYNOPSIS
47
48							require URI::Find;
49
50							my $finder = URI::Find->new(\&callback);
51
52							$how_many_found = $finder->find(\$text);
53
54							=head1 DESCRIPTION
55
56							This module does one thing: Finds URIs and URLs in plain text. It
57							finds them quickly and it finds them B (or what URI.pm considers
58							a URI to be.) It only finds URIs which include a scheme (http:// or
59							the like), for something a bit less strict have a look at
60							L.
61
62							For a command-line interface, L is provided.
63
64							=head2 Public Methods
65
66							=over 4
67
68							=item B
69
70							my $finder = URI::Find->new(\&callback);
71
72							Creates a new URI::Find object.
73
74							&callback is a function which is called on each URI found. It is
75							passed two arguments, the first is a URI object representing the URI
76							found. The second is the original text of the URI found. The return
77							value of the callback will replace the original URI in the text.
78
79							=cut
80
81							sub new {
82	208	50		208	1	216205	@_ == 2 \|\| __PACKAGE__->badinvo;
83	208					275	my($proto, $callback) = @_;
84	208		33			893	my($class) = ref $proto \|\| $proto;
85	208					392	my $self = bless {}, $class;
86
87	208					437	$self->{callback} = $callback;
88
89	208					349	return $self;
90							}
91
92							=item B
93
94							my $how_many_found = $finder->find(\$text);
95
96							$text is a string to search and possibly modify with your callback.
97
98							Alternatively, C can be called with a replacement function for
99							the rest of the text:
100
101							use CGI qw(escapeHTML);
102							# ...
103							my $how_many_found = $finder->find(\$text, \&escapeHTML);
104
105							will not only call the callback function for every URL found (and
106							perform the replacement instructions therein), but also run the rest
107							of the text through C. This makes it easier to turn
108							plain text which contains URLs into HTML (see example below).
109
110							=cut
111
112							sub find {
113	211	50	66	211	1	876	@_ == 2 \|\| @_ == 3 \|\| __PACKAGE__->badinvo;
114	211					225	my($self, $r_text, $escape_func) = @_;
115
116							# Might be slower, but it makes the code simpler
117	211		100	422		879	$escape_func \|\|= sub { return $_[0] };
	422					1003
118
119							# Store the escape func in the object temporarily for use
120							# by other methods.
121	211					409	local $self->{escape_func} = $escape_func;
122
123	211					307	$self->{_uris_found} = 0;
124
125							# Yes, evil. Basically, look for something vaguely resembling a URL,
126							# then hand it off to URI for examination. If it passes, throw
127							# it to a callback and put the result in its place.
128	211					768	local $SIG{__DIE__} = 'DEFAULT';
129	211					215	my $uri_cand;
130							my $uri;
131
132	211					431	my $uriRe = sprintf '(?:%s\|%s)', $self->uri_re, $self->schemeless_uri_re;
133
134	211			3		21812	$$r_text =~ s{ (.*?) (?:(<(?:URL:)?)(.+?)(>)\|($uriRe)) \| (.+?)$ }{
	3					2231
	3					27
	3					39
135	422					167140	my $replace = '';
136	422	100				1032	if( defined $6 ) {
137	109					194	$replace = $escape_func->($6);
138							}
139							else {
140	313					351	my $maybe_uri = '';
141
142	313	100				1268	$replace = $escape_func->($1) if length $1;
143
144	313	100				763	if( defined $2 ) {
145	60					107	$maybe_uri = $3;
146	60					70	my $is_uri = do { # Don't alter $1...
147	60					144	$maybe_uri =~ s/\s+//g;
148	60					2601	$maybe_uri =~ /^$uriRe/;
149							};
150
151	60	100				16785	if( $is_uri ) {
152	27					78	$replace .= $escape_func->($2);
153	27					96	$replace .= $self->_uri_filter($maybe_uri);
154	27					79	$replace .= $escape_func->($4);
155							}
156							else {
157							# the whole text inside of the <...> was not a url, but
158							# maybe it has a url (like an HTML link)
159	33					37	my $has_uri = do { # Don't alter $1...
160	33					62	$maybe_uri = $3;
161	33					760	$maybe_uri =~ /$uriRe/;
162							};
163	33	100				4228	if( $has_uri ) {
164	4					7	my $pre = $2;
165	4					6	my $post = $4;
166	4					3	do { $self->find(\$maybe_uri, $escape_func) };
	4					18
167	4					8	$replace .= $escape_func->($pre);
168	4					8	$replace .= $maybe_uri; # already escaped by find()
169	4					7	$replace .= $escape_func->($post);
170							}
171							else {
172	29					132	$replace .= $escape_func->($2.$3.$4);
173							}
174							}
175							}
176							else {
177	253					656	$replace .= $self->_uri_filter($5);
178							}
179							}
180
181	422					4756	$replace;
182							}gsex;
183
184	211					3317	return $self->{_uris_found};
185							}
186
187
188							sub _uri_filter {
189	280			280		585	my($self, $orig_match) = @_;
190
191							# A heuristic. Often you'll see things like:
192							# "I saw this site, http://www.foo.com, and its really neat!"
193							# or "Foo Industries (at http://www.foo.com)"
194							# We want to avoid picking up the trailing paren, period or comma.
195							# Of course, this might wreck a perfectly valid URI, more often than
196							# not it corrects a parse mistake.
197	280					563	$orig_match = $self->decruft($orig_match);
198
199	280					304	my $replacement = '';
200	280	100				670	if( my $uri = $self->_is_uri(\$orig_match) ) {
201							# It's a URI
202	265					1525	$self->{_uris_found}++;
203	265					690	$replacement = $self->{callback}->($uri, $orig_match);
204							}
205							else {
206							# False alarm
207	15					48	$replacement = $self->{escape_func}->($orig_match);
208							}
209
210							# Return recrufted replacement
211	280					1594	return $self->recruft($replacement);
212							}
213
214
215							=back
216
217							=head2 Protected Methods
218
219							I got a bunch of mail from people asking if I'd add certain features
220							to URI::Find. Most wanted the search to be less restrictive, do more
221							heuristics, etc... Since many of the requests were contradictory, I'm
222							letting people create their own custom subclasses to do what they
223							want.
224
225							The following are methods internal to URI::Find which a subclass can
226							override to change the way URI::Find acts. They are only to be called
227							B a URI::Find subclass. Users of this module are NOT to use
228							these methods.
229
230							=over
231
232							=item B
233
234							my $uri_re = $self->uri_re;
235
236							Returns the regex for finding absolute, schemed URIs
237							(http://www.foo.com and such). This, combined with
238							schemeless_uri_re() is what finds candidate URIs.
239
240							Usually this method does not have to be overridden.
241
242							=cut
243
244							sub uri_re {
245	211	50		211	1	425	@_ == 1 \|\| __PACKAGE__->badinvo;
246	211					239	my($self) = shift;
247	211					456	return sprintf '%s:[%s][%s#]*', $schemeRe,
248							$uricCheat,
249							$self->uric_set;
250							}
251
252							=item B
253
254							my $schemeless_re = $self->schemeless_uri_re;
255
256							Returns the regex for finding schemeless URIs (www.foo.com and such) and
257							other things which might be URIs. By default this will match nothing
258							(though it used to try to find schemeless URIs which started with C
259							and C).
260
261							Many people will want to override this method. See L
262							for a subclass does a reasonable job of finding URIs which might be missing
263							the scheme.
264
265							=cut
266
267							sub schemeless_uri_re {
268	313	50		313	1	724	@_ == 1 \|\| __PACKAGE__->badinvo;
269	313					334	my($self) = shift;
270	313					2501	return qr/\b\B/; # match nothing
271							}
272
273							=item B
274
275							my $uric_set = $self->uric_set;
276
277							Returns a set matching the 'uric' set defined in RFC 2396 suitable for
278							putting into a character set ([]) in a regex.
279
280							You almost never have to override this.
281
282							=cut
283
284							sub uric_set {
285	219	50		219	1	443	@_ == 1 \|\| __PACKAGE__->badinvo;
286	219					1214	return $uricSet;
287							}
288
289							=item B
290
291							my $cruft_set = $self->cruft_set;
292
293							Returns a set of characters which are considered garbage. Used by
294							decruft().
295
296							=cut
297
298							sub cruft_set {
299	2	50		2	1	9	@_ == 1 \|\| __PACKAGE__->badinvo;
300	2					7	return $cruftSet;
301							}
302
303							=item B
304
305							my $uri = $self->decruft($uri);
306
307							Sometimes garbage characters like periods and parenthesis get
308							accidentally matched along with the URI. In order for the URI to be
309							properly identified, it must sometimes be "decrufted", the garbage
310							characters stripped.
311
312							This method takes a candidate URI and strips off any cruft it finds.
313
314							=cut
315
316							my %balanced_cruft = (
317							'(' => ')',
318							'{' => '}',
319							'[' => ']',
320							'"' => '"',
321							q['] => q['],
322							);
323
324							sub decruft {
325	280	50		280	1	635	@_ == 2 \|\| __PACKAGE__->badinvo;
326	280					390	my($self, $orig_match) = @_;
327
328	280					526	$self->{start_cruft} = '';
329	280					445	$self->{end_cruft} = '';
330
331	280	100				2203	if( $orig_match =~ s/([\Q$cruftSet\E]+)$// ) {
332							# urls can end with HTML entities if found in HTML so let's put back semicolons
333							# if this looks like the case
334	63					125	my $cruft = $1;
335	63	100	66			219	if( $cruft =~ /^;/ && $orig_match =~ /\&(\#[1-9]\d{1,3}\|[a-zA-Z]{2,8})$/) {
336	1					1	$orig_match .= ';';
337	1					4	$cruft =~ s/^;//;
338							}
339
340	63					239	while( my($open, $close) = each %balanced_cruft ) {
341	315					631	$self->recruft_balanced(\$orig_match, \$cruft, $open, $close);
342							}
343
344	63	100				180	$self->{end_cruft} = $cruft if $cruft;
345							}
346
347	280					609	return $orig_match;
348							}
349
350
351							sub recruft_balanced {
352	315			315	0	294	my $self = shift;
353	315					335	my($orig_match, $cruft, $open, $close) = @_;
354
355	315					2560	my $open_count = () = $$orig_match =~ m{\Q$open}g;
356	315					2157	my $close_count = () = $$orig_match =~ m{\Q$close}g;
357
358	315	100	100			2762	if ( $$cruft =~ /\Q$close\E$/ && $open_count == ( $close_count + 1 ) ) {
359	18					42	$$orig_match .= $close;
360	18					177	$$cruft =~ s/\Q$close\E$//;
361							}
362
363	315					1190	return;
364							}
365
366
367							=item B
368
369							my $uri = $self->recruft($uri);
370
371							This method puts back the cruft taken off with decruft(). This is necessary
372							because the cruft is destructively removed from the string before invoking
373							the user's callback, so it has to be put back afterwards.
374
375							=cut
376
377							#'#
378
379							sub recruft {
380	280	50		280	1	689	@_ == 2 \|\| __PACKAGE__->badinvo;
381	280					383	my($self, $uri) = @_;
382
383	280					1133	return $self->{start_cruft} . $uri . $self->{end_cruft};
384							}
385
386							=item B
387
388							my $schemed_uri = $self->schemeless_to_schemed($schemeless_uri);
389
390							This takes a schemeless URI and returns an absolute, schemed URI. The
391							standard implementation supplies ftp:// for URIs which start with ftp.,
392							and http:// otherwise.
393
394							=cut
395
396							sub schemeless_to_schemed {
397	42	50		42	1	106	@_ == 2 \|\| __PACKAGE__->badinvo;
398	42					64	my($self, $uri_cand) = @_;
399
400	42	100				361	$uri_cand =~ s\|^(
401							or $uri_cand =~ s\|^(
402
403	42					108	return $uri_cand;
404							}
405
406							=item B
407
408							$obj->is_schemed($uri);
409
410							Returns whether or not the given URI is schemed or schemeless. True for
411							schemed, false for schemeless.
412
413							=cut
414
415							sub is_schemed {
416	2	50		2	1	702	@_ == 2 \|\| __PACKAGE__->badinvo;
417	2					4	my($self, $uri) = @_;
418	2					36	return scalar $uri =~ /^
419							}
420
421							=item I
422
423							__PACKAGE__->badinvo($extra_levels, $msg)
424
425							This is used to complain about bogus subroutine/method invocations.
426							The args are optional.
427
428							=cut
429
430							sub badinvo {
431	0			0	1	0	my $package = shift;
432	0	0				0	my $level = @_ ? shift : 0;
433	0	0				0	my $msg = @_ ? " (" . shift() . ")" : '';
434	0					0	my $subname = (caller $level + 1)[3];
435	0					0	croak "Bogus invocation of $subname$msg";
436							}
437
438							=back
439
440							=head2 Old Functions
441
442							The old find_uri() function is still around and it works, but its
443							deprecated.
444
445							=cut
446
447							# Old interface.
448							sub find_uris (\$&) {
449	64	50		64	0	132713	@_ == 2 \|\| __PACKAGE__->badinvo;
450	64					151	my($r_text, $callback) = @_;
451
452	64					1208	my $self = __PACKAGE__->new($callback);
453	64					166	return $self->find($r_text);
454							}
455
456
457							=head1 EXAMPLES
458
459							Store a list of all URIs (normalized) in the document.
460
461							my @uris;
462							my $finder = URI::Find->new(sub {
463							my($uri) = shift;
464							push @uris, $uri;
465							});
466							$finder->find(\$text);
467
468							Print the original URI text found and the normalized representation.
469
470							my $finder = URI::Find->new(sub {
471							my($uri, $orig_uri) = @_;
472							print "The text '$orig_uri' represents '$uri'\n";
473							return $orig_uri;
474							});
475							$finder->find(\$text);
476
477							Check each URI in document to see if it exists.
478
479							use LWP::Simple;
480
481							my $finder = URI::Find->new(sub {
482							my($uri, $orig_uri) = @_;
483							if( head $uri ) {
484							print "$orig_uri is okay\n";
485							}
486							else {
487							print "$orig_uri cannot be found\n";
488							}
489							return $orig_uri;
490							});
491							$finder->find(\$text);
492
493
494							Turn plain text into HTML, with each URI found wrapped in an HTML anchor.
495
496							use CGI qw(escapeHTML);
497							use URI::Find;
498
499							my $finder = URI::Find->new(sub {
500							my($uri, $orig_uri) = @_;
501							return qq\|$orig_uri\|;
502							});
503							$finder->find(\$text, \&escapeHTML);
504							print " $text ";
505
506							=cut
507
508
509							sub _is_uri {
510	280	50		280		620	@_ == 2 \|\| __PACKAGE__->badinvo;
511	280					378	my($self, $r_uri_cand) = @_;
512
513	280					319	my $uri = $$r_uri_cand;
514
515							# Translate schemeless to schemed if necessary.
516	280	100	66			642	$uri = $self->schemeless_to_schemed($uri) if
517							$uri =~ $self->schemeless_uri_re and
518							$uri !~ /^
519
520	280					540	eval {
521	280					1119	$uri = URI->new($uri);
522
523							# Throw out anything with an invalid scheme.
524	280		100			87351	my $has_invalid_scheme = $uri->isa("URI::_foreign") &&
525							$uri->scheme !~ $extraSchemesRe;
526
527							# Toss out things like http:// but keep file:///
528	280					9495	my $is_empty = $uri =~ m{^$schemeRe://$};
529
530	280	100	100			2601	undef $uri if $has_invalid_scheme \|\| $is_empty;
531							};
532
533	280	100	66			1182	if($@ \|\| !defined $uri) { # leave everything untouched, its not a URI.
534	15					58	return NO;
535							}
536							else { # Its a URI.
537	265					912	return $uri;
538							}
539							}
540
541
542							=head1 NOTES
543
544							Will not find URLs with Internationalized Domain Names or pretty much
545							any non-ascii stuff in them. See
546							L
547
548
549							=head1 AUTHOR
550
551							Michael G Schwern with insight from Uri Gutman,
552							Greg Bacon, Jeff Pinyan, Roderick Schertler and others.
553
554							Roderick Schertler maintained versions 0.11 to 0.16.
555
556							Darren Chamberlain wrote urifind.
557
558
559							=head1 LICENSE
560
561							Copyright 2000, 2009-2010, 2014, 2016 by Michael G Schwern Eschwern@pobox.comE.
562
563							This program is free software; you can redistribute it and/or
564							modify it under the same terms as Perl itself.
565
566							See F
567
568							=head1 SEE ALSO
569
570							L, L, L, RFC 3986 Appendix C
571
572							=cut
573
574							1;