File Coverage

blib/lib/URI/Find.pm

Criterion	Covered	Total	%
statement	131	136	96.3
branch	40	56	71.4
condition	20	26	76.9
subroutine	22	23	95.6
pod	11	13	84.6
total	224	254	88.1

line	stmt	bran	cond	sub	pod	time	code
1							# Copyright (c) 2000, 2009 Michael G. Schwern. All rights reserved.
2							# This program is free software; you can redistribute it and/or modify
3							# it under the same terms as Perl itself.
4
5							package URI::Find;
6
7							require 5.006;
8
9	6			6		123039	use strict;
	6					16
	6					340
10	6			6		33	use base qw(Exporter);
	6					11
	6					792
11	6			6		48	use vars qw($VERSION @EXPORT);
	6					9
	6					576
12
13							$VERSION = 20140709;
14							@EXPORT = qw(find_uris);
15
16	6			6		45	use constant YES => (1==1);
	6					11
	6					852
17	6			6		32	use constant NO => !YES;
	6					15
	6					268
18
19	6			6		122	use Carp qw(croak);
	6					12
	6					14148
20
21							require URI;
22
23							my $reserved = q(;/?:@&=+$,[]);
24							my $mark = q(-_.!~*'());
25							my $unreserved = "A-Za-z0-9\Q$mark\E";
26							my $uric = quotemeta($reserved) . '\p{isAlpha}' . $unreserved . "%";
27
28							# URI scheme pattern without the non-alpha numerics.
29							# Those are extremely uncommon and interfere with the match.
30							my($schemeRe) = qr/[a-zA-Z][a-zA-Z0-9\+]*/;
31							my($uricSet) = $uric; # use new set
32
33							# Some schemes which URI.pm does not explicitly support.
34							my $extraSchemesRe = qr{^(?:git\|svn\|ssh\|svn\+ssh)$};
35
36							# We need to avoid picking up 'HTTP::Request::Common' so we have a
37							# subset of uric without a colon ("I have no colon and yet I must poop")
38							my($uricCheat) = __PACKAGE__->uric_set;
39							$uricCheat =~ tr/://d;
40
41							# Identifying characters accidentally picked up with a URI.
42							my($cruftSet) = q{])\},.'";}; #'#
43
44
45							=head1 NAME
46
47							URI::Find - Find URIs in arbitrary text
48
49							=head1 SYNOPSIS
50
51							require URI::Find;
52
53							my $finder = URI::Find->new(\&callback);
54
55							$how_many_found = $finder->find(\$text);
56
57							=head1 DESCRIPTION
58
59							This module does one thing: Finds URIs and URLs in plain text. It
60							finds them quickly and it finds them B (or what URI.pm considers
61							a URI to be.) It only finds URIs which include a scheme (http:// or
62							the like), for something a bit less strict have a look at
63							L.
64
65							For a command-line interface, L is provided.
66
67							=head2 Public Methods
68
69							=over 4
70
71							=item B
72
73							my $finder = URI::Find->new(\&callback);
74
75							Creates a new URI::Find object.
76
77							&callback is a function which is called on each URI found. It is
78							passed two arguments, the first is a URI object representing the URI
79							found. The second is the original text of the URI found. The return
80							value of the callback will replace the original URI in the text.
81
82							=cut
83
84							sub new {
85	208	50		208	1	514939	@_ == 2 \|\| __PACKAGE__->badinvo;
86	208					636	my($proto, $callback) = @_;
87	208		33			1624	my($class) = ref $proto \|\| $proto;
88	208					1328	my $self = bless {}, $class;
89
90	208					1198	$self->{callback} = $callback;
91
92	208					854	return $self;
93							}
94
95							=item B
96
97							my $how_many_found = $finder->find(\$text);
98
99							$text is a string to search and possibly modify with your callback.
100
101							Alternatively, C can be called with a replacement function for
102							the rest of the text:
103
104							use CGI qw(escapeHTML);
105							# ...
106							my $how_many_found = $finder->find(\$text, \&escapeHTML);
107
108							will not only call the callback function for every URL found (and
109							perform the replacement instructions therein), but also run the rest
110							of the text through C. This makes it easier to turn
111							plain text which contains URLs into HTML (see example below).
112
113							=cut
114
115							sub find {
116	211	50	66	211	1	2727	@_ == 2 \|\| @_ == 3 \|\| __PACKAGE__->badinvo;
117	211					378	my($self, $r_text, $escape_func) = @_;
118
119							# Might be slower, but it makes the code simpler
120	211		100	424		1842	$escape_func \|\|= sub { return $_[0] };
	424					1362
121
122							# Store the escape func in the object temporarily for use
123							# by other methods.
124	211					607	local $self->{escape_func} = $escape_func;
125
126	211					1017	$self->{_uris_found} = 0;
127
128							# Yes, evil. Basically, look for something vaguely resembling a URL,
129							# then hand it off to URI for examination. If it passes, throw
130							# it to a callback and put the result in its place.
131	211					1103	local $SIG{__DIE__} = 'DEFAULT';
132	211					668	my $uri_cand;
133							my $uri;
134
135	211					1067	my $uriRe = sprintf '(?:%s\|%s)', $self->uri_re, $self->schemeless_uri_re;
136
137	211			3		46471	$$r_text =~ s{ (.*?) (?:(<(?:URL:)?)(.+?)(>)\|($uriRe)) \| (.+?)$ }{
	3					5599
	3					96
	3					47
138	424					376728	my $replace = '';
139	424	100				1810	if( defined $6 ) {
140	111					322	$replace = $escape_func->($6);
141							}
142							else {
143	313					639	my $maybe_uri = '';
144
145	313	100				1918	$replace = $escape_func->($1) if length $1;
146
147	313	100				1508	if( defined $2 ) {
148	60					268	$maybe_uri = $3;
149	60					89	my $is_uri = do { # Don't alter $1...
150	60					169	$maybe_uri =~ s/\s+//g;
151	60					4712	$maybe_uri =~ /^$uriRe/;
152							};
153
154	60	100				43291	if( $is_uri ) {
155	27					96	$replace .= $escape_func->($2);
156	27					243	$replace .= $self->_uri_filter($maybe_uri);
157	27					90	$replace .= $escape_func->($4);
158							}
159							else {
160							# the whole text inside of the <...> was not a url, but
161							# maybe it has a url (like an HTML link)
162	33					57	my $has_uri = do { # Don't alter $1...
163	33					86	$maybe_uri = $3;
164	33					1154	$maybe_uri =~ /$uriRe/;
165							};
166	33	100				23489	if( $has_uri ) {
167	4					11	my $pre = $2;
168	4					7	my $post = $4;
169	4					6	do { $self->find(\$maybe_uri, $escape_func) };
	4					24
170	4					12	$replace .= $escape_func->($pre);
171	4					16	$replace .= $maybe_uri; # already escaped by find()
172	4					8	$replace .= $escape_func->($post);
173							}
174							else {
175	29					159	$replace .= $escape_func->($2.$3.$4);
176							}
177							}
178							}
179							else {
180	253					1531	$replace .= $self->_uri_filter($5);
181							}
182							}
183
184	424					8719	$replace;
185							}gsex;
186
187	211					9041	return $self->{_uris_found};
188							}
189
190
191							sub _uri_filter {
192	280			280		1318	my($self, $orig_match) = @_;
193
194							# A heuristic. Often you'll see things like:
195							# "I saw this site, http://www.foo.com, and its really neat!"
196							# or "Foo Industries (at http://www.foo.com)"
197							# We want to avoid picking up the trailing paren, period or comma.
198							# Of course, this might wreck a perfectly valid URI, more often than
199							# not it corrects a parse mistake.
200	280					874	$orig_match = $self->decruft($orig_match);
201
202	280					768	my $replacement = '';
203	280	100				960	if( my $uri = $self->_is_uri(\$orig_match) ) {
204							# It's a URI
205	265					2796	$self->{_uris_found}++;
206	265					1062	$replacement = $self->{callback}->($uri, $orig_match);
207							}
208							else {
209							# False alarm
210	15					59	$replacement = $self->{escape_func}->($orig_match);
211							}
212
213							# Return recrufted replacement
214	280					1988	return $self->recruft($replacement);
215							}
216
217
218							=back
219
220							=head2 Protected Methods
221
222							I got a bunch of mail from people asking if I'd add certain features
223							to URI::Find. Most wanted the search to be less restrictive, do more
224							heuristics, etc... Since many of the requests were contradictory, I'm
225							letting people create their own custom subclasses to do what they
226							want.
227
228							The following are methods internal to URI::Find which a subclass can
229							override to change the way URI::Find acts. They are only to be called
230							B a URI::Find subclass. Users of this module are NOT to use
231							these methods.
232
233							=over
234
235							=item B
236
237							my $uri_re = $self->uri_re;
238
239							Returns the regex for finding absolute, schemed URIs
240							(http://www.foo.com and such). This, combined with
241							schemeless_uri_re() is what finds candidate URIs.
242
243							Usually this method does not have to be overridden.
244
245							=cut
246
247							sub uri_re {
248	211	50		211	1	1952	@_ == 1 \|\| __PACKAGE__->badinvo;
249	211					648	my($self) = shift;
250	211					1289	return sprintf '%s:[%s][%s#]*', $schemeRe,
251							$uricCheat,
252							$self->uric_set;
253							}
254
255							=item B
256
257							my $schemeless_re = $self->schemeless_uri_re;
258
259							Returns the regex for finding schemeless URIs (www.foo.com and such) and
260							other things which might be URIs. By default this will match nothing
261							(though it used to try to find schemeless URIs which started with C
262							and C).
263
264							Many people will want to override this method. See L
265							for a subclass does a reasonable job of finding URIs which might be missing
266							the scheme.
267
268							=cut
269
270							sub schemeless_uri_re {
271	313	50		313	1	1100	@_ == 1 \|\| __PACKAGE__->badinvo;
272	313					551	my($self) = shift;
273	313					3948	return qr/\b\B/; # match nothing
274							}
275
276							=item B
277
278							my $uric_set = $self->uric_set;
279
280							Returns a set matching the 'uric' set defined in RFC 2396 suitable for
281							putting into a character set ([]) in a regex.
282
283							You almost never have to override this.
284
285							=cut
286
287							sub uric_set {
288	219	50		219	1	973	@_ == 1 \|\| __PACKAGE__->badinvo;
289	219					1929	return $uricSet;
290							}
291
292							=item B
293
294							my $cruft_set = $self->cruft_set;
295
296							Returns a set of characters which are considered garbage. Used by
297							decruft().
298
299							=cut
300
301							sub cruft_set {
302	2	50		2	1	16	@_ == 1 \|\| __PACKAGE__->badinvo;
303	2					10	return $cruftSet;
304							}
305
306							=item B
307
308							my $uri = $self->decruft($uri);
309
310							Sometimes garbage characters like periods and parenthesis get
311							accidentally matched along with the URI. In order for the URI to be
312							properly identified, it must sometimes be "decrufted", the garbage
313							characters stripped.
314
315							This method takes a candidate URI and strips off any cruft it finds.
316
317							=cut
318
319							my %balanced_cruft = (
320							'(' => ')',
321							'{' => '}',
322							'[' => ']',
323							'"' => '"',
324							q['] => q['],
325							);
326
327							sub decruft {
328	280	50		280	1	984	@_ == 2 \|\| __PACKAGE__->badinvo;
329	280					512	my($self, $orig_match) = @_;
330
331	280					1098	$self->{start_cruft} = '';
332	280					599	$self->{end_cruft} = '';
333
334	280	100				3391	if( $orig_match =~ s/([\Q$cruftSet\E]+)$// ) {
335							# urls can end with HTML entities if found in HTML so let's put back semicolons
336							# if this looks like the case
337	63					132	my $cruft = $1;
338	63	100	66			275	if( $cruft =~ /^;/ && $orig_match =~ /\&(\#[1-9]\d{1,3}\|[a-zA-Z]{2,8})$/) {
339	1					3	$orig_match .= ';';
340	1					4	$cruft =~ s/^;//;
341							}
342
343	63					514	while( my($open, $close) = each %balanced_cruft ) {
344	315					839	$self->recruft_balanced(\$orig_match, \$cruft, $open, $close);
345							}
346
347	63	100				224	$self->{end_cruft} = $cruft if $cruft;
348							}
349
350	280					1366	return $orig_match;
351							}
352
353
354							sub recruft_balanced {
355	315			315	0	390	my $self = shift;
356	315					3706	my($orig_match, $cruft, $open, $close) = @_;
357
358	315					3789	my $open_count = () = $$orig_match =~ m{\Q$open}g;
359	315					3793	my $close_count = () = $$orig_match =~ m{\Q$close}g;
360
361	315	100	100			3835	if ( $$cruft =~ /\Q$close\E$/ && $open_count == ( $close_count + 1 ) ) {
362	18					45	$$orig_match .= $close;
363	18					208	$$cruft =~ s/\Q$close\E$//;
364							}
365
366	315					3170	return;
367							}
368
369
370							=item B
371
372							my $uri = $self->recruft($uri);
373
374							This method puts back the cruft taken off with decruft(). This is necessary
375							because the cruft is destructively removed from the string before invoking
376							the user's callback, so it has to be put back afterwards.
377
378							=cut
379
380							#'#
381
382							sub recruft {
383	280	50		280	1	1004	@_ == 2 \|\| __PACKAGE__->badinvo;
384	280					491	my($self, $uri) = @_;
385
386	280					1655	return $self->{start_cruft} . $uri . $self->{end_cruft};
387							}
388
389							=item B
390
391							my $schemed_uri = $self->schemeless_to_schemed($schemeless_uri);
392
393							This takes a schemeless URI and returns an absolute, schemed URI. The
394							standard implementation supplies ftp:// for URIs which start with ftp.,
395							and http:// otherwise.
396
397							=cut
398
399							sub schemeless_to_schemed {
400	42	50		42	1	305	@_ == 2 \|\| __PACKAGE__->badinvo;
401	42					101	my($self, $uri_cand) = @_;
402
403	42	100				2549	$uri_cand =~ s\|^(
404							or $uri_cand =~ s\|^(
405
406	42					132	return $uri_cand;
407							}
408
409							=item B
410
411							$obj->is_schemed($uri);
412
413							Returns whether or not the given URI is schemed or schemeless. True for
414							schemed, false for schemeless.
415
416							=cut
417
418							sub is_schemed {
419	2	50		2	1	1077	@_ == 2 \|\| __PACKAGE__->badinvo;
420	2					5	my($self, $uri) = @_;
421	2					70	return scalar $uri =~ /^
422							}
423
424							=item I
425
426							__PACKAGE__->badinvo($extra_levels, $msg)
427
428							This is used to complain about bogus subroutine/method invocations.
429							The args are optional.
430
431							=cut
432
433							sub badinvo {
434	0			0	1	0	my $package = shift;
435	0	0				0	my $level = @_ ? shift : 0;
436	0	0				0	my $msg = @_ ? " (" . shift() . ")" : '';
437	0					0	my $subname = (caller $level + 1)[3];
438	0					0	croak "Bogus invocation of $subname$msg";
439							}
440
441							=back
442
443							=head2 Old Functions
444
445							The old find_uri() function is still around and it works, but its
446							deprecated.
447
448							=cut
449
450							# Old interface.
451							sub find_uris (\$&) {
452	64	50		64	0	197543	@_ == 2 \|\| __PACKAGE__->badinvo;
453	64					704	my($r_text, $callback) = @_;
454
455	64					321	my $self = __PACKAGE__->new($callback);
456	64					327	return $self->find($r_text);
457							}
458
459
460							=head1 EXAMPLES
461
462							Store a list of all URIs (normalized) in the document.
463
464							my @uris;
465							my $finder = URI::Find->new(sub {
466							my($uri) = shift;
467							push @uris, $uri;
468							});
469							$finder->find(\$text);
470
471							Print the original URI text found and the normalized representation.
472
473							my $finder = URI::Find->new(sub {
474							my($uri, $orig_uri) = @_;
475							print "The text '$orig_uri' represents '$uri'\n";
476							return $orig_uri;
477							});
478							$finder->find(\$text);
479
480							Check each URI in document to see if it exists.
481
482							use LWP::Simple;
483
484							my $finder = URI::Find->new(sub {
485							my($uri, $orig_uri) = @_;
486							if( head $uri ) {
487							print "$orig_uri is okay\n";
488							}
489							else {
490							print "$orig_uri cannot be found\n";
491							}
492							return $orig_uri;
493							});
494							$finder->find(\$text);
495
496
497							Turn plain text into HTML, with each URI found wrapped in an HTML anchor.
498
499							use CGI qw(escapeHTML);
500							use URI::Find;
501
502							my $finder = URI::Find->new(sub {
503							my($uri, $orig_uri) = @_;
504							return qq\|$orig_uri\|;
505							});
506							$finder->find(\$text, \&escapeHTML);
507							print " $text ";
508
509							=cut
510
511
512							sub _is_uri {
513	280	50		280		772	@_ == 2 \|\| __PACKAGE__->badinvo;
514	280					534	my($self, $r_uri_cand) = @_;
515
516	280					485	my $uri = $$r_uri_cand;
517
518							# Translate schemeless to schemed if necessary.
519	280	100	66			34833	$uri = $self->schemeless_to_schemed($uri) if
520							$uri =~ $self->schemeless_uri_re and
521							$uri !~ /^
522
523	280					769	eval {
524	280					7611	$uri = URI->new($uri);
525
526							# Throw out anything with an invalid scheme.
527	280		100			198230	my $has_invalid_scheme = $uri->isa("URI::_foreign") &&
528							$uri->scheme !~ $extraSchemesRe;
529
530							# Toss out things like http:// but keep file:///
531	280					30761	my $is_empty = $uri =~ m{^$schemeRe://$};
532
533	280	100	100			3935	undef $uri if $has_invalid_scheme \|\| $is_empty;
534							};
535
536	280	100	66			8381	if($@ \|\| !defined $uri) { # leave everything untouched, its not a URI.
537	15					131	return NO;
538							}
539							else { # Its a URI.
540	265					2764	return $uri;
541							}
542							}
543
544
545							=head1 NOTES
546
547							Will not find URLs with Internationalized Domain Names or pretty much
548							any non-ascii stuff in them. See
549							L
550
551
552							=head1 AUTHOR
553
554							Michael G Schwern with insight from Uri Gutman,
555							Greg Bacon, Jeff Pinyan, Roderick Schertler and others.
556
557							Roderick Schertler maintained versions 0.11 to 0.16.
558
559							Darren Chamberlain wrote urifind.
560
561
562							=head1 LICENSE
563
564							Copyright 2000, 2009-2010 by Michael G Schwern Eschwern@pobox.comE.
565
566							This program is free software; you can redistribute it and/or
567							modify it under the same terms as Perl itself.
568
569							See F
570
571							=head1 SEE ALSO
572
573							L, L, L, RFC 3986 Appendix C
574
575							=cut
576
577							1;