File Coverage

blib/lib/Text/CleanFragment.pm
Criterion Covered Total %
statement 22 22 100.0
branch 2 2 100.0
condition n/a
subroutine 5 5 100.0
pod 1 1 100.0
total 30 30 100.0


line stmt bran cond sub pod time code
1             package Text::CleanFragment;
2 2     2   75360 use strict;
  2         16  
  2         70  
3 2     2   12 use Exporter qw'import';
  2         4  
  2         55  
4 2     2   1009 use Text::Unidecode;
  2         4923  
  2         285  
5              
6             our $VERSION = '0.05';
7             our @EXPORT = (qw(clean_fragment));
8              
9             =head1 NAME
10              
11             =encoding utf8
12              
13             Text::CleanFragment - clean up text to use as URL fragment or filename
14              
15             =head1 SYNOPSIS
16              
17             my $title = "Do p\x{00FC}t into/URL's?";
18             my $id = 42;
19             my $url = join "/",
20             $id,
21             clean_fragment( $title );
22             # 42/Do_put_this_into_URLs
23              
24             =head1 DESCRIPTION
25              
26             This module downgrades strings of text to match
27              
28             /^[-._A-Za-z0-9]*$/
29              
30             or, to be more exact
31              
32             /^([-.A-Za-z0-9]([-._A-Za-z0-9]*[-.A-Za-z0-9])?)?$/
33              
34             This makes the return values safe to be used as URL fragments
35             or as file names on many file systems where whitespace
36             and characters outside of the Latin alphabet are undesired
37             or problematic.
38              
39             =head1 FUNCTIONS
40              
41             =head2 C<< clean_fragment( @fragments ) >>
42              
43             my $url_title = join("_", clean_fragment("Ümloud vs. ß",'by',"Grégory"));
44             # Umloud_vs._ss_by_Gregory
45              
46             Returns a cleaned up list of elements. The input elements
47             are expected to be encoded as Unicode strings. Decode them using
48             L if you read the fragments as file names from the filesystem.
49              
50             The operations performed are:
51              
52             =over 4
53              
54             =item *
55              
56             Use L to downgrade the text from Unicode to 7-bit ASCII.
57              
58             =item *
59              
60             Eliminate single and double quotes, apostrophes.
61              
62             =item *
63              
64             Replace all non-letters, non-digits by underscores, including whitespace
65             and control characters.
66              
67             =item *
68              
69             Squash dashes to a single dash
70              
71             =item *
72              
73             Squash C<_-_> and C<_-_(-_)+> to -
74              
75             =item *
76              
77             Eliminate leading underscores
78              
79             =item *
80              
81             Eliminate trailing underscores
82              
83             =item *
84              
85             Eliminate underscores before - or .
86              
87             =back
88              
89             In scalar context, returns the first element of the cleaned up list.
90              
91             =cut
92              
93             sub clean_fragment {
94             # make uri-sane filenames
95             # We assume Unicode on input.
96              
97             # First, downgrade to ASCII chars (or transliterate if possible)
98 67     67 1 71123 @_ = unidecode(@_);
99              
100 67         7042 for( @_ ) {
101 2     2   15 tr/['"\x{2019}]//d; # Eliminate apostrophes
  2         4  
  2         25  
  68         259  
102 68         417 s/[^a-zA-Z0-9.-]+/_/g; # Replace all non-ascii by underscores, including whitespace
103 68         190 s/-+/-/g; # Squash dashes
104 68         154 s/_(?:-_)+/-/g; # Squash _-_ and _-_-_ to -
105 68         164 s/^[-_]+//; # Eliminate leading underscores
106 68         215 s/[-_]+$//; # Eliminate trailing underscores
107 68         238 s/_(\W)/$1/; # No underscore before - or .
108             };
109 67 100       329 wantarray ? @_ : $_[0];
110             };
111              
112             1;
113              
114             __END__