File Coverage

blib/lib/Regexp/Common/profanity_us.pm
Criterion Covered Total %
statement 16 17 94.1
branch n/a
condition n/a
subroutine 6 7 85.7
pod 0 1 0.0
total 22 25 88.0


line stmt bran cond sub pod time code
1             package Regexp::Common::profanity_us;
2             BEGIN {
3 1     1   800 $Regexp::Common::profanity_us::VERSION = '4.112150';
4             }
5              
6 1     1   9 use strict;
  1         2  
  1         34  
7 1     1   50 use warnings;
  1         3  
  1         33  
8              
9 1     1   1140 use Data::Dumper;
  1         9102  
  1         92  
10 1     1   798 use Data::Section::Simple qw(get_data_section);
  1         707  
  1         123  
11              
12 1     1   8 use Regexp::Common qw /pattern clean no_defaults/;
  1         2  
  1         16  
13              
14              
15              
16              
17              
18              
19 0     0 0   sub longest_first { length($b) <=> length($a) }
20              
21             my ($profanity, @profanity);
22              
23              
24             my $data = Data::Section::Simple::get_data_section('profanity');
25             #die "data: $data";
26              
27             my @line = split "\n", $data;
28              
29             for (@line) {
30              
31              
32             # warn $_;
33              
34             next if /^#/;
35             next if /^\s*$/;
36              
37              
38              
39             s/^\s+//;
40             s/\s+$//;
41              
42             push @profanity, $_;
43              
44             }
45              
46             #die Dumper("profanity", \@profanity);
47              
48             pattern name => [qw (profanity us normal label -dist=7)],
49             create => sub
50             {
51             my ($self, $flags) = @_;
52             my $word_width = $flags->{-dist};
53             my $any_char = ".{0,$word_width}";
54             @profanity = map { s/-/$any_char/g; $_ } @profanity;
55             $profanity = join '|', @profanity;
56             $profanity = '(?k:' . $profanity . ')';
57             #warn "$profanity";
58             $profanity;
59             },
60             ;
61              
62              
63              
64             1;
65              
66              
67              
68             =pod
69              
70             =head1 NAME
71              
72             Regexp::Common::profanity_us -- provide regexes for U.S. profanity
73              
74             =head1 SYNOPSIS
75              
76             use Regexp::Common qw /profanity_us/;
77              
78             my $RE = $RE{profanity}{us}{normal}{label}{-keep}{-dist=>3};
79              
80             while (<>) {
81             warn "PROFANE" if /$RE/;
82             }
83              
84             Or easier
85              
86             use Regexp::Profanity::US;
87              
88             $profane = profane ($string);
89             @profane = profane_list($string);
90              
91             =head1 OVERVIEW
92              
93             Instead of a dry technical overview, I am going to explain the structure
94             of this module based on its history. I consult at a company that generates
95             customer leads primarily by having websites that attract people (e.g.
96             lowering loan values, selling cars, buying real estate, etc.). For some reason
97             we get more than our fair share of profane leads. For this reason I was told
98             to write a profanity checker.
99              
100             For the data that I was dealing with, the profanity was most often in the
101             email address or in the first or last name, so I naively started filtering
102             profanity with a set of regexps for that sort of data. Note that both names
103             and email addresses are unlike what you are reading now: they are not
104             whitespace-separated text, but are instead labels.
105              
106             Therefore full support for profanity checking should work in 2 entirely
107             different contexts: labels (email, names) and text (what you are reading).
108             Because open-source is driven by demand and I have no need for detecting
109             profanity in text, only C<label> parsing is implemented. And you
110             know the next sentence: "patches welcome" :)
111              
112             =head2 Spelling Variations Dictated by Sound or Sight
113              
114             =head3 Creative use of symbols to spell words (el33t sp3@k)
115              
116             Now, within labels, you can see normal ascii or creative use of symbols:
117              
118             Here are some normal profane labels:
119             suckmycock@isp.com
120             shitonastick
121              
122             And here they are in ascii art:
123             s\/cKmyc0k@aol.com
124             sh|+0naST1ck
125              
126             A CPAN module which does a great job of "drawing words" is
127             L<Lingua::31337>.
128             I thought I knew all of the ways that someone could
129             "inflate" a letter so that dirty words could bypass a profanity checker, but
130             just look at all these:
131              
132             %letter =
133             ( a => [ "4", "@" ],
134             c => "(",
135             e => "3",
136             g => "6",
137             h => [ "|-|", "]-[" ],
138             k => [ "|<", "]{" ],
139             i => "!",
140             l => [ "1", "|" ],
141             m => [ "|V|", "|\\/|" ],
142             n => "|\\|",
143             o => "0",
144             s => [ "5", "Z" ],
145             t => [ "7", "+"],
146             u => "\\_/",
147             v => "\\/",
148             w => [ "vv", "\\/\\/" ],
149             'y' => "j",
150             z => "2",
151             );
152              
153             =head3 Soundex respelling
154              
155             Which of course brings me to the final way to take normal text and vary it
156             for the same meaning: soundex.
157              
158             The way a word sounds can lead to different spellings. For example, we have
159             shitonastick
160              
161             Which we can soundex out as:
162             shitonuhstick
163              
164             Or, given:
165             nigger
166              
167             We can rewrite it as:
168             nigga
169             nigguh
170             niggah
171              
172             There are two CPAN modules, L<Text::Soundex> and
173             L<Text::Metaphone> which do
174             this sort of thing, but after they resolved "shit" and "shot" to the same
175             soundex, I forgot about them :).
176              
177              
178             So to conclude this OVERVIEW, (or is that oV3r\/ieW :), this module does
179             profanity checking for:
180              
181             labels and not text
182              
183             and for:
184              
185             normal and not eleet spelling
186              
187             with a bit of hedging to support soundexing (and only definite obscene
188             words are searched for. Ambiguous / contextual searching is left as an
189             exercise for the reader).
190              
191             In L<Regexp::Common> terminology, which is the infrastructure on which
192             this module is built, we have only the following regexp for your
193             string-matching ecstasy:
194              
195             $RE{profanity}{us}{normal}{label}
196              
197             and patches are welcome for:
198              
199             $RE{profanity}{us}{label}{eleet}
200             $RE{profanity}{us}{text}{normal}
201             $RE{profanity}{us}{text}{eleet}
202              
203             But do note this if you plan to implement I<text> parsing,
204              
205             C<[^:alpha:]> and not C<\b> should be used because C<_> does not form a
206             word boundary and so
207              
208             \bshit\b
209              
210             will match
211              
212             shit head
213              
214             and
215              
216             shit-head
217              
218             but not
219            
220             shit_head
221              
222             Another thing about text is that it may be resolved into labels by splitting
223             on whitespace. Thus, one could have one engine and a different pre-processor.
224              
225             =head1 USAGE
226              
227             Please consult the manual of L<Regexp::Common> for a general description
228             of the works of this interface.
229              
230             Do not use this module directly, but load it via I<Regexp::Common>.
231              
232             This module reads one flag, C<-dist> which is used to set the amount of
233             characters that can appear between components of an obscene phrase.
234             For example
235              
236             suck!!!my!!!cock
237              
238             will match the following regular expression
239              
240             suck-my-cock
241              
242             as long as the flag C<-dist> is set to 3 or greater because this module
243             changes C<-> into C<.{0,$dist}> with C<$dist> defaulting to 7.
244             Why such a large default? It is done so that the profanity list can
245             omit certain words such as my or your. Take this:
246              
247             poop on your face
248              
249             We have the following regular expression
250              
251             poop--face
252              
253             which is transformed to
254              
255             poop.{0,7}.{0,7}face
256              
257             which will match the possible prepositions and adjectives in between
258             "poop" and "face" and also match the hideous term "poopface".
259              
260             =head2 Capturing
261              
262             Under C<-keep> (see L<Regexp::Common>):
263              
264             =over 4
265              
266             =item $1
267              
268             captures the entire word
269              
270             =back
271              
272             =head1 SEE ALSO
273              
274             L<Regexp::Common> for a general description of how to use this interface.
275              
276             L<Regexp::Common::profanity> for a slightly more European set of words.
277              
278             L<Regexp::Profanity::US> for a pair of wrapper functions that use these regexps.
279              
280             =head1 AUTHOR
281              
282             T. M. Brannon, tbone@cpan.org
283              
284             I cannot pay enough thanks to
285              
286             Matthew Simon Cavalletto, evo@cpan.org.
287              
288             who refactored this module completely of his own volition and
289             in spite of his hectic schedule. He turned this module from an
290             unsophisticated hack into something worth others using.
291              
292             Useful brain picking came from William McKee of Knowmad Consulting on the
293             L mailing list.
294              
295             =cut
296              
297              
298             __DATA__