File Coverage

blib/lib/Unicode/Homoglyph/Replace.pm
Criterion Covered Total %
statement 31 31 100.0
branch 4 4 100.0
condition 2 3 66.6
subroutine 8 8 100.0
pod 0 2 0.0
total 45 48 93.7


line stmt bran cond sub pod time code
1             package Unicode::Homoglyph::Replace;
2              
3 2     2   136646 use 5.008;
  2         19  
4 2     2   11 use strict;
  2         4  
  2         57  
5 2     2   9 use warnings;
  2         4  
  2         76  
6 2     2   656 use utf8;
  2         19  
  2         9  
7              
8 2     2   66 use Exporter qw(import);
  2         4  
  2         4265  
9              
10             our @EXPORT_OK = qw(replace_homoglyphs disguise);
11              
12             =head1 NAME
13              
14             Unicode::Homoglyph::Replace - replace homoglyphs with their ASCII lookalike equivalents
15              
16             =cut
17              
18             our $VERSION = '0.01';
19              
20              
21             =head1 SYNOPSIS
22              
23             use Unicode::Homoglyph::Replace qw(replace_homoglyphs);
24              
25             my $replaced = replace_homoglyphs("...");
26             ...
27              
28              
29              
30             =head1 DESCRIPTION
31              
32             Unicode has various homoglyphs - characters which look the same or mostly the
33             same, but are different characters.
34              
35             If you're trying to filter input in some way, but support Unicode text, then
36             such homoglyphs can be used to get past your filters. For instance, there are
37             B<eleven> other characters that look like a colon.
38              
39             So, if someone wants to be a ⅾⅰⅽk to bypass your filters, they can replace some
40             characters with look-alike (or at least look-similar) characters which your
41             profanity / spam filters won't recognise. (That example there was
42             C<\x{217E}\x{2170}\x{217D}k> - i.e. the characters SMALL ROMAN NUMERAL
43             FIVE HUNDRED, SMALL ROMAN NUMERAL ONE, SMALL ROMAN NUMERAL ONE HUNDRED,
44             and a "k".)
45              
46             =cut
47              
# Table of homoglyphs, keyed by the ASCII character each group resembles.
#
# The list was lifted from Unicode::Homoglyph and regrouped so that every
# entry records which ASCII character it is a lookalike for.  (It strikes me
# as very odd that the original version didn't do that...)
#
# Each value is an array reference whose first element is the ASCII
# character itself, followed by its non-ASCII lookalikes.  No character is
# listed twice within a group, so disguise() picks uniformly among the
# alternatives.
our %homoglyphs = (
    " " => [
        "\x{0020}", # SPACE
        "\x{00A0}", # NO-BREAK SPACE
        "\x{2000}", # EN QUAD
        "\x{2001}", # EM QUAD
        "\x{2002}", # EN SPACE
        "\x{2003}", # EM SPACE
        "\x{2004}", # THREE-PER-EM SPACE
        "\x{2005}", # FOUR-PER-EM SPACE
        "\x{2006}", # SIX-PER-EM SPACE
        "\x{2007}", # FIGURE SPACE
        "\x{2008}", # PUNCTUATION SPACE
        "\x{2009}", # THIN SPACE
        "\x{200A}", # HAIR SPACE
        "\x{202F}", # NARROW NO-BREAK SPACE
        "\x{205F}", # MEDIUM MATHEMATICAL SPACE
    ],

    "!" => [
        "\x{0021}", # EXCLAMATION MARK
        "\x{01C3}", # LATIN LETTER RETROFLEX CLICK
        "\x{2D51}", # TIFINAGH LETTER TUAREG YANG
        "\x{FE15}", # PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK
        "\x{FE57}", # SMALL EXCLAMATION MARK
        "\x{FF01}", # FULLWIDTH EXCLAMATION MARK
    ],

    "\"" => [
        "\x{0022}", # QUOTATION MARK
        "\x{FF02}", # FULLWIDTH QUOTATION MARK
    ],

    "#" => [
        "\x{0023}", # NUMBER SIGN
        "\x{FE5F}", # SMALL NUMBER SIGN
        "\x{FF03}", # FULLWIDTH NUMBER SIGN
    ],

    "\$" => [
        "\x{0024}", # DOLLAR SIGN
        "\x{FE69}", # SMALL DOLLAR SIGN
        "\x{FF04}", # FULLWIDTH DOLLAR SIGN
    ],

    "%" => [
        "\x{0025}", # PERCENT SIGN
        "\x{066A}", # ARABIC PERCENT SIGN
        "\x{2052}", # COMMERCIAL MINUS SIGN
        "\x{FE6A}", # SMALL PERCENT SIGN
        "\x{FF05}", # FULLWIDTH PERCENT SIGN
    ],

    "&" => [
        "\x{0026}", # AMPERSAND
        "\x{FE60}", # SMALL AMPERSAND
        "\x{FF06}", # FULLWIDTH AMPERSAND
    ],

    "'" => [
        "\x{0027}", # APOSTROPHE
        "\x{02B9}", # MODIFIER LETTER PRIME
        "\x{0374}", # GREEK NUMERAL SIGN
        "\x{FF07}", # FULLWIDTH APOSTROPHE
    ],

    "(" => [
        "\x{0028}", # LEFT PARENTHESIS
        "\x{FE59}", # SMALL LEFT PARENTHESIS
        "\x{FF08}", # FULLWIDTH LEFT PARENTHESIS
    ],

    ")" => [
        "\x{0029}", # RIGHT PARENTHESIS
        "\x{FF09}", # FULLWIDTH RIGHT PARENTHESIS
        "\x{FE5A}", # SMALL RIGHT PARENTHESIS
    ],

    "*" => [
        "\x{002A}", # ASTERISK
        "\x{22C6}", # STAR OPERATOR
        "\x{FE61}", # SMALL ASTERISK
        "\x{FF0A}", # FULLWIDTH ASTERISK
    ],

    "+" => [
        "\x{002B}", # PLUS SIGN
        "\x{16ED}", # RUNIC CROSS PUNCTUATION
        "\x{FE62}", # SMALL PLUS SIGN
        "\x{FF0B}", # FULLWIDTH PLUS SIGN
    ],

    "," => [
        "\x{002C}", # COMMA
        "\x{02CF}", # MODIFIER LETTER LOW ACUTE ACCENT
        "\x{16E7}", # RUNIC LETTER SHORT-TWIG-YR
        "\x{201A}", # SINGLE LOW-9 QUOTATION MARK
        "\x{FF0C}", # FULLWIDTH COMMA
    ],

    "-" => [
        "\x{002D}", # HYPHEN-MINUS
        "\x{02D7}", # MODIFIER LETTER MINUS SIGN
        "\x{2212}", # MINUS SIGN
        "\x{23BC}", # HORIZONTAL SCAN LINE-7
        "\x{2574}", # BOX DRAWINGS LIGHT LEFT
        "\x{FE63}", # SMALL HYPHEN-MINUS
        "\x{FF0D}", # FULLWIDTH HYPHEN-MINUS
    ],

    "." => [
        "\x{002E}", # FULL STOP
        "\x{2024}", # ONE DOT LEADER
        "\x{FF0E}", # FULLWIDTH FULL STOP
    ],

    "/" => [
        "\x{002F}", # SOLIDUS
        "\x{FF0F}", # FULLWIDTH SOLIDUS
        "\x{1735}", # PHILIPPINE SINGLE PUNCTUATION
        "\x{2044}", # FRACTION SLASH
        "\x{2215}", # DIVISION SLASH
        "\x{29F8}", # BIG SOLIDUS
    ],

    "2" => [
        "\x{0032}", # DIGIT TWO
        "\x{14BF}", # CANADIAN SYLLABICS SAYISI M
    ],

    "3" => [
        "\x{0033}", # DIGIT THREE
        "\x{01B7}", # LATIN CAPITAL LETTER EZH
        "\x{2128}", # BLACK-LETTER CAPITAL Z
    ],

    "4" => [
        "\x{0034}", # DIGIT FOUR
        "\x{13CE}", # CHEROKEE LETTER SE
    ],

    "6" => [
        "\x{0036}", # DIGIT SIX
        "\x{13EE}", # CHEROKEE LETTER WV
    ],

    "9" => [
        "\x{0039}", # DIGIT NINE
        "\x{13ED}", # CHEROKEE LETTER WU
    ],

    ":" => [
        "\x{003A}", # COLON
        "\x{02D0}", # MODIFIER LETTER TRIANGULAR COLON
        "\x{02F8}", # MODIFIER LETTER RAISED COLON
        "\x{0589}", # ARMENIAN FULL STOP
        "\x{1361}", # ETHIOPIC WORDSPACE
        "\x{16EC}", # RUNIC MULTIPLE PUNCTUATION
        "\x{205A}", # TWO DOT PUNCTUATION
        "\x{2236}", # RATIO
        "\x{2806}", # BRAILLE PATTERN DOTS-23
        "\x{FE13}", # PRESENTATION FORM FOR VERTICAL COLON
        "\x{FE55}", # SMALL COLON
        "\x{FF1A}", # FULLWIDTH COLON
    ],

    ";" => [
        "\x{003B}", # SEMICOLON
        "\x{037E}", # GREEK QUESTION MARK
        "\x{FE14}", # PRESENTATION FORM FOR VERTICAL SEMICOLON
        "\x{FE54}", # SMALL SEMICOLON
        "\x{FF1B}", # FULLWIDTH SEMICOLON
    ],

    "<" => [
        "\x{003C}", # LESS-THAN SIGN
        "\x{02C2}", # MODIFIER LETTER LEFT ARROWHEAD
        "\x{2039}", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        "\x{227A}", # PRECEDES
        "\x{276E}", # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
        "\x{2D66}", # TIFINAGH LETTER YE
        "\x{FE64}", # SMALL LESS-THAN SIGN
        "\x{FF1C}", # FULLWIDTH LESS-THAN SIGN
    ],

    "=" => [
        "\x{003D}", # EQUALS SIGN
        "\x{2550}", # BOX DRAWINGS DOUBLE HORIZONTAL
        "\x{268C}", # DIGRAM FOR GREATER YANG
        "\x{FE66}", # SMALL EQUALS SIGN
        "\x{FF1D}", # FULLWIDTH EQUALS SIGN
    ],

    ">" => [
        "\x{003E}", # GREATER-THAN SIGN
        "\x{02C3}", # MODIFIER LETTER RIGHT ARROWHEAD
        "\x{203A}", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        "\x{227B}", # SUCCEEDS
        "\x{276F}", # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
        "\x{FE65}", # SMALL GREATER-THAN SIGN
        "\x{FF1E}", # FULLWIDTH GREATER-THAN SIGN
    ],

    "?" => [
        "\x{003F}", # QUESTION MARK
        "\x{FE16}", # PRESENTATION FORM FOR VERTICAL QUESTION MARK
        "\x{FE56}", # SMALL QUESTION MARK
        "\x{FF1F}", # FULLWIDTH QUESTION MARK
    ],

    "\@" => [
        "\x{0040}", # COMMERCIAL AT
        "\x{FE6B}", # SMALL COMMERCIAL AT
        "\x{FF20}", # FULLWIDTH COMMERCIAL AT
    ],

    "A" => [
        "\x{0041}", # LATIN CAPITAL LETTER A
        "\x{0391}", # GREEK CAPITAL LETTER ALPHA
        "\x{0410}", # CYRILLIC CAPITAL LETTER A
        "\x{13AA}", # CHEROKEE LETTER GO
    ],

    "B" => [
        "\x{0042}", # LATIN CAPITAL LETTER B
        "\x{0392}", # GREEK CAPITAL LETTER BETA
        "\x{0412}", # CYRILLIC CAPITAL LETTER VE
        "\x{13F4}", # CHEROKEE LETTER YV
        "\x{15F7}", # CANADIAN SYLLABICS CARRIER KHE
        "\x{2C82}", # COPTIC CAPITAL LETTER VIDA
    ],

    "C" => [
        "\x{0043}", # LATIN CAPITAL LETTER C
        "\x{03F9}", # GREEK CAPITAL LUNATE SIGMA SYMBOL
        "\x{0421}", # CYRILLIC CAPITAL LETTER ES
        "\x{13DF}", # CHEROKEE LETTER TLI
        "\x{216D}", # ROMAN NUMERAL ONE HUNDRED
        "\x{2CA4}", # COPTIC CAPITAL LETTER SIMA
    ],

    "D" => [
        "\x{0044}", # LATIN CAPITAL LETTER D
        "\x{13A0}", # CHEROKEE LETTER A
        "\x{15EA}", # CANADIAN SYLLABICS CARRIER PE
        "\x{216E}", # ROMAN NUMERAL FIVE HUNDRED
    ],

    "E" => [
        "\x{0045}", # LATIN CAPITAL LETTER E
        "\x{0395}", # GREEK CAPITAL LETTER EPSILON
        "\x{0415}", # CYRILLIC CAPITAL LETTER IE
        "\x{13AC}", # CHEROKEE LETTER GV
    ],

    "F" => [
        "\x{0046}", # LATIN CAPITAL LETTER F
        "\x{15B4}", # CANADIAN SYLLABICS BLACKFOOT WE
    ],

    "G" => [
        "\x{0047}", # LATIN CAPITAL LETTER G
        "\x{050C}", # CYRILLIC CAPITAL LETTER KOMI SJE
        "\x{13C0}", # CHEROKEE LETTER NAH
    ],

    "H" => [
        "\x{0048}", # LATIN CAPITAL LETTER H
        "\x{0397}", # GREEK CAPITAL LETTER ETA
        "\x{041D}", # CYRILLIC CAPITAL LETTER EN
        "\x{12D8}", # ETHIOPIC SYLLABLE ZA
        "\x{13BB}", # CHEROKEE LETTER MI
        "\x{157C}", # CANADIAN SYLLABICS NUNAVUT H
        "\x{2C8E}", # COPTIC CAPITAL LETTER HATE
    ],

    "I" => [
        "\x{0049}", # LATIN CAPITAL LETTER I
        "\x{0399}", # GREEK CAPITAL LETTER IOTA
        "\x{0406}", # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
        "\x{2160}", # ROMAN NUMERAL ONE
    ],

    "J" => [
        "\x{004A}", # LATIN CAPITAL LETTER J
        "\x{0408}", # CYRILLIC CAPITAL LETTER JE
        "\x{13AB}", # CHEROKEE LETTER GU
        "\x{148D}", # CANADIAN SYLLABICS CO
    ],

    "K" => [
        "\x{004B}", # LATIN CAPITAL LETTER K
        "\x{039A}", # GREEK CAPITAL LETTER KAPPA
        "\x{13E6}", # CHEROKEE LETTER TSO
        "\x{16D5}", # RUNIC LETTER OPEN-P
        "\x{212A}", # KELVIN SIGN
        "\x{2C94}", # COPTIC CAPITAL LETTER KAPA
    ],

    "L" => [
        "\x{004C}", # LATIN CAPITAL LETTER L
        "\x{13DE}", # CHEROKEE LETTER TLE
        "\x{14AA}", # CANADIAN SYLLABICS MA
        "\x{216C}", # ROMAN NUMERAL FIFTY
    ],

    "M" => [
        "\x{004D}", # LATIN CAPITAL LETTER M
        "\x{039C}", # GREEK CAPITAL LETTER MU
        "\x{03FA}", # GREEK CAPITAL LETTER SAN
        "\x{041C}", # CYRILLIC CAPITAL LETTER EM
        "\x{13B7}", # CHEROKEE LETTER LU
        "\x{216F}", # ROMAN NUMERAL ONE THOUSAND
    ],

    "N" => [
        "\x{004E}", # LATIN CAPITAL LETTER N
        "\x{039D}", # GREEK CAPITAL LETTER NU
        "\x{2C9A}", # COPTIC CAPITAL LETTER NI
    ],

    "O" => [
        "\x{004F}", # LATIN CAPITAL LETTER O
        "\x{039F}", # GREEK CAPITAL LETTER OMICRON
        "\x{041E}", # CYRILLIC CAPITAL LETTER O
        "\x{2C9E}", # COPTIC CAPITAL LETTER O
    ],

    "P" => [
        "\x{0050}", # LATIN CAPITAL LETTER P
        "\x{03A1}", # GREEK CAPITAL LETTER RHO
        "\x{0420}", # CYRILLIC CAPITAL LETTER ER
        "\x{13E2}", # CHEROKEE LETTER TLV
        "\x{2CA2}", # COPTIC CAPITAL LETTER RO
    ],

    "Q" => [
        "\x{0051}", # LATIN CAPITAL LETTER Q
        "\x{051A}", # CYRILLIC CAPITAL LETTER QA
        "\x{2D55}", # TIFINAGH LETTER YARR
    ],

    "R" => [
        "\x{0052}", # LATIN CAPITAL LETTER R
        "\x{13A1}", # CHEROKEE LETTER E
        "\x{13D2}", # CHEROKEE LETTER SV
        "\x{1587}", # CANADIAN SYLLABICS TLHI
    ],

    "S" => [
        "\x{0053}", # LATIN CAPITAL LETTER S
        "\x{0405}", # CYRILLIC CAPITAL LETTER DZE
        "\x{13DA}", # CHEROKEE LETTER DU
    ],

    "T" => [
        "\x{0054}", # LATIN CAPITAL LETTER T
        "\x{03A4}", # GREEK CAPITAL LETTER TAU
        "\x{0422}", # CYRILLIC CAPITAL LETTER TE
        "\x{13A2}", # CHEROKEE LETTER I
    ],

    "V" => [
        "\x{0056}", # LATIN CAPITAL LETTER V
        "\x{13D9}", # CHEROKEE LETTER DO
        "\x{2164}", # ROMAN NUMERAL FIVE
    ],

    "W" => [
        "\x{0057}", # LATIN CAPITAL LETTER W
        "\x{13B3}", # CHEROKEE LETTER LA
        "\x{13D4}", # CHEROKEE LETTER TA
    ],

    "X" => [
        "\x{0058}", # LATIN CAPITAL LETTER X
        "\x{03A7}", # GREEK CAPITAL LETTER CHI
        "\x{0425}", # CYRILLIC CAPITAL LETTER HA
        "\x{2169}", # ROMAN NUMERAL TEN
        "\x{2CAC}", # COPTIC CAPITAL LETTER KHI
    ],

    "Y" => [
        "\x{0059}", # LATIN CAPITAL LETTER Y
        "\x{03A5}", # GREEK CAPITAL LETTER UPSILON
        "\x{2CA8}", # COPTIC CAPITAL LETTER UA
    ],

    "Z" => [
        "\x{005A}", # LATIN CAPITAL LETTER Z
        "\x{0396}", # GREEK CAPITAL LETTER ZETA
        "\x{13C3}", # CHEROKEE LETTER NO
    ],

    "[" => [
        "\x{005B}", # LEFT SQUARE BRACKET
        "\x{FF3B}", # FULLWIDTH LEFT SQUARE BRACKET
    ],

    "\\" => [
        "\x{005C}", # REVERSE SOLIDUS
        "\x{2216}", # SET MINUS
        "\x{29F5}", # REVERSE SOLIDUS OPERATOR
        "\x{29F9}", # BIG REVERSE SOLIDUS
        "\x{FE68}", # SMALL REVERSE SOLIDUS
        "\x{FF3C}", # FULLWIDTH REVERSE SOLIDUS
    ],

    "]" => [
        "\x{005D}", # RIGHT SQUARE BRACKET
        "\x{FF3D}", # FULLWIDTH RIGHT SQUARE BRACKET
    ],

    "^" => [
        "\x{005E}", # CIRCUMFLEX ACCENT
        "\x{02C4}", # MODIFIER LETTER UP ARROWHEAD
        "\x{02C6}", # MODIFIER LETTER CIRCUMFLEX ACCENT
        "\x{1DBA}", # MODIFIER LETTER SMALL TURNED V
        "\x{2303}", # UP ARROWHEAD
        "\x{FF3E}", # FULLWIDTH CIRCUMFLEX ACCENT
    ],

    "_" => [
        "\x{005F}", # LOW LINE
        "\x{02CD}", # MODIFIER LETTER LOW MACRON
        "\x{268A}", # MONOGRAM FOR YANG
        "\x{FF3F}", # FULLWIDTH LOW LINE
    ],

    "`" => [
        "\x{0060}", # GRAVE ACCENT
        "\x{02CB}", # MODIFIER LETTER GRAVE ACCENT
        "\x{1FEF}", # GREEK VARIA
        "\x{2035}", # REVERSED PRIME
        "\x{FF40}", # FULLWIDTH GRAVE ACCENT
    ],

    "a" => [
        "\x{0061}", # LATIN SMALL LETTER A
        "\x{0251}", # LATIN SMALL LETTER ALPHA
        "\x{0430}", # CYRILLIC SMALL LETTER A
    ],

    "c" => [
        "\x{0063}", # LATIN SMALL LETTER C
        "\x{03F2}", # GREEK LUNATE SIGMA SYMBOL
        "\x{0441}", # CYRILLIC SMALL LETTER ES
        "\x{217D}", # SMALL ROMAN NUMERAL ONE HUNDRED
    ],

    "d" => [
        "\x{0064}", # LATIN SMALL LETTER D
        "\x{0501}", # CYRILLIC SMALL LETTER KOMI DE
        "\x{217E}", # SMALL ROMAN NUMERAL FIVE HUNDRED
    ],

    "e" => [
        "\x{0065}", # LATIN SMALL LETTER E
        "\x{0435}", # CYRILLIC SMALL LETTER IE
        "\x{1971}", # TAI LE LETTER TONE-3
    ],

    "g" => [
        "\x{0067}", # LATIN SMALL LETTER G
        "\x{0261}", # LATIN SMALL LETTER SCRIPT G
    ],

    "h" => [
        "\x{0068}", # LATIN SMALL LETTER H
        "\x{04BB}", # CYRILLIC SMALL LETTER SHHA
    ],

    "i" => [
        "\x{0069}", # LATIN SMALL LETTER I
        "\x{0456}", # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        "\x{2170}", # SMALL ROMAN NUMERAL ONE
    ],

    "j" => [
        "\x{006A}", # LATIN SMALL LETTER J
        "\x{03F3}", # GREEK LETTER YOT
        "\x{0458}", # CYRILLIC SMALL LETTER JE
    ],

    "l" => [
        "\x{006C}", # LATIN SMALL LETTER L
        "\x{217C}", # SMALL ROMAN NUMERAL FIFTY
    ],

    "m" => [
        "\x{006D}", # LATIN SMALL LETTER M
        "\x{217F}", # SMALL ROMAN NUMERAL ONE THOUSAND
    ],

    "n" => [
        "\x{006E}", # LATIN SMALL LETTER N
        "\x{1952}", # TAI LE LETTER NGA
    ],

    "o" => [
        "\x{006F}", # LATIN SMALL LETTER O
        "\x{03BF}", # GREEK SMALL LETTER OMICRON
        "\x{043E}", # CYRILLIC SMALL LETTER O
        "\x{0D20}", # MALAYALAM LETTER TTHA
        "\x{2C9F}", # COPTIC SMALL LETTER O
    ],

    "p" => [
        "\x{0070}", # LATIN SMALL LETTER P
        "\x{0440}", # CYRILLIC SMALL LETTER ER
        "\x{2CA3}", # COPTIC SMALL LETTER RO
    ],

    # The original table listed "\x{0073}" twice here; the redundant entry
    # has been dropped.
    "s" => [
        "\x{0073}", # LATIN SMALL LETTER S
        "\x{0455}", # CYRILLIC SMALL LETTER DZE
    ],

    "u" => [
        "\x{0075}", # LATIN SMALL LETTER U
        "\x{1959}", # TAI LE LETTER PA
        "\x{222A}", # UNION
    ],

    "v" => [
        "\x{0076}", # LATIN SMALL LETTER V
        "\x{1D20}", # LATIN LETTER SMALL CAPITAL V
        "\x{2174}", # SMALL ROMAN NUMERAL FIVE
        "\x{2228}", # LOGICAL OR
        "\x{22C1}", # N-ARY LOGICAL OR
    ],

    "w" => [
        "\x{0077}", # LATIN SMALL LETTER W
        "\x{1D21}", # LATIN LETTER SMALL CAPITAL W
    ],

    "x" => [
        "\x{0078}", # LATIN SMALL LETTER X
        "\x{0445}", # CYRILLIC SMALL LETTER HA
        "\x{2179}", # SMALL ROMAN NUMERAL TEN
        "\x{2CAD}", # COPTIC SMALL LETTER KHI
    ],

    "y" => [
        "\x{0079}", # LATIN SMALL LETTER Y
        "\x{0443}", # CYRILLIC SMALL LETTER U
        "\x{1EFF}", # LATIN SMALL LETTER Y WITH LOOP
    ],

    "z" => [
        "\x{007A}", # LATIN SMALL LETTER Z
        "\x{1D22}", # LATIN LETTER SMALL CAPITAL Z
    ],

    "{" => [
        "\x{007B}", # LEFT CURLY BRACKET
        "\x{FE5B}", # SMALL LEFT CURLY BRACKET
        "\x{FF5B}", # FULLWIDTH LEFT CURLY BRACKET
    ],

    "|" => [
        "\x{007C}", # VERTICAL LINE
        "\x{01C0}", # LATIN LETTER DENTAL CLICK
        "\x{16C1}", # RUNIC LETTER ISAZ IS ISS I
        "\x{239C}", # LEFT PARENTHESIS EXTENSION
        "\x{239F}", # RIGHT PARENTHESIS EXTENSION
        "\x{23A2}", # LEFT SQUARE BRACKET EXTENSION
        "\x{23A5}", # RIGHT SQUARE BRACKET EXTENSION
        "\x{23AA}", # CURLY BRACKET EXTENSION
        "\x{23AE}", # INTEGRAL EXTENSION
        "\x{FF5C}", # FULLWIDTH VERTICAL LINE
        "\x{FFE8}", # HALFWIDTH FORMS LIGHT VERTICAL
    ],

    "}" => [
        "\x{007D}", # RIGHT CURLY BRACKET
        "\x{FE5C}", # SMALL RIGHT CURLY BRACKET
        "\x{FF5D}", # FULLWIDTH RIGHT CURLY BRACKET
    ],

    "~" => [
        "\x{007E}", # TILDE
        "\x{02DC}", # SMALL TILDE
        "\x{2053}", # SWUNG DASH
        "\x{223C}", # TILDE OPERATOR
        "\x{FF5E}", # FULLWIDTH TILDE
    ],
);
643              
644              
# Reverse lookup table: lookalike character => the ASCII character it
# imitates.  Empty until _build_replace_map() has been called.
my %replace_map;

# Fill %replace_map by inverting %homoglyphs: every character listed under an
# ASCII key (including the ASCII character itself) is mapped back to that key.
sub _build_replace_map {
    for my $ascii_char (keys %homoglyphs) {
        my $lookalikes = $homoglyphs{$ascii_char};

        # Hash-slice assignment: one key per lookalike, all sharing one value.
        @replace_map{ @{$lookalikes} } = ($ascii_char) x @{$lookalikes};
    }
}
653              
654              
=head2 replace_homoglyphs

    my $ascii_text = replace_homoglyphs($text);

Takes a string and returns a copy in which every character known to be a
homoglyph of an ASCII character has been replaced by that ASCII character.
All other characters are passed through untouched.

An empty string yields an empty string, and C<undef> yields C<undef>.

=cut

# TODO: this would probably be much more efficient if we build up a tr///
# transliteration, I suspect.
sub replace_homoglyphs {
    my $input = shift;

    # Nothing to translate; hand undef straight back rather than letting
    # split() warn about it.
    return $input unless defined $input;

    # The reverse map is built once, on first use.
    _build_replace_map() unless keys %replace_map;

    # join() on an empty list gives '', so empty input returns '' and not
    # undef.  exists/ternary is used instead of the defined-or operator so
    # that the code still compiles on the Perl 5.8 this module declares as
    # its minimum.
    return join '',
        map { exists $replace_map{$_} ? $replace_map{$_} : $_ }
        split //, $input;
}
666              
667              
668              
669              
=head2 disguise

    my $disguised = disguise($text);

Mostly for testing: takes a string and, for each character that has an entry
in C<%homoglyphs>, substitutes one of the listed alternatives chosen at
random (the list includes the ASCII character itself, so a character may be
left as it is).  Characters with no entry are copied through unchanged.

An empty string yields an empty string, and C<undef> yields C<undef>.

=cut

sub disguise {
    my $input = shift;

    # Nothing to disguise; hand undef straight back rather than letting
    # split() warn about it.
    return $input unless defined $input;

    # Start from '' so that empty input returns '' instead of undef.
    my $result = '';
    for my $char (split //, $input) {
        my $candidates = $homoglyphs{$char};
        $result .= $candidates
            ? $candidates->[ int rand @{$candidates} ]
            : $char;
    }
    return $result;
}
684              
685              
686             =head1 AUTHOR
687              
688             David Precious, C<< >>
689              
690             =head1 BUGS
691              
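692             Please report any bugs or feature requests to C<bug-unicode-homoglyph-replace at rt.cpan.org>, or through
693             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Unicode-Homoglyph-Replace>. I will be notified, and then you'll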
694             automatically be notified of progress on your bug as I make changes.
695              
696              
697              
698              
699             =head1 SUPPORT
700              
701             You can find documentation for this module with the perldoc command.
702              
703             perldoc Unicode::Homoglyph::Replace
704              
705              
706             You can also look for information at:
707              
708             =over 4
709              
710             =item * RT: CPAN's request tracker (report bugs here)
711              
712             L
713              
714             =item * AnnoCPAN: Annotated CPAN documentation
715              
716             L
717              
718             =item * CPAN Ratings
719              
720             L
721              
722             =item * Search CPAN
723              
724             L
725              
726             =back
727              
728              
729             =head1 SEE ALSO
730              
731             L<Unicode::Homoglyph>, where the list of homoglyphs came from.
732              
733              
734             =head1 ACKNOWLEDGEMENTS
735              
736              
737             =head1 LICENSE AND COPYRIGHT
738              
739             Copyright 2018 David Precious.
740              
741             This program is free software; you can redistribute it and/or modify it
742             under the terms of the Artistic License (2.0). You may obtain a
743             copy of the full license at:
744              
745             L
746              
747             Any use, modification, and distribution of the Standard or Modified
748             Versions is governed by this Artistic License. By using, modifying or
749             distributing the Package, you accept this license. Do not use, modify,
750             or distribute the Package, if you do not accept this license.
751              
752             If your Modified Version has been derived from a Modified Version made
753             by someone other than you, you are nevertheless required to ensure that
754             your Modified Version complies with the requirements of this license.
755              
756             This license does not grant you the right to use any trademark, service
757             mark, tradename, or logo of the Copyright Holder.
758              
759             This license includes the non-exclusive, worldwide, free-of-charge
760             patent license to make, have made, use, offer to sell, sell, import and
761             otherwise transfer the Package with respect to any patent claims
762             licensable by the Copyright Holder that are necessarily infringed by the
763             Package. If you institute patent litigation (including a cross-claim or
764             counterclaim) against any party alleging that the Package constitutes
765             direct or contributory patent infringement, then this Artistic License
766             to you shall terminate on the date that such litigation is filed.
767              
768             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
769             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
770             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
771             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
772             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
773             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
774             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
775             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
776              
777              
778             =cut
779              
780             1; # End of Unicode::Homoglyph::Replace