File Coverage

blib/lib/Twitter/Text/Regexp.pm
Criterion Covered Total %
statement 16 16 100.0
branch 2 2 100.0
condition n/a
subroutine 5 5 100.0
pod 0 1 0.0
total 23 24 95.8


line stmt bran cond sub pod time code
1             package
2             Twitter::Text::Regexp; # hide from PAUSE
3 4     4   29 use strict;
  4         10  
  4         128  
4 4     4   22 use warnings;
  4         6  
  4         109  
5 4     4   20 use utf8;
  4         6  
  4         27  
6 4     4   540 use Twitter::Text::Util qw(load_yaml);
  4         10  
  4         3692  
7              
8             # internal use only, do not use this module directly.
9              
10             sub regex_range {
11 68     68 0 111 my ($from, $to) = @_;
12              
13 68 100       115 if (defined $to) {
14 32         139 return pack('U', $from) . '-' . pack('U', $to);
15             } else {
16 36         108 return pack('U', $from);
17             }
18             }
19              
20             our $TLDS = load_yaml("tld_lib.yml")->[0];
21             our $PUNCTUATION_CHARS = '!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~';
22             our $SPACE_CHARS = " \t\n\x0B\f\r";
23             our $CTRL_CHARS = "\x00-\x1F\x7F";
24             our $INVALID_CHARACTERS = join '', map { pack 'U', $_ } (
25             0xFFFE, 0xFEFF, # BOM
26             0xFFFF, # Special
27             );
28             our $UNICODE_SPACES = join '', map { pack 'U*', $_ } (
29             (0x0009..0x000D), # White_Space # Cc [5] ..
30             0x0020, # White_Space # Zs SPACE
31             0x0085, # White_Space # Cc
32             0x00A0, # White_Space # Zs NO-BREAK SPACE
33             0x1680, # White_Space # Zs OGHAM SPACE MARK
34             0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
35             (0x2000..0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE
36             0x2028, # White_Space # Zl LINE SEPARATOR
37             0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
38             0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
39             0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
40             0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
41             );
42              
43             our $DIRECTIONAL_CHARACTERS = join '', map { pack 'U', $_ } (
44             0x061C, # ARABIC LETTER MARK (ALM)
45             0x200E, # LEFT-TO-RIGHT MARK (LRM)
46             0x200F, # RIGHT-TO-LEFT MARK (RLM)
47             0x202A, # LEFT-TO-RIGHT EMBEDDING (LRE)
48             0x202B, # RIGHT-TO-LEFT EMBEDDING (RLE)
49             0x202C, # POP DIRECTIONAL FORMATTING (PDF)
50             0x202D, # LEFT-TO-RIGHT OVERRIDE (LRO)
51             0x202E, # RIGHT-TO-LEFT OVERRIDE (RLO)
52             0x2066, # LEFT-TO-RIGHT ISOLATE (LRI)
53             0x2067, # RIGHT-TO-LEFT ISOLATE (RLI)
54             0x2068, # FIRST STRONG ISOLATE (FSI)
55             0x2069, # POP DIRECTIONAL ISOLATE (PDI)
56             );
57             our $DOMAIN_VALID_CHARS = "[^$DIRECTIONAL_CHARACTERS$PUNCTUATION_CHARS$SPACE_CHARS$CTRL_CHARS$INVALID_CHARACTERS$UNICODE_SPACES]";
58              
59             our $LATIN_ACCENTS = join '', (
60             regex_range(0xc0, 0xd6),
61             regex_range(0xd8, 0xf6),
62             regex_range(0xf8, 0xff),
63             regex_range(0x0100, 0x024f),
64             regex_range(0x0253, 0x0254),
65             regex_range(0x0256, 0x0257),
66             regex_range(0x0259),
67             regex_range(0x025b),
68             regex_range(0x0263),
69             regex_range(0x0268),
70             regex_range(0x026f),
71             regex_range(0x0272),
72             regex_range(0x0289),
73             regex_range(0x028b),
74             regex_range(0x02bb),
75             regex_range(0x0300, 0x036f),
76             regex_range(0x1e00, 0x1eff)
77             );
78             our $latin_accents = qr/[$LATIN_ACCENTS]+/o;
79              
80             our $HASHTAG_LETTERS_AND_MARKS = '\p{L}\p{M}' .
81             "\N{U+037f}\N{U+0528}-\N{U+052f}\N{U+08a0}-\N{U+08b2}\N{U+08e4}-\N{U+08ff}\N{U+0978}\N{U+0980}\N{U+0c00}\N{U+0c34}\N{U+0c81}\N{U+0d01}\N{U+0ede}\N{U+0edf}" .
82             "\N{U+10c7}\N{U+10cd}\N{U+10fd}-\N{U+10ff}\N{U+16f1}-\N{U+16f8}\N{U+17b4}\N{U+17b5}\N{U+191d}\N{U+191e}\N{U+1ab0}-\N{U+1abe}\N{U+1bab}-\N{U+1bad}\N{U+1bba}-" .
83             "\N{U+1bbf}\N{U+1cf3}-\N{U+1cf6}\N{U+1cf8}\N{U+1cf9}\N{U+1de7}-\N{U+1df5}\N{U+2cf2}\N{U+2cf3}\N{U+2d27}\N{U+2d2d}\N{U+2d66}\N{U+2d67}\N{U+9fcc}\N{U+a674}-" .
84             "\N{U+a67b}\N{U+a698}-\N{U+a69d}\N{U+a69f}\N{U+a792}-\N{U+a79f}\N{U+a7aa}-\N{U+a7ad}\N{U+a7b0}\N{U+a7b1}\N{U+a7f7}-\N{U+a7f9}\N{U+a9e0}-\N{U+a9ef}\N{U+a9fa}-" .
85             "\N{U+a9fe}\N{U+aa7c}-\N{U+aa7f}\N{U+aae0}-\N{U+aaef}\N{U+aaf2}-\N{U+aaf6}\N{U+ab30}-\N{U+ab5a}\N{U+ab5c}-\N{U+ab5f}\N{U+ab64}\N{U+ab65}\N{U+f870}-\N{U+f87f}" .
86             "\N{U+f882}\N{U+f884}-\N{U+f89f}\N{U+f8b8}\N{U+f8c1}-\N{U+f8d6}\N{U+fa2e}\N{U+fa2f}\N{U+fe27}-\N{U+fe2d}\N{U+102e0}\N{U+1031f}\N{U+10350}-\N{U+1037a}" .
87             "\N{U+10500}-\N{U+10527}\N{U+10530}-\N{U+10563}\N{U+10600}-\N{U+10736}\N{U+10740}-\N{U+10755}\N{U+10760}-\N{U+10767}" .
88             "\N{U+10860}-\N{U+10876}\N{U+10880}-\N{U+1089e}\N{U+10980}-\N{U+109b7}\N{U+109be}\N{U+109bf}\N{U+10a80}-\N{U+10a9c}" .
89             "\N{U+10ac0}-\N{U+10ac7}\N{U+10ac9}-\N{U+10ae6}\N{U+10b80}-\N{U+10b91}\N{U+1107f}\N{U+110d0}-\N{U+110e8}\N{U+11100}-" .
90             "\N{U+11134}\N{U+11150}-\N{U+11173}\N{U+11176}\N{U+11180}-\N{U+111c4}\N{U+111da}\N{U+11200}-\N{U+11211}\N{U+11213}-" .
91             "\N{U+11237}\N{U+112b0}-\N{U+112ea}\N{U+11301}-\N{U+11303}\N{U+11305}-\N{U+1130c}\N{U+1130f}\N{U+11310}\N{U+11313}-" .
92             "\N{U+11328}\N{U+1132a}-\N{U+11330}\N{U+11332}\N{U+11333}\N{U+11335}-\N{U+11339}\N{U+1133c}-\N{U+11344}\N{U+11347}" .
93             "\N{U+11348}\N{U+1134b}-\N{U+1134d}\N{U+11357}\N{U+1135d}-\N{U+11363}\N{U+11366}-\N{U+1136c}\N{U+11370}-\N{U+11374}" .
94             "\N{U+11480}-\N{U+114c5}\N{U+114c7}\N{U+11580}-\N{U+115b5}\N{U+115b8}-\N{U+115c0}\N{U+11600}-\N{U+11640}\N{U+11644}" .
95             "\N{U+11680}-\N{U+116b7}\N{U+118a0}-\N{U+118df}\N{U+118ff}\N{U+11ac0}-\N{U+11af8}\N{U+1236f}-\N{U+12398}\N{U+16a40}-" .
96             "\N{U+16a5e}\N{U+16ad0}-\N{U+16aed}\N{U+16af0}-\N{U+16af4}\N{U+16b00}-\N{U+16b36}\N{U+16b40}-\N{U+16b43}\N{U+16b63}-" .
97             "\N{U+16b77}\N{U+16b7d}-\N{U+16b8f}\N{U+16f00}-\N{U+16f44}\N{U+16f50}-\N{U+16f7e}\N{U+16f8f}-\N{U+16f9f}\N{U+1bc00}-" .
98             "\N{U+1bc6a}\N{U+1bc70}-\N{U+1bc7c}\N{U+1bc80}-\N{U+1bc88}\N{U+1bc90}-\N{U+1bc99}\N{U+1bc9d}\N{U+1bc9e}\N{U+1e800}-" .
99             "\N{U+1e8c4}\N{U+1e8d0}-\N{U+1e8d6}\N{U+1ee00}-\N{U+1ee03}\N{U+1ee05}-\N{U+1ee1f}\N{U+1ee21}\N{U+1ee22}\N{U+1ee24}" .
100             "\N{U+1ee27}\N{U+1ee29}-\N{U+1ee32}\N{U+1ee34}-\N{U+1ee37}\N{U+1ee39}\N{U+1ee3b}\N{U+1ee42}\N{U+1ee47}\N{U+1ee49}" .
101             "\N{U+1ee4b}\N{U+1ee4d}-\N{U+1ee4f}\N{U+1ee51}\N{U+1ee52}\N{U+1ee54}\N{U+1ee57}\N{U+1ee59}\N{U+1ee5b}\N{U+1ee5d}\N{U+1ee5f}" .
102             "\N{U+1ee61}\N{U+1ee62}\N{U+1ee64}\N{U+1ee67}-\N{U+1ee6a}\N{U+1ee6c}-\N{U+1ee72}\N{U+1ee74}-\N{U+1ee77}\N{U+1ee79}-" .
103             "\N{U+1ee7c}\N{U+1ee7e}\N{U+1ee80}-\N{U+1ee89}\N{U+1ee8b}-\N{U+1ee9b}\N{U+1eea1}-\N{U+1eea3}\N{U+1eea5}-\N{U+1eea9}" .
104             "\N{U+1eeab}-\N{U+1eebb}";
105              
106             our $HASHTAG_NUMERALS = "\\p{Nd}" .
107             "\N{U+0de6}-\N{U+0def}\N{U+a9f0}-\N{U+a9f9}\N{U+110f0}-\N{U+110f9}\N{U+11136}-\N{U+1113f}\N{U+111d0}-\N{U+111d9}\N{U+112f0}-" .
108             "\N{U+112f9}\N{U+114d0}-\N{U+114d9}\N{U+11650}-\N{U+11659}\N{U+116c0}-\N{U+116c9}\N{U+118e0}-\N{U+118e9}\N{U+16a60}-" .
109             "\N{U+16a69}\N{U+16b50}-\N{U+16b59}";
110              
111             our $HASHTAG_SPECIAL_CHARS = "_\N{U+200c}\N{U+200d}\N{U+a67e}\N{U+05be}\N{U+05f3}\N{U+05f4}\N{U+ff5e}\N{U+301c}\N{U+309b}\N{U+309c}\N{U+30a0}\N{U+30fb}\N{U+3003}\N{U+0f0b}\N{U+0f0c}\N{U+00b7}";
112              
113             our $HASHTAG_LETTERS_NUMERALS = "$HASHTAG_LETTERS_AND_MARKS$HASHTAG_NUMERALS$HASHTAG_SPECIAL_CHARS";
114             our $HASHTAG_LETTERS_NUMERALS_SET = "[$HASHTAG_LETTERS_NUMERALS]";
115             our $HASHTAG_LETTERS_SET = "[$HASHTAG_LETTERS_AND_MARKS]";
116              
117             our $HASHTAG = qr/(\A|\N{U+fe0e}|\N{U+fe0f}|[^&$HASHTAG_LETTERS_NUMERALS])(#|#)(?!\N{U+fe0f}|\N{U+20e3})($HASHTAG_LETTERS_NUMERALS_SET*$HASHTAG_LETTERS_SET$HASHTAG_LETTERS_NUMERALS_SET*)/i;
118              
119             our $valid_hashtag = qr/$HASHTAG/i;
120             our $end_hashtag_match = qr/\A(?:[##]|:\/\/)/;
121              
122             our $valid_mention_preceding_chars = qr/(?:[^a-z0-9_!#\$%&*@@]|^|(?:^|[^a-z0-9_+~.-])[rR][tT]:?)/i;
123             our $at_signs = qr/[@@]/;
124             our $valid_mention_or_list = qr/
125             ($valid_mention_preceding_chars) # $1: Preceding character
126             ($at_signs) # $2: At mark
127             ([a-z0-9_]{1,20}) # $3: Screen name
128             (\/[a-z][a-zA-Z0-9_\-]{0,24})? # $4: List (optional)
129             /ix;
130             our $valid_reply = qr/^(?:[$UNICODE_SPACES$DIRECTIONAL_CHARACTERS])*$at_signs([a-z0-9_]{1,20})/i;
131             # Used in Extractor for final filtering
132             our $end_mention_match = qr/\A(?:$at_signs|$latin_accents|:\/\/)/i;
133              
134             our $valid_subdomain = qr/(?:(?:$DOMAIN_VALID_CHARS(?:[_-]|$DOMAIN_VALID_CHARS)*)?$DOMAIN_VALID_CHARS\.)/i;
135             our $valid_domain_name = qr/(?:(?:$DOMAIN_VALID_CHARS(?:[-]|$DOMAIN_VALID_CHARS)*)?$DOMAIN_VALID_CHARS\.)/i;
136              
137             our $GENERIC_TLDS = join '|', @{$TLDS->{generic}};
138             our $CC_TLDS = join '|', @{$TLDS->{country}};
139              
140             our $valid_gTLD = qr{
141             (?:
142             (?:$GENERIC_TLDS)
143             (?=[^0-9a-z@+-]|$)
144             )
145             }ix;
146              
147             our $valid_ccTLD = qr{
148             (?:
149             (?:$CC_TLDS)
150             (?=[^0-9a-z@+-]|$)
151             )
152             }ix;
153             our $valid_punycode = qr/(?:xn--[0-9a-z]+)/i;
154              
155             our $valid_domain = qr/(?:
156             $valid_subdomain*$valid_domain_name
157             (?:$valid_gTLD|$valid_ccTLD|$valid_punycode)
158             )/ix;
159              
160             # This is used in Extractor
161             our $valid_ascii_domain = qr/
162             (?:(?:[a-z0-9\-_]|$latin_accents)+\.)+
163             (?:$valid_gTLD|$valid_ccTLD|$valid_punycode)
164             /ix;
165              
166             # This is used in Extractor for stricter t.co URL extraction
167             our $valid_tco_url = qr/^https?:\/\/t\.co\/([a-z0-9]+)/i;
168              
169             our $valid_port_number = qr/[0-9]+/;
170              
171             our $valid_url_preceding_chars = qr/(?:[^A-Z0-9@@\$##$INVALID_CHARACTERS]|[$DIRECTIONAL_CHARACTERS]|^)/i;
172             our $invalid_url_without_protocol_preceding_chars = qr/[-_.\/]$/;
173              
174             our $valid_general_url_path_chars = qr/[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%#\[\]\p{Pd}_~&\|$LATIN_ACCENTS]/i;
175             # Allow URL paths to contain up to two nested levels of balanced parens
176             # 1. Used in Wikipedia URLs like /Primer_(film)
177             # 2. Used in IIS sessions like /S(dfd346)/
178             # 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
179             our $valid_url_balanced_parens = qr/
180             \(
181             (?:
182             $valid_general_url_path_chars+
183             |
184             # allow one nested level of balanced parentheses
185             (?:
186             $valid_general_url_path_chars*
187             \(
188             $valid_general_url_path_chars+
189             \)
190             $valid_general_url_path_chars*
191             )
192             )
193             \)
194             /ix;
195             # Valid end-of-path characters (so /foo. does not gobble the period).
196             # 1. Allow =&# for empty URL parameters and other URL-join artifacts
197             our $valid_url_path_ending_chars = qr/[a-z\p{Cyrillic}0-9=_#\/\+\-$LATIN_ACCENTS]|(?:$valid_url_balanced_parens)/i;
198             our $valid_url_path = qr/(?:
199             (?:
200             $valid_general_url_path_chars*
201             (?:$valid_url_balanced_parens $valid_general_url_path_chars*)*
202             $valid_url_path_ending_chars
203             )|(?:$valid_general_url_path_chars+\/)
204             )/ix;
205             our $valid_url_query_chars = qr/[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/i;
206             our $valid_url_query_ending_chars = qr/[a-z0-9_&=#\/\-]/i;
207             our $valid_url = qr{
208             ( # $1 total match
209             ($valid_url_preceding_chars) # $2 Preceding character
210             ( # $3 URL
211             (https?:\/\/)? # $4 Protocol (optional)
212             ($valid_domain) # $5 Domain(s)
213             (?::($valid_port_number))? # $6 Port number (optional)
214             (/$valid_url_path*)? # $7 URL Path and anchor
215             (\?$valid_url_query_chars*$valid_url_query_ending_chars)? # $8 Query String
216             )
217             )}ix;
218              
219             our $cashtag = qr/[a-z]{1,6}(?:[._][a-z]{1,2})?/i;
220             our $valid_cashtag = qr/(^|[$UNICODE_SPACES$DIRECTIONAL_CHARACTERS])(\$)($cashtag)(?=$|\s|[$PUNCTUATION_CHARS])/i;
221              
222             # These URL validation pattern strings are based on the ABNF from RFC 3986
223             our $validate_url_unreserved = qr/[a-z\p{Cyrillic}0-9\p{Pd}._~]/i;
224             our $validate_url_pct_encoded = qr/(?:%[0-9a-f]{2})/i;
225             our $validate_url_sub_delims = qr/[!\$&'()*+,;=]/i;
226             our $validate_url_pchar = qr/(?:
227             $validate_url_unreserved|
228             $validate_url_pct_encoded|
229             $validate_url_sub_delims|
230             [:\|@]
231             )/ix;
232              
233             our $validate_url_scheme = qr/(?:[a-z][a-z0-9+\-.]*)/i;
234             our $validate_url_userinfo = qr/(?:
235             $validate_url_unreserved|
236             $validate_url_pct_encoded|
237             $validate_url_sub_delims|
238             :
239             )*/ix;
240              
241             our $validate_url_dec_octet = qr/(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))/i;
242             our $validate_url_ipv4 =
243             qr/(?:$validate_url_dec_octet(?:\.$validate_url_dec_octet){3})/ix;
244              
245             # Punting on real IPv6 validation for now
246             our $validate_url_ipv6 = qr/(?:\[[a-f0-9:\.]+\])/i;
247              
248             # Also punting on IPvFuture for now
249             our $validate_url_ip = qr/(?:
250             $validate_url_ipv4|
251             $validate_url_ipv6
252             )/ix;
253              
254             # This is more strict than the rfc specifies
255             our $validate_url_subdomain_segment = qr/(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)/i;
256             our $validate_url_domain_segment = qr/(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)/i;
257             our $validate_url_domain_tld = qr/(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)/i;
258             our $validate_url_domain = qr/(?:(?:$validate_url_subdomain_segment\.)*
259             (?:$validate_url_domain_segment\.)
260             $validate_url_domain_tld)/ix;
261              
262             our $validate_url_host = qr/(?:
263             $validate_url_ip|
264             $validate_url_domain
265             )/ix;
266              
267             # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
268             our $validate_url_unicode_subdomain_segment =
269             qr/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
270             our $validate_url_unicode_domain_segment =
271             qr/(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
272             our $validate_url_unicode_domain_tld =
273             qr/(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)/ix;
274             our $validate_url_unicode_domain = qr/(?:(?:$validate_url_unicode_subdomain_segment\.)*
275             (?:$validate_url_unicode_domain_segment\.)
276             $validate_url_unicode_domain_tld)/ix;
277              
278             our $validate_url_unicode_host = qr/(?:
279             $validate_url_ip|
280             $validate_url_unicode_domain
281             )/ix;
282              
283             our $validate_url_port = qr/[0-9]{1,5}/;
284              
285             our $validate_url_unicode_authority = qr{
286             (?:($validate_url_userinfo)@)? # $1 userinfo
287             ($validate_url_unicode_host) # $2 host
288             (?::($validate_url_port))? # $3 port
289             }ix;
290              
291             our $validate_url_authority = qr{
292             (?:($validate_url_userinfo)@)? # $1 userinfo
293             ($validate_url_host) # $2 host
294             (?::($validate_url_port))? # $3 port
295             }ix;
296              
297             our $validate_url_path = qr{(/$validate_url_pchar*)*}i;
298             our $validate_url_query = qr{($validate_url_pchar|/|\?)*}i;
299             our $validate_url_fragment = qr{($validate_url_pchar|/|\?)*}i;
300              
301             # Modified version of RFC 3986 Appendix B
302             our $validate_url_unencoded = qr{
303             \A # Full URL
304             (?:
305             ([^:/?#]+):// # $1 Scheme
306             )?
307             ([^/?#]*) # $2 Authority
308             ([^?#]*) # $3 Path
309             (?:
310             \?([^#]*) # $4 Query
311             )?
312             (?:
313             \#(.*) # $5 Fragment
314             )?\z
315             }ix;
316              
317             1;