line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Regexp::Common::microsyntax; |
2
|
|
|
|
|
|
|
|
3
|
6
|
|
|
6
|
|
424493
|
use strict; |
|
6
|
|
|
|
|
13
|
|
|
6
|
|
|
|
|
215
|
|
4
|
6
|
|
|
6
|
|
32
|
use warnings; |
|
6
|
|
|
|
|
14
|
|
|
6
|
|
|
|
|
172
|
|
5
|
6
|
|
|
6
|
|
1322
|
use utf8; |
|
6
|
|
|
|
|
24
|
|
|
6
|
|
|
|
|
45
|
|
6
|
6
|
|
|
6
|
|
2419
|
use Regexp::Common qw(pattern); |
|
6
|
|
|
|
|
7323
|
|
|
6
|
|
|
|
|
57
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '0.02'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# ------------------------------------------------------------------------- |
11
|
|
|
|
|
|
|
# Helper sets and regexes |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
my %REGEXEN = (); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
my $AT_SIGNS = '@@'; |
16
|
|
|
|
|
|
|
my $HASH_SIGNS = '##'; |
17
|
|
|
|
|
|
|
my $EXCLAMATION_SIGNS = '!'; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
my $UNICODE_SPACES = join '|', map { pack 'U*', $_ } |
20
|
|
|
|
|
|
|
0x0009 .. 0x000D, # White_Space # Cc [5] .. |
21
|
|
|
|
|
|
|
0x0020, # White_Space # Zs SPACE |
22
|
|
|
|
|
|
|
0x0085, # White_Space # Cc |
23
|
|
|
|
|
|
|
0x00A0, # White_Space # Zs NO-BREAK SPACE |
24
|
|
|
|
|
|
|
0x1680, # White_Space # Zs OGHAM SPACE MARK |
25
|
|
|
|
|
|
|
0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR |
26
|
|
|
|
|
|
|
0x2000 .. 0x200A, # White_Space # Zs [11] EN QUAD..HAIR SPACE |
27
|
|
|
|
|
|
|
0x2028, # White_Space # Zl LINE SEPARATOR |
28
|
|
|
|
|
|
|
0x2029, # White_Space # Zp PARAGRAPH SEPARATOR |
29
|
|
|
|
|
|
|
0x202F, # White_Space # Zs NARROW NO-BREAK SPACE |
30
|
|
|
|
|
|
|
0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE |
31
|
|
|
|
|
|
|
0x3000, # White_Space # Zs IDEOGRAPHIC SPACE |
32
|
|
|
|
|
|
|
; |
33
|
|
|
|
|
|
|
$REGEXEN{spaces} = qr/$UNICODE_SPACES/o; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Latin accented characters |
36
|
|
|
|
|
|
|
# Excludes 0xd7 from the range (the multiplication sign, confusable with "x"). |
37
|
|
|
|
|
|
|
# Also excludes 0xf7, the division sign |
38
|
|
|
|
|
|
|
my $LATIN_ACCENTS = join '', map { pack 'U*', $_ } |
39
|
|
|
|
|
|
|
0xc0 .. 0xd6, |
40
|
|
|
|
|
|
|
0xd8 .. 0xf6, |
41
|
|
|
|
|
|
|
0xf8 .. 0xff, |
42
|
|
|
|
|
|
|
; |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
my $NON_LATIN_HASHTAG_CHARS = join '', map { pack 'U*', $_ } |
45
|
|
|
|
|
|
|
# Cyrillic (Russian, Ukrainian, etc.) |
46
|
|
|
|
|
|
|
0x0400 .. 0x04ff, # Cyrillic |
47
|
|
|
|
|
|
|
0x0500 .. 0x0527, # Cyrillic Supplement |
48
|
|
|
|
|
|
|
# Hangul (Korean) |
49
|
|
|
|
|
|
|
0x1100 .. 0x11ff, # Hangul Jamo |
50
|
|
|
|
|
|
|
0x3130 .. 0x3185, # Hangul Compatibility Jamo |
51
|
|
|
|
|
|
|
0xA960 .. 0xA97F, # Hangul Jamo Extended-A |
52
|
|
|
|
|
|
|
0xAC00 .. 0xD7AF, # Hangul Syllables |
53
|
|
|
|
|
|
|
0xD7B0 .. 0xD7FF # Hangul Jamo Extended-B |
54
|
|
|
|
|
|
|
; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
my $CJ_HASHTAG_CHARS = join '', map { pack 'U*', $_ } |
57
|
|
|
|
|
|
|
0x30A1 .. 0x30FA, # Katakana (full-width) |
58
|
|
|
|
|
|
|
0xFF66 .. 0xFF9D, # Katakana (half-width) |
59
|
|
|
|
|
|
|
0xFF10 .. 0xFF19, # Latin (full-width) |
60
|
|
|
|
|
|
|
0xFF21 .. 0xFF3A, # Latin (full-width) |
61
|
|
|
|
|
|
|
0xFF41 .. 0xFF5A, # Latin (full-width) |
62
|
|
|
|
|
|
|
0x3041 .. 0x3096, # Hiragana |
63
|
|
|
|
|
|
|
0x3400 .. 0x4DBF, # Kanji (CJK Extension A) |
64
|
|
|
|
|
|
|
0x4E00 .. 0x9FFF, # Kanji (Unified) |
65
|
|
|
|
|
|
|
0x20000 .. 0x2A6DF, # Kanji (CJK Extension B) |
66
|
|
|
|
|
|
|
0x2A700 .. 0x2B73F, # Kanji (CJK Extension C) |
67
|
|
|
|
|
|
|
0x2B740 .. 0x2B81F, # Kanji (CJK Extension D) |
68
|
|
|
|
|
|
|
0x2F800 .. 0x2FA1F, # Kanji (CJK supplement) |
69
|
|
|
|
|
|
|
; |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
my $HASHTAG_BOUNDARY1 = qr/(?:\A|(?<=$REGEXEN{spaces}|「|」|。|\.|!))/; |
72
|
|
|
|
|
|
|
my $HASHTAG_BOUNDARY2 = qr/(?:\z|$REGEXEN{spaces}|「|」|。|\.|!)/; |
73
|
|
|
|
|
|
|
my $HASHTAG_ALPHA = "[a-zA-Z_$LATIN_ACCENTS$NON_LATIN_HASHTAG_CHARS$CJ_HASHTAG_CHARS]"; |
74
|
|
|
|
|
|
|
my $HASHTAG_ALPHANUMERIC = "[a-zA-Z0-9_$LATIN_ACCENTS$NON_LATIN_HASHTAG_CHARS$CJ_HASHTAG_CHARS]"; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
my $SLASHTAGS = qr/(?:by|cc|for|tip|thx|hat tip|ht|via)/i; |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# ------------------------------------------------------------------------- |
79
|
|
|
|
|
|
|
# Pattern definitions |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# user |
82
|
|
|
|
|
|
|
# "(/[a-zA-Z][a-zA-Z0-9_-]{0,24})?)" . |
83
|
|
|
|
|
|
|
pattern |
84
|
|
|
|
|
|
|
name => [ qw(microsyntax user) ], |
85
|
|
|
|
|
|
|
create => # @user must be at beginning of string, or not after a word char |
86
|
|
|
|
|
|
|
"(?:^|(?
|
87
|
|
|
|
|
|
|
# open main capture |
88
|
|
|
|
|
|
|
"(" . |
89
|
|
|
|
|
|
|
# at sigil (keep) |
90
|
|
|
|
|
|
|
"(?k:[$AT_SIGNS])" . |
91
|
|
|
|
|
|
|
# username (keep) |
92
|
|
|
|
|
|
|
"(?k:[a-zA-Z0-9_]{1,20})" . |
93
|
|
|
|
|
|
|
# close main capture |
94
|
|
|
|
|
|
|
")" . |
95
|
|
|
|
|
|
|
# @user must be at end of string, or not followed by a word char or at |
96
|
|
|
|
|
|
|
"(?=\$|[^a-zA-Z0-9_$AT_SIGNS$LATIN_ACCENTS])", |
97
|
|
|
|
|
|
|
; |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# hashtag |
100
|
|
|
|
|
|
|
pattern |
101
|
|
|
|
|
|
|
name => [ qw(microsyntax hashtag) ], |
102
|
|
|
|
|
|
|
create => # hashtag boundary condition |
103
|
|
|
|
|
|
|
$HASHTAG_BOUNDARY1 . |
104
|
|
|
|
|
|
|
# open main capture |
105
|
|
|
|
|
|
|
"(" . |
106
|
|
|
|
|
|
|
# hash sigil (keep) |
107
|
|
|
|
|
|
|
"(?k:[$HASH_SIGNS])" . |
108
|
|
|
|
|
|
|
# hashtag (keep) |
109
|
|
|
|
|
|
|
"(?k:$HASHTAG_ALPHANUMERIC*$HASHTAG_ALPHA$HASHTAG_ALPHANUMERIC*)" . |
110
|
|
|
|
|
|
|
# close main capture |
111
|
|
|
|
|
|
|
")" . |
112
|
|
|
|
|
|
|
# hashtag boundary condition (zero-width) |
113
|
|
|
|
|
|
|
"(?=$HASHTAG_BOUNDARY2)", |
114
|
|
|
|
|
|
|
; |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# grouptag |
117
|
|
|
|
|
|
|
pattern |
118
|
|
|
|
|
|
|
name => [ qw(microsyntax grouptag) ], |
119
|
|
|
|
|
|
|
create => # hashtag boundary condition |
120
|
|
|
|
|
|
|
$HASHTAG_BOUNDARY1 . |
121
|
|
|
|
|
|
|
# open main capture |
122
|
|
|
|
|
|
|
"(" . |
123
|
|
|
|
|
|
|
# exclamation sigil (keep) |
124
|
|
|
|
|
|
|
"(?k:[$EXCLAMATION_SIGNS])" . |
125
|
|
|
|
|
|
|
# grouptag (keep) |
126
|
|
|
|
|
|
|
# TODO: check what chars status.net allows in grouptags |
127
|
|
|
|
|
|
|
"(?k:$HASHTAG_ALPHANUMERIC*$HASHTAG_ALPHA$HASHTAG_ALPHANUMERIC*)" . |
128
|
|
|
|
|
|
|
# "(?k:[a-z0-9]+)" . |
129
|
|
|
|
|
|
|
# close main capture |
130
|
|
|
|
|
|
|
")" . |
131
|
|
|
|
|
|
|
# hashtag boundary condition (zero-width) |
132
|
|
|
|
|
|
|
"(?=$HASHTAG_BOUNDARY2)", |
133
|
|
|
|
|
|
|
; |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# slashtag |
136
|
|
|
|
|
|
|
pattern |
137
|
|
|
|
|
|
|
name => [ qw(microsyntax slashtag) ], |
138
|
|
|
|
|
|
|
create => # slashtag must be at beginning of string, or not after a word char |
139
|
|
|
|
|
|
|
# "(?:^|(?
|
140
|
|
|
|
|
|
|
# open main capture |
141
|
|
|
|
|
|
|
"(" . |
142
|
|
|
|
|
|
|
# slashtag (keep) |
143
|
|
|
|
|
|
|
"(?k:/?$SLASHTAGS)$REGEXEN{spaces}" . |
144
|
|
|
|
|
|
|
# @user (keep) |
145
|
|
|
|
|
|
|
"(?k:[$AT_SIGNS][a-zA-Z0-9_]{1,20}" . |
146
|
|
|
|
|
|
|
"(?:$REGEXEN{spaces}+[$AT_SIGNS][a-zA-Z0-9_]{1,20})*)" . |
147
|
|
|
|
|
|
|
# close main capture |
148
|
|
|
|
|
|
|
")" . |
149
|
|
|
|
|
|
|
# @user must be at end of string, or not followed by a word char or at |
150
|
|
|
|
|
|
|
"(?=\$|[^a-zA-Z0-9_$AT_SIGNS$LATIN_ACCENTS])", |
151
|
|
|
|
|
|
|
; |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
1; |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
__END__ |