| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Regexp::Common::microsyntax; |
|
2
|
|
|
|
|
|
|
|
|
3
|
6
|
|
|
6
|
|
424493
|
use strict; |
|
|
6
|
|
|
|
|
13
|
|
|
|
6
|
|
|
|
|
215
|
|
|
4
|
6
|
|
|
6
|
|
32
|
use warnings; |
|
|
6
|
|
|
|
|
14
|
|
|
|
6
|
|
|
|
|
172
|
|
|
5
|
6
|
|
|
6
|
|
1322
|
use utf8; |
|
|
6
|
|
|
|
|
24
|
|
|
|
6
|
|
|
|
|
45
|
|
|
6
|
6
|
|
|
6
|
|
2419
|
use Regexp::Common qw(pattern); |
|
|
6
|
|
|
|
|
7323
|
|
|
|
6
|
|
|
|
|
57
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '0.02'; |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# ------------------------------------------------------------------------- |
|
11
|
|
|
|
|
|
|
# Helper sets and regexes |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Registry of compiled helper regexes; entries such as {spaces} are added below.
my %REGEXEN = ();

# Sigil sets. Each value is spliced into a regex character class ("[...]"),
# so the fullwidth form of a sigil is written as a regex escape rather than as
# a literal character: listing the ASCII sigil twice adds nothing to the class,
# and an escape cannot be folded to ASCII by an encoding conversion.
my $AT_SIGNS          = '@\x{FF20}';    # COMMERCIAL AT, FULLWIDTH COMMERCIAL AT
my $HASH_SIGNS        = '#\x{FF03}';    # NUMBER SIGN, FULLWIDTH NUMBER SIGN
my $EXCLAMATION_SIGNS = '!';
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Code points treated as whitespace (Unicode White_Space), in ascending order.
my @space_code_points = (
    0x0009 .. 0x000D,    # Cc [5] control characters
    0x0020,              # Zs SPACE
    0x0085,              # Cc
    0x00A0,              # Zs NO-BREAK SPACE
    0x1680,              # Zs OGHAM SPACE MARK
    0x180E,              # Zs MONGOLIAN VOWEL SEPARATOR
    0x2000 .. 0x200A,    # Zs [11] EN QUAD..HAIR SPACE
    0x2028,              # Zl LINE SEPARATOR
    0x2029,              # Zp PARAGRAPH SEPARATOR
    0x202F,              # Zs NARROW NO-BREAK SPACE
    0x205F,              # Zs MEDIUM MATHEMATICAL SPACE
    0x3000,              # Zs IDEOGRAPHIC SPACE
);

# One alternation branch per whitespace character. The compiled form is stored
# as $REGEXEN{spaces} and matches exactly one such character.
my $UNICODE_SPACES = join '|', map { pack 'U*', $_ } @space_code_points;
$REGEXEN{spaces} = qr/$UNICODE_SPACES/o;
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Accented Latin letters, U+00C0 through U+00FF.
# Two symbols inside that span are deliberately skipped: U+00D7 (the
# multiplication sign, confusable with "x") and U+00F7 (the division sign).
# pack 'U*' turns the whole code point list into one character string.
my $LATIN_ACCENTS = pack(
    'U*',
    ( 0xc0 .. 0xd6 ),
    ( 0xd8 .. 0xf6 ),
    ( 0xf8 .. 0xff ),
);
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# Non-Latin letters accepted in hashtags, as regex character-class ranges
# (for example "\x{0400}-\x{04FF}"). The value is only ever spliced inside a
# "[...]" class, where a range matches the same code points as listing each
# one individually, without putting thousands of literal characters into the
# pattern source.
my $NON_LATIN_HASHTAG_CHARS = join '', map { sprintf '\x{%04X}-\x{%04X}', @{$_} } (
    # Cyrillic (Russian, Ukrainian, etc.)
    [ 0x0400, 0x04ff ],    # Cyrillic
    [ 0x0500, 0x0527 ],    # Cyrillic Supplement
    # Hangul (Korean)
    [ 0x1100, 0x11ff ],    # Hangul Jamo
    [ 0x3130, 0x3185 ],    # Hangul Compatibility Jamo
    [ 0xA960, 0xA97F ],    # Hangul Jamo Extended-A
    [ 0xAC00, 0xD7AF ],    # Hangul Syllables
    [ 0xD7B0, 0xD7FF ],    # Hangul Jamo Extended-B
);
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Chinese/Japanese characters (plus fullwidth Latin letters and digits)
# accepted in hashtags, as regex character-class ranges. The value is only
# ever spliced inside a "[...]" class, so ranges select the same code points
# as enumerating each one, while keeping the pattern source a few hundred
# bytes long instead of one literal character per code point.
my $CJ_HASHTAG_CHARS = join '', map { sprintf '\x{%04X}-\x{%04X}', @{$_} } (
    [ 0x30A1,  0x30FA ],     # Katakana (full-width)
    [ 0xFF66,  0xFF9D ],     # Katakana (half-width)
    [ 0xFF10,  0xFF19 ],     # Latin (full-width)
    [ 0xFF21,  0xFF3A ],     # Latin (full-width)
    [ 0xFF41,  0xFF5A ],     # Latin (full-width)
    [ 0x3041,  0x3096 ],     # Hiragana
    [ 0x3400,  0x4DBF ],     # Kanji (CJK Extension A)
    [ 0x4E00,  0x9FFF ],     # Kanji (Unified)
    [ 0x20000, 0x2A6DF ],    # Kanji (CJK Extension B)
    [ 0x2A700, 0x2B73F ],    # Kanji (CJK Extension C)
    [ 0x2B740, 0x2B81F ],    # Kanji (CJK Extension D)
    [ 0x2F800, 0x2FA1F ],    # Kanji (CJK supplement)
);
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Alternation shared by both tag boundaries: a whitespace character, a CJK
# corner bracket, an ideographic full stop, a period or an exclamation mark.
my $tag_edge = "$REGEXEN{spaces}" . '|「|」|。|\.|!';

# Left boundary (zero-width): start of string, or just after an edge character.
my $HASHTAG_BOUNDARY1 = qr/(?:\A|(?<=$tag_edge))/;
# Right boundary: end of string, or an edge character. This one consumes the
# character unless the caller wraps it in a lookahead.
my $HASHTAG_BOUNDARY2 = qr/(?:\z|$tag_edge)/;

# Character classes for tag bodies. Both allow ASCII letters, underscore and
# the three extended sets; only the alphanumeric one also admits ASCII digits.
my $tag_extra_chars      = join '', $LATIN_ACCENTS, $NON_LATIN_HASHTAG_CHARS, $CJ_HASHTAG_CHARS;
my $HASHTAG_ALPHA        = '[a-zA-Z_' . $tag_extra_chars . ']';
my $HASHTAG_ALPHANUMERIC = '[a-zA-Z0-9_' . $tag_extra_chars . ']';

# Keywords that introduce a slashtag, matched case-insensitively.
my $slashtag_words = join '|', 'by', 'cc', 'for', 'tip', 'thx', 'hat tip', 'ht', 'via';
my $SLASHTAGS      = qr/(?:$slashtag_words)/i;
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# ------------------------------------------------------------------------- |
|
79
|
|
|
|
|
|
|
# Pattern definitions |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# user
# Registers $RE{microsyntax}{user}: an @-mention such as "@name" (sigil from
# $AT_SIGNS followed by 1-20 ASCII word characters). The outer group captures
# the whole mention; the (?k:...) groups are Regexp::Common keep groups for
# the sigil and the username.
# "(/[a-zA-Z][a-zA-Z0-9_-]{0,24})?)" .
pattern
    name   => [ qw(microsyntax user) ],
    create =>
        # @user must be at beginning of string, or not after a word char
        # NOTE(review): this fragment was an unterminated string in the file;
        # the lookbehind is rebuilt from the comment above - confirm against
        # the upstream module.
        "(?:^|(?<![a-zA-Z0-9_]))" .
        # open main capture
        "(" .
        # at sigil (keep)
        "(?k:[$AT_SIGNS])" .
        # username (keep)
        "(?k:[a-zA-Z0-9_]{1,20})" .
        # close main capture
        ")" .
        # @user must be at end of string, or not followed by a word char or at
        "(?=\$|[^a-zA-Z0-9_$AT_SIGNS$LATIN_ACCENTS])",
    ;
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# hashtag
# Registers $RE{microsyntax}{hashtag}. The pattern source is assembled from
# the fragments below, in order; (?k:...) marks Regexp::Common keep groups.
pattern
    name   => [ qw(microsyntax hashtag) ],
    create => join(
        '',
        $HASHTAG_BOUNDARY1,           # left boundary condition
        "(",                          # open main capture
        "(?k:[$HASH_SIGNS])",         # hash sigil (keep)
        # tag body (keep): must contain at least one non-digit tag character
        "(?k:$HASHTAG_ALPHANUMERIC*$HASHTAG_ALPHA$HASHTAG_ALPHANUMERIC*)",
        ")",                          # close main capture
        "(?=$HASHTAG_BOUNDARY2)",     # right boundary condition (zero-width)
    ),
    ;
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# grouptag
# Registers $RE{microsyntax}{grouptag}: same shape as the hashtag pattern but
# introduced by a sigil from $EXCLAMATION_SIGNS. (?k:...) marks
# Regexp::Common keep groups.
pattern
    name   => [ qw(microsyntax grouptag) ],
    create => join(
        '',
        # left boundary condition (shared with hashtags)
        $HASHTAG_BOUNDARY1,
        # open main capture
        "(",
        # exclamation sigil (keep)
        "(?k:[$EXCLAMATION_SIGNS])",
        # grouptag body (keep)
        # TODO: check what chars status.net allows in grouptags
        "(?k:$HASHTAG_ALPHANUMERIC*$HASHTAG_ALPHA$HASHTAG_ALPHANUMERIC*)",
        # stricter alternative body, not in use: "(?k:[a-z0-9]+)"
        # close main capture
        ")",
        # right boundary condition (zero-width)
        "(?=$HASHTAG_BOUNDARY2)",
    ),
    ;
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# slashtag
# Registers $RE{microsyntax}{slashtag}: a keyword from $SLASHTAGS (optionally
# preceded by "/"), one whitespace character, then one or more @-mentions
# separated by whitespace. (?k:...) marks Regexp::Common keep groups.
#
# Intended left boundary: start of string, or not after a word char. The
# fragment for that guard is commented out in this file, so no left-hand
# boundary is enforced.
pattern
    name   => [ qw(microsyntax slashtag) ],
    create => join(
        '',
        # open main capture
        "(",
        # slashtag keyword (keep), followed by a single whitespace character
        "(?k:/?$SLASHTAGS)$REGEXEN{spaces}",
        # @user list (keep): first mention ...
        "(?k:[$AT_SIGNS][a-zA-Z0-9_]{1,20}",
        # ... then any number of further whitespace-separated mentions
        "(?:$REGEXEN{spaces}+[$AT_SIGNS][a-zA-Z0-9_]{1,20})*)",
        # close main capture
        ")",
        # @user must be at end of string, or not followed by a word char or at
        "(?=\$|[^a-zA-Z0-9_$AT_SIGNS$LATIN_ACCENTS])",
    ),
    ;
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
1; |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
__END__ |