File Coverage

blib/lib/Regexp/Common/microsyntax.pm
Criterion Covered Total %
statement 12 12 100.0
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 16 16 100.0


line stmt bran cond sub pod time code
1             package Regexp::Common::microsyntax;
2              
3 6     6   424493 use strict;
  6         13  
  6         215  
4 6     6   32 use warnings;
  6         14  
  6         172  
5 6     6   1322 use utf8;
  6         24  
  6         45  
6 6     6   2419 use Regexp::Common qw(pattern);
  6         7323  
  6         57  
7              
8             our $VERSION = '0.02';
9              
10             # -------------------------------------------------------------------------
11             # Helper sets and regexes
12              
13             my %REGEXEN = ();
14              
15             my $AT_SIGNS = '@@';
16             my $HASH_SIGNS = '##';
17             my $EXCLAMATION_SIGNS = '!';
18              
19             my $UNICODE_SPACES = join '|', map { pack 'U*', $_ }
20             0x0009 .. 0x000D, # White_Space # Cc [5] ..
21             0x0020, # White_Space # Zs SPACE
22             0x0085, # White_Space # Cc
23             0x00A0, # White_Space # Zs NO-BREAK SPACE
24             0x1680, # White_Space # Zs OGHAM SPACE MARK
25             0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
26             0x2000 .. 0x200A, # White_Space # Zs [11] EN QUAD..HAIR SPACE
27             0x2028, # White_Space # Zl LINE SEPARATOR
28             0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
29             0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
30             0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
31             0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
32             ;
33             $REGEXEN{spaces} = qr/$UNICODE_SPACES/o;
34              
35             # Latin accented characters
36             # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
37             # Also excludes 0xf7, the division sign
38             my $LATIN_ACCENTS = join '', map { pack 'U*', $_ }
39             0xc0 .. 0xd6,
40             0xd8 .. 0xf6,
41             0xf8 .. 0xff,
42             ;
43              
44             my $NON_LATIN_HASHTAG_CHARS = join '', map { pack 'U*', $_ }
45             # Cyrillic (Russian, Ukrainian, etc.)
46             0x0400 .. 0x04ff, # Cyrillic
47             0x0500 .. 0x0527, # Cyrillic Supplement
48             # Hangul (Korean)
49             0x1100 .. 0x11ff, # Hangul Jamo
50             0x3130 .. 0x3185, # Hangul Compatibility Jamo
51             0xA960 .. 0xA97F, # Hangul Jamo Extended-A
52             0xAC00 .. 0xD7AF, # Hangul Syllables
53             0xD7B0 .. 0xD7FF # Hangul Jamo Extended-B
54             ;
55              
56             my $CJ_HASHTAG_CHARS = join '', map { pack 'U*', $_ }
57             0x30A1 .. 0x30FA, # Katakana (full-width)
58             0xFF66 .. 0xFF9D, # Katakana (half-width)
59             0xFF10 .. 0xFF19, # Latin (full-width)
60             0xFF21 .. 0xFF3A, # Latin (full-width)
61             0xFF41 .. 0xFF5A, # Latin (full-width)
62             0x3041 .. 0x3096, # Hiragana
63             0x3400 .. 0x4DBF, # Kanji (CJK Extension A)
64             0x4E00 .. 0x9FFF, # Kanji (Unified)
65             0x20000 .. 0x2A6DF, # Kanji (CJK Extension B)
66             0x2A700 .. 0x2B73F, # Kanji (CJK Extension C)
67             0x2B740 .. 0x2B81F, # Kanji (CJK Extension D)
68             0x2F800 .. 0x2FA1F, # Kanji (CJK supplement)
69             ;
70              
71             my $HASHTAG_BOUNDARY1 = qr/(?:\A|(?<=$REGEXEN{spaces}|「|」|。|\.|!))/;
72             my $HASHTAG_BOUNDARY2 = qr/(?:\z|$REGEXEN{spaces}|「|」|。|\.|!)/;
73             my $HASHTAG_ALPHA = "[a-zA-Z_$LATIN_ACCENTS$NON_LATIN_HASHTAG_CHARS$CJ_HASHTAG_CHARS]";
74             my $HASHTAG_ALPHANUMERIC = "[a-zA-Z0-9_$LATIN_ACCENTS$NON_LATIN_HASHTAG_CHARS$CJ_HASHTAG_CHARS]";
75              
76             my $SLASHTAGS = qr/(?:by|cc|for|tip|thx|hat tip|ht|via)/i;
77              
78             # -------------------------------------------------------------------------
79             # Pattern definitions
80              
81             # user
82             # "(/[a-zA-Z][a-zA-Z0-9_-]{0,24})?)" .
83             pattern
84             name => [ qw(microsyntax user) ],
85             create => # @user must be at beginning of string, or not after a word char
86             "(?:^|(?
87             # open main capture
88             "(" .
89             # at sigil (keep)
90             "(?k:[$AT_SIGNS])" .
91             # username (keep)
92             "(?k:[a-zA-Z0-9_]{1,20})" .
93             # close main capture
94             ")" .
95             # @user must be at end of string, or not followed by a word char or at
96             "(?=\$|[^a-zA-Z0-9_$AT_SIGNS$LATIN_ACCENTS])",
97             ;
98              
99             # hashtag
100             pattern
101             name => [ qw(microsyntax hashtag) ],
102             create => # hashtag boundary condition
103             $HASHTAG_BOUNDARY1 .
104             # open main capture
105             "(" .
106             # hash sigil (keep)
107             "(?k:[$HASH_SIGNS])" .
108             # hashtag (keep)
109             "(?k:$HASHTAG_ALPHANUMERIC*$HASHTAG_ALPHA$HASHTAG_ALPHANUMERIC*)" .
110             # close main capture
111             ")" .
112             # hashtag boundary condition (zero-width)
113             "(?=$HASHTAG_BOUNDARY2)",
114             ;
115              
116             # grouptag
117             pattern
118             name => [ qw(microsyntax grouptag) ],
119             create => # hashtag boundary condition
120             $HASHTAG_BOUNDARY1 .
121             # open main capture
122             "(" .
123             # exclamation sigil (keep)
124             "(?k:[$EXCLAMATION_SIGNS])" .
125             # grouptag (keep)
126             # TODO: check what chars status.net allows in grouptags
127             "(?k:$HASHTAG_ALPHANUMERIC*$HASHTAG_ALPHA$HASHTAG_ALPHANUMERIC*)" .
128             # "(?k:[a-z0-9]+)" .
129             # close main capture
130             ")" .
131             # hashtag boundary condition (zero-width)
132             "(?=$HASHTAG_BOUNDARY2)",
133             ;
134              
135             # slashtag
136             pattern
137             name => [ qw(microsyntax slashtag) ],
138             create => # slashtag must be at beginning of string, or not after a word char
139             # "(?:^|(?
140             # open main capture
141             "(" .
142             # slashtag (keep)
143             "(?k:/?$SLASHTAGS)$REGEXEN{spaces}" .
144             # @user (keep)
145             "(?k:[$AT_SIGNS][a-zA-Z0-9_]{1,20}" .
146             "(?:$REGEXEN{spaces}+[$AT_SIGNS][a-zA-Z0-9_]{1,20})*)" .
147             # close main capture
148             ")" .
149             # @user must be at end of string, or not followed by a word char or at
150             "(?=\$|[^a-zA-Z0-9_$AT_SIGNS$LATIN_ACCENTS])",
151             ;
152              
153             1;
154              
155             __END__