| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
/* |
|
2
|
|
|
|
|
|
|
* Notes for the casual reader ... |
|
3
|
|
|
|
|
|
|
* |
|
4
|
|
|
|
|
|
|
* This is my first attempt at writing an XS module so it's probably not the |
|
5
|
|
|
|
|
|
|
* finest example for a new XS coder to read. Of course if you do read it and |
|
6
|
|
|
|
|
|
|
* have suggestions for improvements then please let me know. |
|
7
|
|
|
|
|
|
|
* |
|
8
|
|
|
|
|
|
|
* Unlike some XS modules, this one is not wrapping an existing library. All |
|
9
|
|
|
|
|
|
|
* the C source is contained in this file, along with the XSUB definition. |
|
10
|
|
|
|
|
|
|
* |
|
11
|
|
|
|
|
|
|
* Although the XSUB layer allows automatic conversion between the data |
|
12
|
|
|
|
|
|
|
* structures used by Perl variables (different types of SV) and native C types |
|
13
|
|
|
|
|
|
|
* (like ints and character pointers) this module doesn't really take advantage |
|
14
|
|
|
|
|
|
|
* of that. Instead, it takes an SV as input and returns an SV as output. |
|
15
|
|
|
|
|
|
|
* This design decision was made in order to support the (premature/micro) |
|
16
|
|
|
|
|
|
|
* optimisation whereby if the input SV contained all-ASCII characters, then |
|
17
|
|
|
|
|
|
|
* the return value would be a pointer to the same SV, rather than needlessly |
|
18
|
|
|
|
|
|
|
* making a copy of it. |
|
19
|
|
|
|
|
|
|
* |
|
20
|
|
|
|
|
|
|
* Copyright (C) 2014 by Grant McLean |
|
21
|
|
|
|
|
|
|
* |
|
22
|
|
|
|
|
|
|
*/ |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#include "EXTERN.h" |
|
25
|
|
|
|
|
|
|
#include "perl.h" |
|
26
|
|
|
|
|
|
|
#include "XSUB.h" |
|
27
|
|
|
|
|
|
|
#include "ppport.h" |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
U8 _encoding_fix_latin_ms_map[] = { |
|
31
|
|
|
|
|
|
|
0xE2, 0x82, 0xAC, 0x00, // 80 EURO SIGN |
|
32
|
|
|
|
|
|
|
0x25, 0x38, 0x31, 0x00, // 81 |
|
33
|
|
|
|
|
|
|
0xE2, 0x80, 0x9A, 0x00, // 82 SINGLE LOW-9 QUOTATION MARK |
|
34
|
|
|
|
|
|
|
0xC6, 0x92, 0x00, 0x00, // 83 LATIN SMALL LETTER F WITH HOOK |
|
35
|
|
|
|
|
|
|
0xE2, 0x80, 0x9E, 0x00, // 84 DOUBLE LOW-9 QUOTATION MARK |
|
36
|
|
|
|
|
|
|
0xE2, 0x80, 0xA6, 0x00, // 85 HORIZONTAL ELLIPSIS |
|
37
|
|
|
|
|
|
|
0xE2, 0x80, 0xA0, 0x00, // 86 DAGGER |
|
38
|
|
|
|
|
|
|
0xE2, 0x80, 0xA1, 0x00, // 87 DOUBLE DAGGER |
|
39
|
|
|
|
|
|
|
0xCB, 0x86, 0x00, 0x00, // 88 MODIFIER LETTER CIRCUMFLEX ACCENT |
|
40
|
|
|
|
|
|
|
0xE2, 0x80, 0xB0, 0x00, // 89 PER MILLE SIGN |
|
41
|
|
|
|
|
|
|
0xC5, 0xA0, 0x00, 0x00, // 8A LATIN CAPITAL LETTER S WITH CARON |
|
42
|
|
|
|
|
|
|
0xE2, 0x80, 0xB9, 0x00, // 8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
|
43
|
|
|
|
|
|
|
0xC5, 0x92, 0x00, 0x00, // 8C LATIN CAPITAL LIGATURE OE |
|
44
|
|
|
|
|
|
|
0x25, 0x38, 0x44, 0x00, // 8D |
|
45
|
|
|
|
|
|
|
0xC5, 0xBD, 0x00, 0x00, // 8E LATIN CAPITAL LETTER Z WITH CARON |
|
46
|
|
|
|
|
|
|
0x25, 0x38, 0x46, 0x00, // 8F |
|
47
|
|
|
|
|
|
|
0x25, 0x39, 0x30, 0x00, // 90 |
|
48
|
|
|
|
|
|
|
0xE2, 0x80, 0x98, 0x00, // 91 LEFT SINGLE QUOTATION MARK |
|
49
|
|
|
|
|
|
|
0xE2, 0x80, 0x99, 0x00, // 92 RIGHT SINGLE QUOTATION MARK |
|
50
|
|
|
|
|
|
|
0xE2, 0x80, 0x9C, 0x00, // 93 LEFT DOUBLE QUOTATION MARK |
|
51
|
|
|
|
|
|
|
0xE2, 0x80, 0x9D, 0x00, // 94 RIGHT DOUBLE QUOTATION MARK |
|
52
|
|
|
|
|
|
|
0xE2, 0x80, 0xA2, 0x00, // 95 BULLET |
|
53
|
|
|
|
|
|
|
0xE2, 0x80, 0x93, 0x00, // 96 EN DASH |
|
54
|
|
|
|
|
|
|
0xE2, 0x80, 0x94, 0x00, // 97 EM DASH |
|
55
|
|
|
|
|
|
|
0xCB, 0x9C, 0x00, 0x00, // 98 SMALL TILDE |
|
56
|
|
|
|
|
|
|
0xE2, 0x84, 0xA2, 0x00, // 99 TRADE MARK SIGN |
|
57
|
|
|
|
|
|
|
0xC5, 0xA1, 0x00, 0x00, // 9A LATIN SMALL LETTER S WITH CARON |
|
58
|
|
|
|
|
|
|
0xE2, 0x80, 0xBA, 0x00, // 9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
|
59
|
|
|
|
|
|
|
0xC5, 0x93, 0x00, 0x00, // 9C LATIN SMALL LIGATURE OE |
|
60
|
|
|
|
|
|
|
0x25, 0x39, 0x44, 0x00, // 9D |
|
61
|
|
|
|
|
|
|
0xC5, 0xBE, 0x00, 0x00, // 9E LATIN SMALL LETTER Z WITH CARON |
|
62
|
|
|
|
|
|
|
0xC5, 0xB8, 0x00, 0x00, // 9F LATIN CAPITAL LETTER Y WITH DIAERESIS |
|
63
|
|
|
|
|
|
|
0x00 |
|
64
|
|
|
|
|
|
|
}; |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
static SV* _encoding_fix_latin_xs(SV*, int, int); |
|
68
|
|
|
|
|
|
|
static int consume_utf8_bytes(U8*, U8*, int); |
|
69
|
|
|
|
|
|
|
static int consume_latin_byte(U8*, U8*, int); |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
|
72
|
35
|
|
|
|
|
|
static SV* _encoding_fix_latin_xs(SV* source, int overlong_fatal, int ascii_hex) { |
|
73
|
35
|
|
|
|
|
|
SV* out = NULL; // Defer initialisation until first non-ASCII character |
|
74
|
|
|
|
|
|
|
U8 *ph, *pt; |
|
75
|
|
|
|
|
|
|
U8 ubuf[8]; |
|
76
|
|
|
|
|
|
|
UV i, bytes, bytes_consumed; |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
STRLEN l; |
|
79
|
35
|
50
|
|
|
|
|
ph = pt = SvPV(source, l); |
|
80
|
35
|
|
|
|
|
|
bytes = SvCUR(source); |
|
81
|
218
|
100
|
|
|
|
|
for(i = 0; i < bytes; i++, ph++) { |
|
82
|
184
|
100
|
|
|
|
|
if((*ph & 0x80) == 0) |
|
83
|
140
|
|
|
|
|
|
continue; |
|
84
|
|
|
|
|
|
|
|
|
85
|
44
|
100
|
|
|
|
|
if(out == NULL) { // Deferred initialisation |
|
86
|
27
|
|
|
|
|
|
out = newSV(bytes * 12 / 10); // Pre-allocate 20% more space |
|
87
|
27
|
|
|
|
|
|
SvPOK_on(out); |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
// Copy the ASCII byte sequence up to, but not including, the byte that |
|
91
|
|
|
|
|
|
|
// we're currently pointing at |
|
92
|
44
|
100
|
|
|
|
|
if(ph > pt) { |
|
93
|
15
|
|
|
|
|
|
sv_catpvn(out, pt, (STRLEN)(ph - pt)); |
|
94
|
|
|
|
|
|
|
} |
|
95
|
|
|
|
|
|
|
|
|
96
|
44
|
|
|
|
|
|
bytes_consumed = consume_utf8_bytes(ph, ubuf, overlong_fatal); |
|
97
|
43
|
100
|
|
|
|
|
if(!bytes_consumed) { |
|
98
|
24
|
|
|
|
|
|
bytes_consumed = consume_latin_byte(ph, ubuf, ascii_hex); |
|
99
|
|
|
|
|
|
|
} |
|
100
|
43
|
|
|
|
|
|
sv_catpvn(out, ubuf, strlen(ubuf)); |
|
101
|
43
|
|
|
|
|
|
i += bytes_consumed - 1; |
|
102
|
43
|
|
|
|
|
|
ph += bytes_consumed - 1; |
|
103
|
|
|
|
|
|
|
|
|
104
|
43
|
|
|
|
|
|
pt = ph + 1; |
|
105
|
|
|
|
|
|
|
} |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
// If the input was all ASCII, just return the input |
|
108
|
34
|
100
|
|
|
|
|
if(out == NULL) { |
|
109
|
8
|
|
|
|
|
|
return(source); |
|
110
|
|
|
|
|
|
|
} |
|
111
|
|
|
|
|
|
|
|
|
112
|
26
|
100
|
|
|
|
|
if(ph > pt) { |
|
113
|
6
|
|
|
|
|
|
sv_catpvn(out, pt, (STRLEN)(ph - pt)); |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
|
|
116
|
26
|
|
|
|
|
|
SvUTF8_on(out); |
|
117
|
|
|
|
|
|
|
|
|
118
|
34
|
|
|
|
|
|
return(sv_2mortal(out)); |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
|
|
121
|
44
|
|
|
|
|
|
static int consume_utf8_bytes(U8* in, U8* out, int overlong_fatal) { |
|
122
|
|
|
|
|
|
|
UV cp, min_cp, bytes, i; |
|
123
|
|
|
|
|
|
|
U8 *d, ebuf[8]; |
|
124
|
|
|
|
|
|
|
SV *exception; |
|
125
|
|
|
|
|
|
|
|
|
126
|
44
|
100
|
|
|
|
|
if((in[0] & 0xE0) == 0xC0) { |
|
127
|
10
|
|
|
|
|
|
cp = in[0] & 0x1F; |
|
128
|
10
|
|
|
|
|
|
bytes = 2; |
|
129
|
10
|
|
|
|
|
|
min_cp = 0x80; |
|
130
|
|
|
|
|
|
|
} |
|
131
|
34
|
100
|
|
|
|
|
else if((in[0] & 0xF0) == 0xE0) { |
|
132
|
11
|
|
|
|
|
|
cp = in[0] & 0x0F; |
|
133
|
11
|
|
|
|
|
|
bytes = 3; |
|
134
|
11
|
|
|
|
|
|
min_cp = 0x800; |
|
135
|
|
|
|
|
|
|
} |
|
136
|
23
|
100
|
|
|
|
|
else if((in[0] & 0xF8) == 0xF0) { |
|
137
|
2
|
|
|
|
|
|
cp = in[0] & 0x07; |
|
138
|
2
|
|
|
|
|
|
bytes = 4; |
|
139
|
2
|
|
|
|
|
|
min_cp = 0x10000; |
|
140
|
|
|
|
|
|
|
} |
|
141
|
21
|
100
|
|
|
|
|
else if((in[0] & 0xFC) == 0xF8) { |
|
142
|
1
|
|
|
|
|
|
cp = in[0] & 0x03; |
|
143
|
1
|
|
|
|
|
|
bytes = 5; |
|
144
|
1
|
|
|
|
|
|
min_cp = 0x200000; |
|
145
|
|
|
|
|
|
|
} |
|
146
|
|
|
|
|
|
|
else { |
|
147
|
20
|
|
|
|
|
|
return(0); |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
58
|
100
|
|
|
|
|
for(i = 1; i < bytes; i++) { |
|
151
|
38
|
100
|
|
|
|
|
if((in[i] & 0xC0) != 0x80) { |
|
152
|
4
|
|
|
|
|
|
return(0); |
|
153
|
|
|
|
|
|
|
} |
|
154
|
34
|
|
|
|
|
|
cp <<= 6; |
|
155
|
34
|
|
|
|
|
|
cp += in[i] & 0x3F; |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
|
|
158
|
20
|
100
|
|
|
|
|
if(overlong_fatal && cp < min_cp) { |
|
|
|
50
|
|
|
|
|
|
|
159
|
1
|
|
|
|
|
|
exception = newSV(48); |
|
160
|
1
|
|
|
|
|
|
SvPOK_on(exception); |
|
161
|
1
|
|
|
|
|
|
sv_catpv(exception, "Over-long UTF-8 byte sequence:"); |
|
162
|
4
|
100
|
|
|
|
|
for(i = 0; i < bytes; i++) { |
|
163
|
3
|
|
|
|
|
|
sprintf(ebuf, " %02X", (int)in[i]); |
|
164
|
3
|
|
|
|
|
|
sv_catpv(exception, ebuf); |
|
165
|
|
|
|
|
|
|
} |
|
166
|
1
|
|
|
|
|
|
croak_sv(exception); |
|
167
|
|
|
|
|
|
|
} |
|
168
|
19
|
|
|
|
|
|
d = uvchr_to_utf8(out, cp); |
|
169
|
19
|
|
|
|
|
|
*d = '\0'; |
|
170
|
43
|
|
|
|
|
|
return(bytes); |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
|
174
|
24
|
|
|
|
|
|
static int consume_latin_byte(U8* in, U8* out, int ascii_hex) { |
|
175
|
|
|
|
|
|
|
U8 *d, *utf_bytes; |
|
176
|
|
|
|
|
|
|
|
|
177
|
24
|
100
|
|
|
|
|
if(in[0] > 0x9F) { |
|
178
|
9
|
|
|
|
|
|
d = uvchr_to_utf8(out, (UV)in[0]); |
|
179
|
9
|
|
|
|
|
|
*d = '\0'; |
|
180
|
|
|
|
|
|
|
} |
|
181
|
|
|
|
|
|
|
else { |
|
182
|
15
|
|
|
|
|
|
utf_bytes = _encoding_fix_latin_ms_map + (in[0] & 0x7F) * 4; |
|
183
|
15
|
100
|
|
|
|
|
if(ascii_hex == 0 && *utf_bytes == '%') { |
|
|
|
50
|
|
|
|
|
|
|
184
|
6
|
|
|
|
|
|
d = uvchr_to_utf8(out, (UV)in[0]); |
|
185
|
6
|
|
|
|
|
|
*d = '\0'; |
|
186
|
|
|
|
|
|
|
} |
|
187
|
|
|
|
|
|
|
else { |
|
188
|
9
|
|
|
|
|
|
strncpy(out, utf_bytes, 4); |
|
189
|
|
|
|
|
|
|
} |
|
190
|
|
|
|
|
|
|
} |
|
191
|
24
|
|
|
|
|
|
return(1); |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
MODULE = Encoding::FixLatin::XS PACKAGE = Encoding::FixLatin::XS |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
SV * |
|
198
|
|
|
|
|
|
|
_fix_latin_xs(source, overlong_fatal, ascii_hex) |
|
199
|
|
|
|
|
|
|
SV * source |
|
200
|
|
|
|
|
|
|
int overlong_fatal |
|
201
|
|
|
|
|
|
|
int ascii_hex |
|
202
|
|
|
|
|
|
|
PPCODE: |
|
203
|
35
|
|
|
|
|
|
ST(0) = _encoding_fix_latin_xs(source, overlong_fatal, ascii_hex); |
|
204
|
34
|
|
|
|
|
|
XSRETURN(1); |