File Coverage

XS.xs
Criterion Covered Total %
statement 71 71 100.0
branch 37 40 92.5
condition n/a
subroutine n/a
pod n/a
total 108 111 97.3


line stmt bran cond sub pod time code
1             /*
2             * Notes for the casual reader ...
3             *
4             * This is my first attempt at writing an XS module so it's probably not the
5             * finest example for a new XS coder to read. Of course if you do read it and
6             * have suggestions for improvements then please let me know.
7             *
8             * Unlike some XS modules, this one is not wrapping an existing library. All
9             * the C source is contained in this file, along with the XSUB definition.
10             *
11             * Although the XSUB layer allows automatic conversion between the data
12             * structures used by Perl variables (different types of SV) and native C types
13             * (like ints and character pointers) this module doesn't really take advantage
14             * of that. Instead, it takes an SV as input and returns an SV as output.
15             * This design decision was made in order to support the (premature/micro)
16             * optimisation whereby if the input SV contained all-ASCII characters, then
17             * the return value would be a pointer to the same SV, rather than needlessly
18             * making a copy of it.
19             *
20             * Copyright (C) 2014 by Grant McLean
21             *
22             */
23              
24             #include "EXTERN.h"
25             #include "perl.h"
26             #include "XSUB.h"
27             #include "ppport.h"
28              
29              
30             U8 _encoding_fix_latin_ms_map[] = {
31             0xE2, 0x82, 0xAC, 0x00, // 80 EURO SIGN
32             0x25, 0x38, 0x31, 0x00, // 81
33             0xE2, 0x80, 0x9A, 0x00, // 82 SINGLE LOW-9 QUOTATION MARK
34             0xC6, 0x92, 0x00, 0x00, // 83 LATIN SMALL LETTER F WITH HOOK
35             0xE2, 0x80, 0x9E, 0x00, // 84 DOUBLE LOW-9 QUOTATION MARK
36             0xE2, 0x80, 0xA6, 0x00, // 85 HORIZONTAL ELLIPSIS
37             0xE2, 0x80, 0xA0, 0x00, // 86 DAGGER
38             0xE2, 0x80, 0xA1, 0x00, // 87 DOUBLE DAGGER
39             0xCB, 0x86, 0x00, 0x00, // 88 MODIFIER LETTER CIRCUMFLEX ACCENT
40             0xE2, 0x80, 0xB0, 0x00, // 89 PER MILLE SIGN
41             0xC5, 0xA0, 0x00, 0x00, // 8A LATIN CAPITAL LETTER S WITH CARON
42             0xE2, 0x80, 0xB9, 0x00, // 8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
43             0xC5, 0x92, 0x00, 0x00, // 8C LATIN CAPITAL LIGATURE OE
44             0x25, 0x38, 0x44, 0x00, // 8D
45             0xC5, 0xBD, 0x00, 0x00, // 8E LATIN CAPITAL LETTER Z WITH CARON
46             0x25, 0x38, 0x46, 0x00, // 8F
47             0x25, 0x39, 0x30, 0x00, // 90
48             0xE2, 0x80, 0x98, 0x00, // 91 LEFT SINGLE QUOTATION MARK
49             0xE2, 0x80, 0x99, 0x00, // 92 RIGHT SINGLE QUOTATION MARK
50             0xE2, 0x80, 0x9C, 0x00, // 93 LEFT DOUBLE QUOTATION MARK
51             0xE2, 0x80, 0x9D, 0x00, // 94 RIGHT DOUBLE QUOTATION MARK
52             0xE2, 0x80, 0xA2, 0x00, // 95 BULLET
53             0xE2, 0x80, 0x93, 0x00, // 96 EN DASH
54             0xE2, 0x80, 0x94, 0x00, // 97 EM DASH
55             0xCB, 0x9C, 0x00, 0x00, // 98 SMALL TILDE
56             0xE2, 0x84, 0xA2, 0x00, // 99 TRADE MARK SIGN
57             0xC5, 0xA1, 0x00, 0x00, // 9A LATIN SMALL LETTER S WITH CARON
58             0xE2, 0x80, 0xBA, 0x00, // 9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
59             0xC5, 0x93, 0x00, 0x00, // 9C LATIN SMALL LIGATURE OE
60             0x25, 0x39, 0x44, 0x00, // 9D
61             0xC5, 0xBE, 0x00, 0x00, // 9E LATIN SMALL LETTER Z WITH CARON
62             0xC5, 0xB8, 0x00, 0x00, // 9F LATIN CAPITAL LETTER Y WITH DIAERESIS
63             0x00
64             };
65              
66              
67             static SV* _encoding_fix_latin_xs(SV*, int, int);
68             static int consume_utf8_bytes(U8*, U8*, int);
69             static int consume_latin_byte(U8*, U8*, int);
70              
71              
72 35           static SV* _encoding_fix_latin_xs(SV* source, int overlong_fatal, int ascii_hex) {
73 35           SV* out = NULL; // Defer initialisation until first non-ASCII character
74             U8 *ph, *pt;
75             U8 ubuf[8];
76             UV i, bytes, bytes_consumed;
77              
78             STRLEN l;
79 35 50         ph = pt = SvPV(source, l);
80 35           bytes = SvCUR(source);
81 218 100         for(i = 0; i < bytes; i++, ph++) {
82 184 100         if((*ph & 0x80) == 0)
83 140           continue;
84              
85 44 100         if(out == NULL) { // Deferred initialisation
86 27           out = newSV(bytes * 12 / 10); // Pre-allocate 20% more space
87 27           SvPOK_on(out);
88             }
89              
90             // Copy the ASCII byte sequence up to, but not including, the byte that
91             // we're currently pointing at
92 44 100         if(ph > pt) {
93 15           sv_catpvn(out, pt, (STRLEN)(ph - pt));
94             }
95              
96 44           bytes_consumed = consume_utf8_bytes(ph, ubuf, overlong_fatal);
97 43 100         if(!bytes_consumed) {
98 24           bytes_consumed = consume_latin_byte(ph, ubuf, ascii_hex);
99             }
100 43           sv_catpvn(out, ubuf, strlen(ubuf));
101 43           i += bytes_consumed - 1;
102 43           ph += bytes_consumed - 1;
103              
104 43           pt = ph + 1;
105             }
106              
107             // If the input was all ASCII, just return the input
108 34 100         if(out == NULL) {
109 8           return(source);
110             }
111              
112 26 100         if(ph > pt) {
113 6           sv_catpvn(out, pt, (STRLEN)(ph - pt));
114             }
115              
116 26           SvUTF8_on(out);
117              
118 34           return(sv_2mortal(out));
119             }
120              
121 44           static int consume_utf8_bytes(U8* in, U8* out, int overlong_fatal) {
122             UV cp, min_cp, bytes, i;
123             U8 *d, ebuf[8];
124             SV *exception;
125              
126 44 100         if((in[0] & 0xE0) == 0xC0) {
127 10           cp = in[0] & 0x1F;
128 10           bytes = 2;
129 10           min_cp = 0x80;
130             }
131 34 100         else if((in[0] & 0xF0) == 0xE0) {
132 11           cp = in[0] & 0x0F;
133 11           bytes = 3;
134 11           min_cp = 0x800;
135             }
136 23 100         else if((in[0] & 0xF8) == 0xF0) {
137 2           cp = in[0] & 0x07;
138 2           bytes = 4;
139 2           min_cp = 0x10000;
140             }
141 21 100         else if((in[0] & 0xFC) == 0xF8) {
142 1           cp = in[0] & 0x03;
143 1           bytes = 5;
144 1           min_cp = 0x200000;
145             }
146             else {
147 20           return(0);
148             }
149              
150 58 100         for(i = 1; i < bytes; i++) {
151 38 100         if((in[i] & 0xC0) != 0x80) {
152 4           return(0);
153             }
154 34           cp <<= 6;
155 34           cp += in[i] & 0x3F;
156             }
157              
158 20 100         if(overlong_fatal && cp < min_cp) {
    50          
159 1           exception = newSV(48);
160 1           SvPOK_on(exception);
161 1           sv_catpv(exception, "Over-long UTF-8 byte sequence:");
162 4 100         for(i = 0; i < bytes; i++) {
163 3           sprintf(ebuf, " %02X", (int)in[i]);
164 3           sv_catpv(exception, ebuf);
165             }
166 1           croak_sv(exception);
167             }
168 19           d = uvchr_to_utf8(out, cp);
169 19           *d = '\0';
170 43           return(bytes);
171             }
172              
173              
174 24           static int consume_latin_byte(U8* in, U8* out, int ascii_hex) {
175             U8 *d, *utf_bytes;
176              
177 24 100         if(in[0] > 0x9F) {
178 9           d = uvchr_to_utf8(out, (UV)in[0]);
179 9           *d = '\0';
180             }
181             else {
182 15           utf_bytes = _encoding_fix_latin_ms_map + (in[0] & 0x7F) * 4;
183 15 100         if(ascii_hex == 0 && *utf_bytes == '%') {
    50          
184 6           d = uvchr_to_utf8(out, (UV)in[0]);
185 6           *d = '\0';
186             }
187             else {
188 9           strncpy(out, utf_bytes, 4);
189             }
190             }
191 24           return(1);
192             }
193              
194              
MODULE = Encoding::FixLatin::XS PACKAGE = Encoding::FixLatin::XS

SV *
_fix_latin_xs(source, overlong_fatal, ascii_hex)
        SV *    source
        int     overlong_fatal
        int     ascii_hex
    PREINIT:
        SV *    fixed;
    PPCODE:
        /* Place the SV handed back by the C helper straight into the first
         * return slot; the helper returns either `source` itself or a
         * mortal SV that it created. */
        fixed = _encoding_fix_latin_xs(source, overlong_fatal, ascii_hex);
        ST(0) = fixed;
        XSRETURN(1);