File Coverage

encode.c
Criterion Covered Total %
statement 0 36 0.0
branch 0 28 0.0
condition n/a
subroutine n/a
pod n/a
total 0 64 0.0


line stmt bran cond sub pod time code
1             /*
2             ** 2002 April 25
3             **
4             ** The author disclaims copyright to this source code. In place of
5             ** a legal notice, here is a blessing:
6             **
7             ** May you do good and not evil.
8             ** May you find forgiveness for yourself and forgive others.
9             ** May you share freely, never taking more than you give.
10             **
11             *************************************************************************
12             ** This file contains helper routines used to translate binary data into
13             ** a null-terminated string (suitable for use in SQLite) and back again.
14             ** These are convenience routines for use by people who want to store binary
15             ** data in an SQLite database. The code in this file is not used by any other
16             ** part of the SQLite library.
17             **
18             ** $Id: encode.c,v 1.1.1.1 2004/08/08 15:03:57 matt Exp $
19             */
20             #include <string.h>
21             #include <assert.h>
22              
23             /*
24             ** How This Encoder Works
25             **
26             ** The output is allowed to contain any character except 0x27 (') and
27             ** 0x00. This is accomplished by using an escape character to encode
28             ** 0x27 and 0x00 as a two-byte sequence. The escape character is always
29             ** 0x01. An 0x00 is encoded as the two byte sequence 0x01 0x01. The
30             ** 0x27 character is encoded as the two byte sequence 0x01 0x28. Finally,
31             ** the escape character itself is encoded as the two-character sequence
32             ** 0x01 0x02.
33             **
34             ** To summarize, the encoder works by using escape sequences as follows:
35             **
36             ** 0x00 -> 0x01 0x01
37             ** 0x01 -> 0x01 0x02
38             ** 0x27 -> 0x01 0x28
39             **
40             ** If that were all the encoder did, it would work, but in certain cases
41             ** it could double the size of the encoded string. For example, to
42             ** encode a string of 100 0x27 characters would require 100 instances of
43             ** the 0x01 0x28 escape sequence resulting in a 200-character output.
44             ** We would prefer to keep the size of the encoded string smaller than
45             ** this.
46             **
47             ** To minimize the encoding size, we first subtract a fixed offset value from
48             ** each byte in the sequence. The subtraction is modulo 256. (That is to say,
49             ** if the original character value is less than the offset, the difference
50             ** wraps around into the range 0x00 through 0xff.) The offset is chosen to
51             ** minimize the number of characters in the string that need to be escaped.
52             ** For example, in the case above where the string was composed of 100 0x27
53             ** characters, the offset might be 0x01. Each of the 0x27 characters would
54             ** then be converted into an 0x26 character which would not need to be
55             ** escaped at all and so the 100 character input string would be converted
56             ** into just 100 characters of output. Actually 101 characters of output -
57             ** we have to record the offset used as the first byte in the sequence so
58             ** that the string can be decoded. Since the offset value is stored as
59             ** part of the output string and the output string is not allowed to contain
60             ** characters 0x00 or 0x27, the offset cannot be 0x00 or 0x27.
61             **
62             ** Here, then, are the encoding steps:
63             **
64             ** (1) Choose an offset value and make it the first character of
65             ** output.
66             **
67             ** (2) Copy each input character into the output buffer, one by
68             ** one, subtracting the offset value as you copy.
69             **
70             ** (3) If the value of an input character minus offset is 0x00, replace
71             ** that one character by the two-character sequence 0x01 0x01.
72             ** If the difference is 0x01, replace it with 0x01 0x02. If the
73             ** difference is 0x27, replace it with 0x01 0x28.
74             **
75             ** (4) Put a 0x00 terminator at the end of the output.
76             **
77             ** Decoding is obvious:
78             **
79             ** (5) Copy encoded characters except the first into the decode
80             ** buffer. Set the first encoded character aside for use as
81             ** the offset in step 7 below.
82             **
83             ** (6) Convert each 0x01 0x01 sequence into a single character 0x00.
84             ** Convert 0x01 0x02 into 0x01. Convert 0x01 0x28 into 0x27.
85             **
86             ** (7) Add the offset value that was the first character of
87             ** the encoded buffer to all characters in the output buffer.
88             **
89             ** The only tricky part is step (1) - how to compute an offset value to
90             ** minimize the size of the output buffer. This is accomplished by testing
91             ** all offset values and picking the one that results in the fewest number
92             ** of escapes. To do that, we first scan the entire input and count the
93             ** number of occurrences of each character value in the input. Suppose
94             ** the number of 0x00 characters is N(0), the number of occurrences of 0x01
95             ** is N(1), and so forth up to the number of occurrences of 0xff is N(255).
96             ** An offset of 0 is not allowed so we don't have to test it. The number
97             ** of escapes required for an offset of 1 is N(1)+N(2)+N(40). The number
98             ** of escapes required for an offset of 2 is N(2)+N(3)+N(41). And so forth.
99             ** In this way we find the offset that gives the minimum number of escapes,
100             ** and thus minimizes the length of the output string.
101             */
102              
103             /*
104             ** Encode a binary buffer "in" of size n bytes so that it contains
105             ** no instances of characters '\'' or '\000'. The output is
106             ** null-terminated and can be used as a string value in an INSERT
107             ** or UPDATE statement. Use sqlite_decode_binary() to convert the
108             ** string back into its original binary.
109             **
110             ** The result is written into a preallocated output buffer "out".
111             ** "out" must be able to hold at least 2 +(257*n)/254 bytes.
112             ** In other words, the output will be expanded by as much as 3
113             ** bytes for every 254 bytes of input plus 2 bytes of fixed overhead.
114             ** (This is approximately 2 + 1.0118*n or about a 1.2% size increase.)
115             **
116             ** The return value is the number of characters in the encoded
117             ** string, excluding the "\000" terminator.
118             **
119             ** If out==NULL then no output is generated but the routine still returns
120             ** the number of characters that would have been generated if out had
121             ** not been NULL.
122             */
123 0           int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
124             int i, j, e, m;
125             unsigned char x;
126             int cnt[256];
127 0 0         if( n<=0 ){
128 0 0         if( out ){
129 0           out[0] = 'x';
130 0           out[1] = 0;
131             }
132 0           return 1;
133             }
134 0           memset(cnt, 0, sizeof(cnt));
135 0 0         for(i=n-1; i>=0; i--){ cnt[in[i]]++; }
136 0           m = n;
137 0 0         for(i=1; i<256; i++){
138             int sum;
139 0 0         if( i=='\'' ) continue;
140 0           sum = cnt[i] + cnt[(i+1)&0xff] + cnt[(i+'\'')&0xff];
141 0 0         if( sum<m ){
142 0           m = sum;
143 0           e = i;
144 0 0         if( m==0 ) break;
145             }
146             }
147 0 0         if( out==0 ){
148 0           return n+m+1;
149             }
150 0           out[0] = e;
151 0           j = 1;
152 0 0         for(i=0; i<n; i++){
153 0           x = in[i] - e;
154 0 0         if( x==0 || x==1 || x=='\''){
    0          
    0          
155 0           out[j++] = 1;
156 0           x++;
157             }
158 0           out[j++] = x;
159             }
160 0           out[j] = 0;
161             assert( j==n+m+1 );
162 0           return j;
163             }
164              
165             /*
166             ** Decode the string "in" into binary data and write it into "out".
167             ** This routine reverses the encoding created by sqlite_encode_binary().
168             ** The output will always be a few bytes less than the input. The number
169             ** of bytes of output is returned. If the input is not a well-formed
170             ** encoding, -1 is returned.
171             **
172             ** The "in" and "out" parameters may point to the same buffer in order
173             ** to decode a string in place.
174             */
175 0           int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
176             int i, e;
177             unsigned char c;
178 0           e = *(in++);
179 0           i = 0;
180 0 0         while( (c = *(in++))!=0 ){
181 0 0         if( c==1 ){
182 0           c = *(in++) - 1;
183             }
184 0           out[i++] = c + e;
185             }
186 0           return i;
187             }
188              
189             #ifdef ENCODER_TEST
190             #include <stdio.h>
191             /*
192             ** The subroutines above are not tested by the usual test suite. To test
193             ** these routines, compile just this one file with a -DENCODER_TEST=1 option
194             ** and run the result.
195             */
196             int main(int argc, char **argv){
197             int i, j, n, m, nOut, nByteIn, nByteOut;
198             unsigned char in[30000];
199             unsigned char out[33000];
200              
201             nByteIn = nByteOut = 0;
202             for(i=0; i<sizeof(in); i++){
203             printf("Test %d: ", i+1);
204             n = rand() % (i+1);
205             if( i%100==0 ){
206             int k;
207             for(j=k=0; j<n; j++){
208             /* if( k==0 || k=='\'' ) k++; */
209             in[j] = k;
210             k = (k+1)&0xff;
211             }
212             }else{
213             for(j=0; j<n; j++) in[j] = rand() & 0xff;
214             }
215             nByteIn += n;
216             nOut = sqlite_encode_binary(in, n, out);
217             nByteOut += nOut;
218             if( nOut!=strlen(out) ){
219             printf(" ERROR return value is %d instead of %d\n", nOut, strlen(out));
220             exit(1);
221             }
222             if( nOut!=sqlite_encode_binary(in, n, 0) ){
223             printf(" ERROR actual output size disagrees with predicted size\n");
224             exit(1);
225             }
226             m = (256*n + 1262)/253;
227             printf("size %d->%d (max %d)", n, strlen(out)+1, m);
228             if( strlen(out)+1>m ){
229             printf(" ERROR output too big\n");
230             exit(1);
231             }
232             for(j=0; out[j]; j++){
233             if( out[j]=='\'' ){
234             printf(" ERROR contains (')\n");
235             exit(1);
236             }
237             }
238             j = sqlite_decode_binary(out, out);
239             if( j!=n ){
240             printf(" ERROR decode size %d\n", j);
241             exit(1);
242             }
243             if( memcmp(in, out, n)!=0 ){
244             printf(" ERROR decode mismatch\n");
245             exit(1);
246             }
247             printf(" OK\n");
248             }
249             fprintf(stderr,"Finished. Total encoding: %d->%d bytes\n",
250             nByteIn, nByteOut);
251             fprintf(stderr,"Avg size increase: %.3f%%\n",
252             (nByteOut-nByteIn)*100.0/(double)nByteIn);
253             }
254             #endif /* ENCODER_TEST */