File Coverage

encode.c

Criterion	Covered	Total	%
statement	0	36	0.0
branch	0	28	0.0
condition			n/a
subroutine			n/a
pod			n/a
total	0	64	0.0

line	stmt	bran	code
1			/*
2			** 2002 April 25
3			**
4			** The author disclaims copyright to this source code. In place of
5			** a legal notice, here is a blessing:
6			**
7			** May you do good and not evil.
8			** May you find forgiveness for yourself and forgive others.
9			** May you share freely, never taking more than you give.
10			**
11			*************************************************************************
12			** This file contains helper routines used to translate binary data into
13			** a null-terminated string (suitable for use in SQLite) and back again.
14			** These are convenience routines for use by people who want to store binary
15			** data in an SQLite database. The code in this file is not used by any other
16			** part of the SQLite library.
17			**
18			** $Id: encode.c,v 1.1.1.1 2004/08/08 15:03:57 matt Exp $
19			*/
#include <assert.h>
#include <string.h>
22
23			/*
24			** How This Encoder Works
25			**
26			** The output is allowed to contain any character except 0x27 (') and
27			** 0x00. This is accomplished by using an escape character to encode
28			** 0x27 and 0x00 as a two-byte sequence. The escape character is always
29			** 0x01. An 0x00 is encoded as the two byte sequence 0x01 0x01. The
30			** 0x27 character is encoded as the two byte sequence 0x01 0x28. Finally,
31			** the escape character itself is encoded as the two-character sequence
32			** 0x01 0x02.
33			**
34			** To summarize, the encoder works by using escape sequences as follows:
35			**
36			** 0x00 -> 0x01 0x01
37			** 0x01 -> 0x01 0x02
38			** 0x27 -> 0x01 0x28
39			**
40			** If that were all the encoder did, it would work, but in certain cases
41			** it could double the size of the encoded string. For example, to
42			** encode a string of 100 0x27 characters would require 100 instances of
43			** the 0x01 0x28 escape sequence resulting in a 200-character output.
44			** We would prefer to keep the size of the encoded string smaller than
45			** this.
46			**
47			** To minimize the encoding size, we first subtract a fixed offset value
48			** from each byte in the sequence. The subtraction is modulo 256. (That is
49			** to say, if the original character value is smaller than the offset, the
50			** result wraps around into the range 0x00..0xff.) The offset is chosen to minimize
51			** the number of characters in the string that need to be escaped. For
52			** example, in the case above where the string was composed of 100 0x27
53			** characters, the offset might be 0x01. Each of the 0x27 characters would
54			** then be converted into an 0x26 character which would not need to be
55			** escaped at all and so the 100 character input string would be converted
56			** into just 100 characters of output. Actually 101 characters of output -
57			** we have to record the offset used as the first byte in the sequence so
58			** that the string can be decoded. Since the offset value is stored as
59			** part of the output string and the output string is not allowed to contain
60			** characters 0x00 or 0x27, the offset cannot be 0x00 or 0x27.
61			**
62			** Here, then, are the encoding steps:
63			**
64			** (1) Choose an offset value and make it the first character of
65			** output.
66			**
67			** (2) Copy each input character into the output buffer, one by
68			** one, subtracting the offset value (modulo 256) as you copy.
69			**
70			** (3) If the value of an input character minus offset is 0x00, replace
71			** that one character by the two-character sequence 0x01 0x01.
72			** If the difference is 0x01, replace it with 0x01 0x02. If the
73			** difference is 0x27, replace it with 0x01 0x28.
74			**
75			** (4) Put a 0x00 terminator at the end of the output.
76			**
77			** Decoding is obvious:
78			**
79			** (5) Copy encoded characters except the first into the decode
80			** buffer. Set the first encoded character aside for use as
81			** the offset in step 7 below.
82			**
83			** (6) Convert each 0x01 0x01 sequence into a single character 0x00.
84			** Convert 0x01 0x02 into 0x01. Convert 0x01 0x28 into 0x27.
85			**
86			** (7) Add the offset value that was the first character of
87			** the encoded buffer to all characters in the output buffer (modulo 256).
88			**
89			** The only tricky part is step (1) - how to compute an offset value to
90			** minimize the size of the output buffer. This is accomplished by testing
91			** all offset values and picking the one that results in the fewest number
92			** of escapes. To do that, we first scan the entire input and count the
93			** number of occurrences of each character value in the input. Suppose
94			** the number of 0x00 characters is N(0), the number of occurrences of 0x01
95			** is N(1), and so forth up to the number of occurrences of 0xff is N(255).
96			** An offset of 0 is not allowed so we don't have to test it. The number
97			** of escapes required for an offset of 1 is N(1)+N(2)+N(40). The number
98			** of escapes required for an offset of 2 is N(2)+N(3)+N(41). And so forth.
99			** In this way we find the offset that gives the minimum number of escapes,
100			** and thus minimizes the length of the output string.
101			*/
102
/*
** Encode a binary buffer "in" of size n bytes so that it contains
** no instances of characters '\'' or '\000'.  The output is
** null-terminated and can be used as a string value in an INSERT
** or UPDATE statement.  Use sqlite_decode_binary() to convert the
** string back into its original binary.
**
** The first output byte is the offset "e".  Every input byte then has
** e subtracted from it (modulo 256); a result of 0x00, 0x01 or 0x27 is
** written as the escape byte 0x01 followed by the result plus one.
**
** The result is written into a preallocated output buffer "out".
** "out" must be able to hold at least 2 + (257*n)/254 bytes.
** In other words, the output will be expanded by as much as 3
** bytes for every 254 bytes of input plus 2 bytes of fixed overhead.
** (This is approximately 2 + 1.0118*n or about a 1.2% size increase.)
**
** The return value is the number of characters in the encoded
** string, excluding the "\000" terminator.
**
** If n<=0 the encoding is the one-character string "x" and the
** return value is 1.
**
** If out==NULL then no output is generated but the routine still returns
** the number of characters that would have been generated if out had
** not been NULL.
*/
int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
  int i, j, m;
  int e = 0;            /* Chosen offset; assigned by the search loop below */
  unsigned char x;
  int cnt[256];         /* cnt[v] = number of input bytes with value v */

  if( n<=0 ){
    if( out ){
      out[0] = 'x';
      out[1] = 0;
    }
    return 1;
  }

  /* Histogram of the input bytes. */
  memset(cnt, 0, sizeof(cnt));
  for(i=n-1; i>=0; i--){ cnt[in[i]]++; }

  /* Try every legal offset (not 0x00, not 0x27) and keep the first one
  ** that needs the fewest escapes.  With offset i, the input values that
  ** map onto 0x00, 0x01 and 0x27 are i, i+1 and i+0x27 (modulo 256).
  */
  m = n;
  for(i=1; i<256; i++){
    int sum;
    if( i=='\'' ) continue;
    sum = cnt[i] + cnt[(i+1)&0xff] + cnt[(i+'\'')&0xff];
    if( sum<m ){
      m = sum;
      e = i;
      if( m==0 ) break;   /* Cannot do better than zero escapes */
    }
  }

  /* Size-only query: offset byte + n data bytes + m escape bytes. */
  if( out==NULL ){
    return n+m+1;
  }

  out[0] = (unsigned char)e;
  j = 1;
  for(i=0; i<n; i++){
    x = (unsigned char)(in[i] - e);
    if( x==0 || x==1 || x=='\'' ){
      out[j++] = 1;       /* Escape byte */
      x++;                /* 0x00->0x01, 0x01->0x02, 0x27->0x28 */
    }
    out[j++] = x;
  }
  out[j] = 0;
  assert( j==n+m+1 );
  return j;
}
164
/*
** Decode the string "in" into binary data and write it into "out".
** This routine reverses the encoding created by sqlite_encode_binary().
** The output is always at least one byte shorter than the input string,
** because the leading offset byte is not copied.  The number of bytes of
** output is returned.
**
** -1 is returned if the input is an empty string (there is no offset
** byte) or if it ends immediately after the 0x01 escape byte.  In the
** second case, bytes decoded before the error have already been written
** to "out".  Any other byte following an escape is decoded as that byte
** minus one; it is not validated.
**
** The "in" and "out" parameters may point to the same buffer in order
** to decode a string in place.
*/
int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
  int i;
  unsigned char c;
  unsigned char e;      /* Offset stored in the first encoded byte */

  e = *(in++);
  if( e==0 ){
    return -1;          /* Empty string: do not read past the terminator */
  }
  i = 0;
  while( (c = *(in++))!=0 ){
    if( c==1 ){
      c = *(in++);
      if( c==0 ){
        return -1;      /* Terminator in the middle of an escape sequence */
      }
      c--;              /* 0x01->0x00, 0x02->0x01, 0x28->0x27 */
    }
    out[i++] = (unsigned char)(c + e);
  }
  return i;
}
188
#ifdef ENCODER_TEST
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
** The subroutines above are not tested by the usual test suite.  To test
** these routines, compile just this one file with a -DENCODER_TEST=1 option
** and run the result.
**
** Each iteration builds an input of random length, encodes it, checks the
** reported and predicted sizes, checks that no quote character appears in
** the output, decodes in place and compares against the original input.
** The program exits with status 1 on the first failure.
*/
int main(int argc, char **argv){
  int i, j, n, m, nOut, nLen, nByteIn, nByteOut;
  unsigned char in[30000];
  unsigned char out[33000];

  (void)argc;
  (void)argv;
  nByteIn = nByteOut = 0;
  /* NOTE(review): the loop bound was lost in extraction.  It is rebuilt as
  ** sizeof(in) because n is drawn from 0..i and must fit in "in" - confirm
  ** against the original source.
  */
  for(i=0; i<(int)sizeof(in); i++){
    printf("Test %d: ", i+1);
    n = rand() % (i+1);
    if( i%100==0 ){
      /* Every 100th test uses a deterministic ramp 0,1,2,...,255,0,... */
      int k;
      for(j=k=0; j<n; j++){
        in[j] = (unsigned char)k;
        k = (k+1)&0xff;
      }
    }else{
      /* NOTE(review): the loop body was lost in extraction; a random fill
      ** is assumed here.
      */
      for(j=0; j<n; j++) in[j] = (unsigned char)(rand() & 0xff);
    }
    nByteIn += n;
    nOut = sqlite_encode_binary(in, n, out);
    nByteOut += nOut;
    /* strlen() returns size_t; convert once so the %d formats are valid. */
    nLen = (int)strlen((const char*)out);
    if( nOut!=nLen ){
      printf(" ERROR return value is %d instead of %d\n", nOut, nLen);
      exit(1);
    }
    if( nOut!=sqlite_encode_binary(in, n, 0) ){
      printf(" ERROR actual output size disagrees with predicted size\n");
      exit(1);
    }
    m = (256*n + 1262)/253;
    printf("size %d->%d (max %d)", n, nLen+1, m);
    if( nLen+1>m ){
      printf(" ERROR output too big\n");
      exit(1);
    }
    for(j=0; out[j]; j++){
      if( out[j]=='\'' ){
        printf(" ERROR contains (')\n");
        exit(1);
      }
    }
    j = sqlite_decode_binary(out, out);
    if( j!=n ){
      printf(" ERROR decode size %d\n", j);
      exit(1);
    }
    if( memcmp(in, out, n)!=0 ){
      printf(" ERROR decode mismatch\n");
      exit(1);
    }
    printf(" OK\n");
  }
  fprintf(stderr,"Finished. Total encoding: %d->%d bytes\n",
          nByteIn, nByteOut);
  fprintf(stderr,"Avg size increase: %.3f%%\n",
          (nByteOut-nByteIn)*100.0/(double)nByteIn);
  return 0;
}
#endif /* ENCODER_TEST */