-
Notifications
You must be signed in to change notification settings - Fork 0
/
bocu1.h
234 lines (198 loc) · 7.09 KB
/
bocu1.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/*
******************************************************************************
*
* Copyright (C) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
* For licensing terms see the ICU X License:
* http://oss.software.ibm.com/cvs/icu/~checkout~/icu/license.html
*
******************************************************************************
* file name: bocu1.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan24
* created by: Markus W. Scherer
*
* This is the definition file for the sample implementation of BOCU-1,
* a MIME-compatible Binary Ordered Compression for Unicode.
*/
#ifndef __BOCU1_H__
#define __BOCU1_H__
/*
* Standard ICU header.
* - Includes inttypes.h or defines its types.
* - Defines UChar for UTF-16 as an unsigned 16-bit type (wchar_t or uint16_t).
* - Defines UTF* macros to handle reading and writing
* of in-process UTF-8/16 strings.
*/
#include "unicode/utypes.h"
/* BOCU-1 constants and macros ---------------------------------------------- */
/*
* BOCU-1 encodes the code points of a Unicode string as
* a sequence of byte-encoded differences (slope detection),
* preserving lexical order.
*
* Optimize the difference-taking for runs of Unicode text within
* small scripts:
*
* Most small scripts are allocated within aligned 128-blocks of Unicode
* code points. Lexical order is preserved if the "previous code point" state
* is always moved into the middle of such a block.
*
* Additionally, "prev" is moved from anywhere in the Unihan and Hangul
* areas into the middle of those areas.
*
* C0 control codes and space are encoded with their US-ASCII bytes.
* "prev" is reset for C0 controls but not for space.
*/
/* Start value of the "prev" state: the middle of the ASCII range. */
#define BOCU1_ASCII_PREV            0x40

/* Byte values that bound the encoding of differences. */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe
#define BOCU1_MAX_TRAIL             0xff
/* Same byte value as BOCU1_MAX_TRAIL. */
#define BOCU1_RESET                 0xff

/* How many lead byte values there are (222). */
#define BOCU1_COUNT                 (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * Some C0 control byte values double as trail bytes;
 * the trail byte numbering is shifted to make room for them.
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many trail byte values there are (243). */
#define BOCU1_TRAIL_COUNT \
    ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * How many single-byte codes exist on the positive side and on the
 * negative side; the difference 0 (byte BOCU1_MIDDLE) is counted as positive.
 */
#define BOCU1_SINGLE                64

/* Lead byte counts, per sign, for sequences of 2, 3 and 4 bytes. */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* Largest and smallest difference that fits into one byte: 63 / -64. */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* Largest and smallest difference that fits into two bytes: 10512 / -10513. */
#define BOCU1_REACH_POS_2           (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2           (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest and smallest difference that fits into three bytes: 187659 / -187660. */
#define BOCU1_REACH_POS_3 \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3 \
    (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/*
 * First lead byte of each positive multi-byte range (0xd0, 0xfb, 0xfe).
 * BOCU1_START_POS_4 comes out equal to BOCU1_MAX_LEAD.
 */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3+BOCU1_LEAD_3)

/*
 * Lower lead byte boundary of each negative range (0x50, 0x25, 0x22).
 * BOCU1_START_NEG_4 comes out equal to BOCU1_MIN+1.
 */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3-BOCU1_LEAD_3)
/*
 * Number of bytes in the sequence that begins with the given lead byte.
 * Must not be used with lead==BOCU1_RESET.
 *
 * The lead byte ranges nest: the innermost range holds the single-byte codes,
 * each wider range adds one byte, and whatever lies outside all three
 * ranges starts a 4-byte sequence.
 *
 * "lead" is evaluated several times; pass an expression without side effects.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    (((lead)>=BOCU1_START_NEG_2 && (lead)<BOCU1_START_POS_2) ? 1 : \
     (((lead)>=BOCU1_START_NEG_3 && (lead)<BOCU1_START_POS_3) ? 2 : \
      (((lead)>=BOCU1_START_NEG_4 && (lead)<BOCU1_START_POS_4) ? 3 : 4)))
/*
 * Number of bytes in a sequence, read from its packed form.
 *
 * When the top byte of "packed" is 0..3 it is returned as the length;
 * any larger top byte means a 4-byte sequence.
 * The comparison is done on the unsigned value, so a packed value with
 * the sign bit set also yields 4.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)>=0x04000000 ? 4 : (packed)>>24)
/*
* 12 commonly used C0 control codes (and space) are only used to encode
* themselves directly,
* which makes BOCU-1 MIME-usable and reasonably safe for
* ASCII-oriented software.
*
* These controls are
* 0 NUL
*
* 7 BEL
* 8 BS
*
* 9 TAB
* a LF
* b VT
* c FF
* d CR
*
* e SO
* f SI
*
* 1a SUB
* 1b ESC
*
* The other 20 C0 controls are also encoded directly (to preserve order)
* but are also used as trail bytes in difference encoding
* (for better compression).
*/
/*
 * Map a trail byte value t, as used in the difference calculation,
 * to its external byte value:
 * values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in bocu1TrailToByte[],
 * all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 * "t" is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? \
        bocu1TrailToByte[t] : \
        (t)+BOCU1_TRAIL_BYTE_OFFSET)
/*
 * Byte value map for control codes,
 * from external byte values 0x00..0x20
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 * External byte values that are illegal as trail bytes are mapped to -1.
 *
 * The table has BOCU1_MIN entries and is indexed by the external byte value.
 * It is read-only lookup data and therefore const: this definition lives in
 * a header, so every including file gets its own copy, and none of them
 * should be able to modify it.
 */
static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
/* 0 1 2 3 4 5 6 7 */
-1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
/* 8 9 a b c d e f */
-1, -1, -1, -1, -1, -1, -1, -1,
/* 10 11 12 13 14 15 16 17 */
0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
/* 18 19 1a 1b 1c 1d 1e 1f */
0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
/* 20 */
-1
};
/*
 * Byte value map for control codes,
 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 * to external byte values 0x00..0x20.
 *
 * The table has BOCU1_TRAIL_CONTROLS_COUNT entries and is indexed by the
 * trail byte value; BOCU1_TRAIL_TO_BYTE() reads it for values below that count.
 * It is read-only lookup data and therefore const: this definition lives in
 * a header, so every including file gets its own copy, and none of them
 * should be able to modify it.
 */
static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
/* 0 1 2 3 4 5 6 7 */
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
/* 8 9 a b c d e f */
0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
/* 10 11 12 13 */
0x1c, 0x1d, 0x1e, 0x1f
};
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0
 * (for a positive divisor d).
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that "NEGDIVMOD(n, d, m);"
 * is exactly one statement and can be used as the unbraced body of
 * an if/else. Always write the trailing semicolon.
 *
 * The arguments are evaluated more than once; pass plain variables
 * or constants, not expressions with side effects.
 *
 * @param n Number to be split into quotient and rest.
 * Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        /* truncated toward zero: step down to the floored quotient */ \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
/*
 * State for the BOCU-1 decoder function; decodeBocu1() takes a pointer to it.
 */
typedef struct Bocu1Rx {
    /* the "prev" (previous code point) state of the difference encoding */
    int32_t prev;
    /*
     * NOTE(review): count and diff are not described in this header;
     * presumably the number of bytes still expected in the current sequence
     * and the difference accumulated so far -- confirm against bocu1.c.
     */
    int32_t count;
    int32_t diff;
} Bocu1Rx;
/* Function prototypes ------------------------------------------------------ */
/* see bocu1.c */
/*
 * NOTE(review): the implementations are in bocu1.c and are not visible from
 * this header; the descriptions below are inferred from the declarations
 * and must be confirmed against bocu1.c.
 */
/*
 * packDiff: takes a difference value.
 * Presumably returns its byte sequence in the packed form that
 * BOCU1_LENGTH_FROM_PACKED() reads -- TODO confirm.
 */
U_CFUNC int32_t
packDiff(int32_t diff);
/*
 * encodeBocu1: takes a pointer to a "prev" state value and a value c,
 * presumably the code point to encode; *pPrev is presumably updated
 * by the call -- TODO confirm.
 * The meaning of the return value is not visible here; see bocu1.c.
 */
U_CFUNC int32_t
encodeBocu1(int32_t *pPrev, int32_t c);
/*
 * decodeBocu1: takes a pointer to the decoder state and one byte b,
 * presumably the next input byte of a BOCU-1 byte stream -- TODO confirm.
 * The meaning of the return value is not visible here; see bocu1.c.
 */
U_CFUNC int32_t
decodeBocu1(Bocu1Rx *pRx, uint8_t b);
#endif