utf8.js
4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
// encodings/impl/utf8
/**
 * Converts a range of a JavaScript string into UTF8 and reports how many bytes that took.
 *
 * The `//? SET(...)` lines are build-time template directives; per the note inside the
 * function they stand for the byte stores into `dst` at `dstOffset`. They refer to the
 * locals `cc`, `dstOffset` and `dst` by name, so those names must not change.
 *
 * Any char code in the range 0xD800..0xDFFF is treated as the first half of a surrogate
 * pair and is combined with the char code that follows it into one 4 byte sequence.
 * @param {string} src Source string
 * @param {number} srcOffset Index of the first char code to encode
 * @param {!ByteBuffer} dst Destination ByteBuffer
 * @param {number} dstOffset Offset in `dst` to start writing at
 * @param {number} count Number of char codes to encode
 * @returns {number} Number of bytes encoded
 * @throws {Error} If the range ends right after a surrogate char code
 * @inner
 */
function utf8_encode(src, srcOffset, dst, dstOffset, count) {
    var written = 0,       // bytes produced so far
        remaining = count, // char codes still to consume
        cc,                // current char code / code point (name used by the directives)
        low;               // second char code of a surrogate pair
    if (remaining === 0)
        return written;
    //? // SET(varValue, varOffset, varTarget) with varTarget referencing a ByteBuffer
    do {
        cc = src.charCodeAt(srcOffset++);
        remaining -= 1;
        if (cc < 0x80) {
            // One byte: plain ASCII
            written += 1;
            //? SET('cc', 'dstOffset++', 'dst');
        } else if (cc < 0x800) {
            // Two bytes
            written += 2;
            //? SET('0xC0 | (cc >> 6)', 'dstOffset++', 'dst');
            //? SET('0x80 | (cc & 0x3F)', 'dstOffset++', 'dst');
        } else if (cc < 0xD800 || cc >= 0xE000) {
            // Three bytes: rest of the BMP outside the surrogate range
            written += 3;
            //? SET('0xE0 | (cc >> 12)', 'dstOffset++', 'dst');
            //? SET('0x80 | ((cc >> 6) & 0x3F)', 'dstOffset++', 'dst');
            //? SET('0x80 | (cc & 0x3F)', 'dstOffset++', 'dst');
        } else {
            // Four bytes: surrogate, needs the following char code to form a code point
            if (remaining === 0)
                throw new Error("truncated utf8 surrogate");
            low = src.charCodeAt(srcOffset++);
            remaining -= 1;
            cc = 0x10000 + (((cc & 0x3FF) << 10) | (low & 0x3FF));
            written += 4;
            //? SET('0xF0 | (cc >> 18)', 'dstOffset++', 'dst');
            //? SET('0x80 | ((cc >> 12) & 0x3F)', 'dstOffset++', 'dst');
            //? SET('0x80 | ((cc >> 6) & 0x3F)', 'dstOffset++', 'dst');
            //? SET('0x80 | (cc & 0x3F)', 'dstOffset++', 'dst');
        }
    } while (remaining > 0);
    return written;
}
/**
* Decodes UTF8 bytes to a standard JavaScript string.
* @param {!ByteBuffer} src Source ByteBuffer
* @param {number} srcOffset Source offset
* @param {number} count Number of bytes to decode
* @returns {string} Decoded string
* @inner
*/
function utf8_decode(src, srcOffset, count) {
if (count === 0)
return "";
var parts = [], // readily assembled parts
batch = []; // char codes for batch processing
//? // GET(varOffset, varTarget) with varTarget referencing a ByteBuffer
while (count--) {
var c = /*? GET('srcOffset++', 'src') */,
c2, c3;
switch (c >> 4) {
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
batch.push(c);
break;
case 12: case 13:
if (count < 1)
throw Error("truncated utf8 sequence");
c2 = /*? GET('srcOffset++', 'src') */;
--count;
batch.push(((c & 0x1F) << 6) | (c2 & 0x3F));
break;
case 14:
if (count < 2)
throw Error("truncated utf8 sequence");
c2 = /*? GET('srcOffset++', 'src') */;
c3 = /*? GET('srcOffset++', 'src') */;
count -= 2;
batch.push(((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | ((c3 & 0x3F) << 0));
break;
}
if (batch.length > 1023) {
parts.push(String.fromCharCode.apply(String, batch));
batch.length = 0;
}
}
if (batch.length > 0) {
if (parts.length === 0)
return String.fromCharCode.apply(String, batch);
parts.push(String.fromCharCode.apply(String, batch));
}
return parts.join('');
}
/**
 * Calculates the number of UTF8 bytes required to store a standard JavaScript string.
 *
 * The result matches what utf8_encode produces for the same range: a char code in the
 * surrogate range 0xD800..0xDFFF and the char code following it are counted together as
 * one 4 byte sequence. A surrogate that is the last char code of the range is still
 * counted as 4 bytes; unlike utf8_encode this function does not throw in that case.
 * @param {string} src Source string
 * @param {number} srcOffset Source offset
 * @param {number} count Number of char codes to calculate
 * @returns {number} Number of bytes required
 * @inner
 */
function utf8_calculate(src, srcOffset, count) {
    if (count === 0)
        return 0;
    var n = 0;
    do {
        var cc = src.charCodeAt(srcOffset++);
        --count;
        if (cc < 0x80) {
            n += 1;
        } else if (cc < 0x800) {
            n += 2;
        } else if (cc < 0xD800 || cc >= 0xE000) {
            n += 3;
        } else { // surrogate
            // The char code after a surrogate belongs to the same 4 byte sequence,
            // so consume it here instead of counting it a second time.
            if (count > 0) {
                ++srcOffset;
                --count;
            }
            n += 4;
        }
    } while (count > 0);
    return n;
}
// Register the encode, decode and calculate functions above with ByteBuffer under the name "utf8".
ByteBuffer.registerEncoding("utf8", utf8_encode, utf8_decode, utf8_calculate);