generated from okikio/transferables
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmod.ts
227 lines (206 loc) · 9.06 KB
/
mod.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/**
* @module
* This module provides utilities for processing UTF-8 encoded text. It includes functions to work with iterables or async iterables of `Uint8Array` chunks, converting them into Unicode code points through different methods:
*
* - `asCodePointsIterator` for yielding code points asynchronously,
* - `asCodePointsArray` for collecting code points into an array,
* - `asCodePointsCallback` for invoking a callback function with each code point.
*
* These utilities are designed to handle text data efficiently, especially useful for streaming or batch processing scenarios.
*
* @example
* Using `asCodePointsIterator` to asynchronously iterate over code points:
* ```ts
* // Example iterable of UTF-8 encoded Uint8Array chunks (representing 'hello')
* const chunks = [new Uint8Array([104]), new Uint8Array([101, 108, 108, 111])];
* (async () => {
* for await (const codePoint of asCodePointsIterator(chunks)) {
* console.log(String.fromCodePoint(codePoint)); // Logs each character: 'h', 'e', 'l', 'l', 'o'
* }
* })();
* ```
*
* Using `asCodePointsArray` to get an array of code points:
* ```ts
* (async () => {
* const codePoints = await asCodePointsArray(chunks); // Assuming 'chunks' from the previous example
* console.log(codePoints.map(cp => String.fromCodePoint(cp)).join('')); // Logs: 'hello'
* })();
* ```
*
* Using `asCodePointsCallback` to process code points with a callback:
* ```ts
* (async () => {
* await asCodePointsCallback(chunks, codePoint => {
* console.log(String.fromCodePoint(codePoint)); // Logs each character: 'h', 'e', 'l', 'l', 'o'
* });
* })();
* ```
* Together, these functions handle streamed or batched UTF-8 data, making it easier to work with text in modern JavaScript environments.
*/
/**
 * Converts an iterable of UTF-8 encoded `Uint8Array` chunks into an async generator
 * of Unicode code points.
 *
 * The function performs the following steps:
 * 1. Iterate through the input, which yields chunks of bytes (`Uint8Array`).
 * 2. Decode each chunk with a streaming `TextDecoder`, so a multi-byte sequence that
 *    is split across two chunks is still decoded as a single character.
 * 3. Walk the decoded text one code point at a time and yield each code point.
 * 4. Once the input is exhausted, flush the decoder and yield whatever it emits.
 *    If the input ended in the middle of a multi-byte sequence this is U+FFFD
 *    (the replacement character), so truncated input is never dropped silently.
 *
 * If the consumer stops iterating early, the underlying source iterator is closed
 * (its `return()` method is called when it has one).
 *
 * @param iterable An iterable or async iterable of `Uint8Array` chunks filled with UTF-8 encoded text.
 * @returns An async generator yielding Unicode code points from the given iterable.
 *
 * @example
 * Convert an iterable of `Uint8Array` chunks into an async iterable of Unicode code points.
 * ```ts
 * async function exampleIteratorUsage() {
 *   const utf8Chunks = [new Uint8Array([0xF0, 0x9F, 0x92, 0x96])]; // Represents the 💖 emoji
 *   for await (const codePoint of asCodePointsIterator(utf8Chunks)) {
 *     console.log(String.fromCodePoint(codePoint)); // Output: 💖
 *   }
 * }
 * exampleIteratorUsage();
 * ```
 */
export async function* asCodePointsIterator<T extends Uint8Array>(
  iterable: AsyncIterable<T> | Iterable<T>
): AsyncIterable<number> {
  const utf8Decoder = new TextDecoder("utf-8");

  // Walks a decoded string by code point rather than by UTF-16 code unit.
  function* codePointsOf(str: string): Generator<number> {
    const len = str.length;
    let i = 0;
    while (i < len) {
      const codePoint = str.codePointAt(i);
      if (codePoint === undefined) break;
      yield codePoint;
      // Code points above the BMP occupy two UTF-16 code units (a surrogate pair).
      i += codePoint > 0xFFFF ? 2 : 1;
    }
  }

  // Objects exposing only `next()` (bare iterators) were accepted at runtime by the
  // manual iteration this replaces, so wrap them to keep that working with `for await`.
  const source: AsyncIterable<T> | Iterable<T> =
    Symbol.asyncIterator in iterable || Symbol.iterator in iterable
      ? iterable
      : { [Symbol.asyncIterator]: () => iterable };

  // `for await` accepts both sync and async iterables and closes the source iterator
  // when this generator is abandoned early or an error is thrown.
  for await (const chunk of source) {
    yield* codePointsOf(utf8Decoder.decode(chunk, { stream: true }));
  }

  // Flush the decoder: bytes of an incomplete trailing sequence surface here as U+FFFD.
  yield* codePointsOf(utf8Decoder.decode());
}
/**
 * Converts an iterable of UTF-8 encoded `Uint8Array` chunks into an array of Unicode code points.
 * This is particularly useful for processing streams of text data, where each chunk
 * is represented as a `Uint8Array`, and you want to work with the text's Unicode code points.
 *
 * Unlike `asCodePointsIterator`, which yields code points one by one, this function
 * stores them in an array and returns the array once the whole input has been processed.
 *
 * Chunks are decoded with a streaming `TextDecoder`, so a multi-byte sequence split
 * across two chunks is decoded as a single character. After the last chunk the decoder
 * is flushed and its output is appended as well: if the input ended in the middle of a
 * multi-byte sequence, the array ends with U+FFFD (the replacement character) rather
 * than silently omitting the truncated bytes.
 *
 * @param iterable The source iterable, either synchronous or asynchronous, containing `Uint8Array` chunks.
 * @returns A promise that resolves to an array of Unicode code points.
 *
 * @example
 * Convert an iterable of `Uint8Array` chunks into an array of Unicode code points.
 * ```ts
 * async function exampleArrayUsage() {
 *   const utf8Chunks = [new Uint8Array([0x68, 0x65, 0x6C, 0x6C, 0x6F])]; // Represents the string 'hello'
 *   const codePoints = await asCodePointsArray(utf8Chunks);
 *   console.log(codePoints.map(cp => String.fromCodePoint(cp)).join('')); // Output: 'hello'
 * }
 * exampleArrayUsage();
 * ```
 */
export async function asCodePointsArray<T extends Uint8Array>(
  iterable: AsyncIterable<T> | Iterable<T>
): Promise<number[]> {
  const arr: number[] = [];
  const utf8Decoder = new TextDecoder("utf-8");

  // Appends every code point of a decoded string to `arr`.
  const collect = (str: string): void => {
    const len = str.length;
    let i = 0;
    while (i < len) {
      const codePoint = str.codePointAt(i);
      if (codePoint === undefined) break;
      arr.push(codePoint);
      // Code points above the BMP occupy two UTF-16 code units (a surrogate pair).
      i += codePoint > 0xFFFF ? 2 : 1;
    }
  };

  // `for await` handles both sync and async iterables.
  for await (const chunk of iterable) {
    // Streaming mode keeps an incomplete trailing sequence buffered for the next chunk.
    collect(utf8Decoder.decode(chunk, { stream: true }));
  }

  // Flush the decoder: bytes of an incomplete trailing sequence surface here as U+FFFD.
  collect(utf8Decoder.decode());
  return arr;
}
/**
 * Processes an iterable or async iterable of UTF-8 encoded `Uint8Array` chunks and
 * invokes a callback for each Unicode code point.
 *
 * The function performs the following steps:
 * - Iterate through the input, which yields chunks of bytes (`Uint8Array`).
 * - Decode each chunk with a streaming `TextDecoder`, so a multi-byte sequence that
 *   is split across two chunks is still decoded as a single character.
 * - Walk the decoded text one code point at a time and invoke `cb` with each code point.
 * - Once the input is exhausted, flush the decoder and report whatever it emits.
 *   If the input ended in the middle of a multi-byte sequence this is U+FFFD
 *   (the replacement character), so truncated input is never dropped silently.
 *
 * The callback is invoked synchronously and its return value is ignored. If it throws,
 * the underlying source iterator is closed (its `return()` method is called when it has
 * one) and the returned promise rejects with that error.
 *
 * @template T The chunk type yielded by the iterable; must extend `Uint8Array`.
 * @param iterable An iterable or async iterable to process.
 * @param cb A callback function to invoke for each code point.
 * @returns A promise that resolves when all code points have been processed.
 *
 * @example
 * Process each Unicode code point from an iterable of `Uint8Array` chunks using a callback function.
 * ```ts
 * async function exampleCallbackUsage() {
 *   const utf8Chunks = [new Uint8Array([0x77, 0x6F, 0x72, 0x6C, 0x64])]; // Represents the string 'world'
 *   await asCodePointsCallback(utf8Chunks, codePoint => {
 *     console.log(String.fromCodePoint(codePoint)); // Output: 'w', 'o', 'r', 'l', 'd'
 *   });
 * }
 * exampleCallbackUsage();
 * ```
 */
export async function asCodePointsCallback<T extends Uint8Array>(
  iterable: AsyncIterable<T> | Iterable<T>,
  cb: (codePoint: number) => void
): Promise<void> {
  const utf8Decoder = new TextDecoder("utf-8");

  // Reports every code point of a decoded string to the callback.
  const emit = (str: string): void => {
    const len = str.length;
    let i = 0;
    while (i < len) {
      const codePoint = str.codePointAt(i);
      if (codePoint === undefined) break;
      cb(codePoint);
      // Code points above the BMP occupy two UTF-16 code units (a surrogate pair).
      i += codePoint > 0xFFFF ? 2 : 1;
    }
  };

  // Objects exposing only `next()` (bare iterators) were accepted at runtime by the
  // manual iteration this replaces, so wrap them to keep that working with `for await`.
  const source: AsyncIterable<T> | Iterable<T> =
    Symbol.asyncIterator in iterable || Symbol.iterator in iterable
      ? iterable
      : { [Symbol.asyncIterator]: () => iterable };

  // `for await` accepts both sync and async iterables and closes the source iterator
  // if the callback throws part-way through.
  for await (const chunk of source) {
    // Streaming mode keeps an incomplete trailing sequence buffered for the next chunk.
    emit(utf8Decoder.decode(chunk, { stream: true }));
  }

  // Flush the decoder: bytes of an incomplete trailing sequence surface here as U+FFFD.
  emit(utf8Decoder.decode());
}
// Default export: the async-generator function `asCodePointsIterator` defined in this file.
export default asCodePointsIterator;
// Re-export every named export of the sibling modules alongside the functions defined here.
export * from "./iterable.ts";
export * from "./byte_methods.ts";
export * from "./constants.ts";