Skip to content

Commit a9d483b

Browse files
authored
Merge pull request #91 from f14XuanLv/fix/utf8-multibyte-truncation-at-buffer-boundary2
fix: handle UTF-8 multibyte sequences truncated at buffer boundary
2 parents 272db64 + 4832a7c commit a9d483b

File tree

9 files changed

+91
-9
lines changed

9 files changed

+91
-9
lines changed

src/index.ts

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ const openAsync = promisify(fs.open);
66
const closeAsync = promisify(fs.close);
77

88
const MAX_BYTES: number = 512;
9+
const UTF8_BOUNDARY_RESERVE: number = 3;
910

1011
// A very basic non-exception raising reader. Read bytes and
1112
// at the end use hasError() to check whether this worked.
@@ -120,12 +121,12 @@ export async function isBinaryFile(file: string | Buffer, size?: number): Promis
120121

121122
const fileDescriptor = await openAsync(file, 'r');
122123

123-
const allocBuffer = Buffer.alloc(MAX_BYTES);
124+
const allocBuffer = Buffer.alloc(MAX_BYTES + UTF8_BOUNDARY_RESERVE);
124125

125126
// Read the file with no encoding for raw buffer access.
126127
// NB: something is severely wrong with promisify, had to construct my own Promise
127128
return new Promise((fulfill, reject) => {
128-
fs.read(fileDescriptor, allocBuffer, 0, MAX_BYTES, 0, (err, bytesRead, _) => {
129+
fs.read(fileDescriptor, allocBuffer, 0, MAX_BYTES + UTF8_BOUNDARY_RESERVE, 0, (err, bytesRead, _) => {
129130
closeAsync(fileDescriptor);
130131
if (err) {
131132
reject(err);
@@ -154,9 +155,9 @@ export function isBinaryFileSync(file: string | Buffer, size?: number): boolean
154155

155156
const fileDescriptor = fs.openSync(file, 'r');
156157

157-
const allocBuffer = Buffer.alloc(MAX_BYTES);
158+
const allocBuffer = Buffer.alloc(MAX_BYTES + UTF8_BOUNDARY_RESERVE);
158159

159-
const bytesRead = fs.readSync(fileDescriptor, allocBuffer, 0, MAX_BYTES, 0);
160+
const bytesRead = fs.readSync(fileDescriptor, allocBuffer, 0, MAX_BYTES + UTF8_BOUNDARY_RESERVE, 0);
160161
fs.closeSync(fileDescriptor);
161162

162163
return isBinaryCheck(allocBuffer, bytesRead);
@@ -175,7 +176,8 @@ function isBinaryCheck(fileBuffer: Buffer, bytesRead: number): boolean {
175176
}
176177

177178
let suspiciousBytes = 0;
178-
const totalBytes = Math.min(bytesRead, MAX_BYTES);
179+
const totalBytes = Math.min(bytesRead, MAX_BYTES + UTF8_BOUNDARY_RESERVE);
180+
const scanBytes = Math.min(totalBytes, MAX_BYTES);
179181

180182
// UTF-8 BOM
181183
if (bytesRead >= 3 && fileBuffer[0] === 0xef && fileBuffer[1] === 0xbb && fileBuffer[2] === 0xbf) {
@@ -230,7 +232,7 @@ function isBinaryCheck(fileBuffer: Buffer, bytesRead: number): boolean {
230232
return false;
231233
}
232234

233-
for (let i = 0; i < totalBytes; i++) {
235+
for (let i = 0; i < scanBytes; i++) {
234236
if (fileBuffer[i] === 0) {
235237
// NULL byte--it's binary!
236238
return true;
@@ -264,17 +266,17 @@ function isBinaryCheck(fileBuffer: Buffer, bytesRead: number): boolean {
264266

265267
suspiciousBytes++;
266268
// Read at least 32 bytes before making a decision
267-
if (i >= 32 && (suspiciousBytes * 100) / totalBytes > 10) {
269+
if (i >= 32 && (suspiciousBytes * 100) / (scanBytes) > 10) {
268270
return true;
269271
}
270272
}
271273
}
272274

273-
if ((suspiciousBytes * 100) / totalBytes > 10) {
275+
if ((suspiciousBytes * 100) / (scanBytes) > 10) {
274276
return true;
275277
}
276278

277-
if (suspiciousBytes > 1 && isBinaryProto(fileBuffer, totalBytes)) {
279+
if (suspiciousBytes > 1 && isBinaryProto(fileBuffer, scanBytes)) {
278280
return true;
279281
}
280282

test/fixtures/508A-4byte.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
😀

test/fixtures/509A-3byte.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA测

test/fixtures/509A-4byte.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
😀

test/fixtures/510A-2byte.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
ã

test/fixtures/510A-3byte.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
测

test/fixtures/510A-4byte.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA😀

test/fixtures/utf8-boundary-truncation_case.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""
2+
测试脚本 - DDD增强网络推理
3+
只保存预测结果,不计算指
4+
5+
作者: Dxxx Dexxx
6+
日期: 2025
7+
"""
8+
9+
import os
10+
import sys
11+
import argparse
12+
import torch
13+
import cv2
14+
import numpy as np
15+
from tqdm import tqdm
16+
from pathlib import Path
17+
18+
# 添加上级目录到路径
19+
20+
from data import DDDEnhancerDataset
21+
22+
23+
def function():
24+
"""
25+
保存预测结果
26+
27+
Args:
28+
pred: 预测结果张量 [1, H, W] 或 [H, W],值在[0, 1]
29+
save_path: 保存路径
30+
original_size: 原始图像尺寸 (height, width),如果提供则调整到此尺寸
31+
"""
32+
pass

test/index.test.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,3 +316,45 @@ it("should return false on a UTF-8 file with emoji", () => {
316316
const result = isBinaryFileSync(file);
317317
expect(result).toBe(false);
318318
});
319+
320+
it("should return false on UTF-8 file with 4-byte sequence truncated at byte 508", () => {
321+
const file = path.join(FIXTURE_PATH, "508A-4byte.txt");
322+
const result = isBinaryFileSync(file);
323+
expect(result).toBe(false);
324+
});
325+
326+
it("should return false on UTF-8 file with 3-byte sequence truncated at byte 509", () => {
327+
const file = path.join(FIXTURE_PATH, "509A-3byte.txt");
328+
const result = isBinaryFileSync(file);
329+
expect(result).toBe(false);
330+
});
331+
332+
it("should return false on UTF-8 file with 4-byte sequence truncated at byte 509", () => {
333+
const file = path.join(FIXTURE_PATH, "509A-4byte.txt");
334+
const result = isBinaryFileSync(file);
335+
expect(result).toBe(false);
336+
});
337+
338+
it("should return false on UTF-8 file with 2-byte sequence truncated at byte 510", () => {
339+
const file = path.join(FIXTURE_PATH, "510A-2byte.txt");
340+
const result = isBinaryFileSync(file);
341+
expect(result).toBe(false);
342+
});
343+
344+
it("should return false on UTF-8 file with 3-byte sequence truncated at byte 510", () => {
345+
const file = path.join(FIXTURE_PATH, "510A-3byte.txt");
346+
const result = isBinaryFileSync(file);
347+
expect(result).toBe(false);
348+
});
349+
350+
it("should return false on UTF-8 file with 4-byte sequence truncated at byte 510", () => {
351+
const file = path.join(FIXTURE_PATH, "510A-4byte.txt");
352+
const result = isBinaryFileSync(file);
353+
expect(result).toBe(false);
354+
});
355+
356+
it("should return false on real-world Python file with UTF-8 at boundary (utf8-boundary-truncation bug case)", () => {
357+
const file = path.join(FIXTURE_PATH, "utf8-boundary-truncation_case.py");
358+
const result = isBinaryFileSync(file);
359+
expect(result).toBe(false);
360+
});

0 commit comments

Comments
 (0)