-
Notifications
You must be signed in to change notification settings - Fork 36
FIX: VARCHAR fetch fails when data length equals column size with non-ASCII CP1252 characters #444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
82c580f
70c00bc
016e549
e324c29
90d0919
f89ad95
ef8af68
645ff6a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,19 @@ | |
| #define DAE_CHUNK_SIZE 8192 | ||
| #define SQL_MAX_LOB_SIZE 8000 | ||
|
|
||
| // Returns the effective character decoding encoding for SQL_C_CHAR data. | ||
| // On Linux/macOS, the ODBC driver always returns UTF-8 for SQL_C_CHAR, | ||
| // having already converted from the server's encoding (e.g., CP1252). | ||
| // On Windows, the driver returns bytes in the server's native encoding. | ||
| inline std::string GetEffectiveCharDecoding(const std::string& userEncoding) { | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| (void)userEncoding; | ||
| return "utf-8"; | ||
| #else | ||
| return userEncoding; | ||
| #endif | ||
| } | ||
|
|
||
| //------------------------------------------------------------------------------------------------- | ||
| //------------------------------------------------------------------------------------------------- | ||
| // Logging Infrastructure: | ||
|
|
@@ -1153,7 +1166,8 @@ void SqlHandle::markImplicitlyFreed() { | |
| // Log error but don't throw - we're likely in cleanup/destructor path | ||
| LOG_ERROR("SAFETY VIOLATION: Attempted to mark non-STMT handle as implicitly freed. " | ||
| "Handle type=%d. This will cause handle leak. Only STMT handles are " | ||
| "automatically freed by parent DBC handles.", _type); | ||
| "automatically freed by parent DBC handles.", | ||
| _type); | ||
| return; // Refuse to mark - let normal free() handle it | ||
| } | ||
| _implicitly_freed = true; | ||
|
|
@@ -2875,17 +2889,18 @@ py::object FetchLobColumnData(SQLHSTMT hStmt, SQLUSMALLINT colIndex, SQLSMALLINT | |
| return py::bytes(buffer.data(), buffer.size()); | ||
| } | ||
|
|
||
| // For SQL_C_CHAR data, decode using the specified encoding | ||
| // For SQL_C_CHAR data, decode using the appropriate encoding. | ||
| const std::string effectiveCharEncoding = GetEffectiveCharDecoding(charEncoding); | ||
| py::bytes raw_bytes(buffer.data(), buffer.size()); | ||
| try { | ||
| py::object decoded = raw_bytes.attr("decode")(charEncoding, "strict"); | ||
| py::object decoded = raw_bytes.attr("decode")(effectiveCharEncoding, "strict"); | ||
| LOG("FetchLobColumnData: Decoded narrow string with '%s' - %zu bytes -> %zu chars for " | ||
| "column %d", | ||
| charEncoding.c_str(), buffer.size(), py::len(decoded), colIndex); | ||
| effectiveCharEncoding.c_str(), buffer.size(), py::len(decoded), colIndex); | ||
| return decoded; | ||
| } catch (const py::error_already_set& e) { | ||
| LOG_ERROR("FetchLobColumnData: Failed to decode with '%s' for column %d: %s", | ||
| charEncoding.c_str(), colIndex, e.what()); | ||
| effectiveCharEncoding.c_str(), colIndex, e.what()); | ||
| // Return raw bytes as fallback | ||
| return raw_bytes; | ||
| } | ||
|
|
@@ -2941,7 +2956,14 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p | |
| row.append( | ||
| FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding)); | ||
| } else { | ||
| uint64_t fetchBufferSize = columnSize + 1 /* null-termination */; | ||
| // Use columnSize * 4 + 1 to accommodate worst-case UTF-8 expansion. | ||
| // columnSize is in characters, but the ODBC driver may return UTF-8 | ||
| // encoded bytes where each character can be up to 4 bytes. This | ||
| // applies on Linux/macOS (driver always returns UTF-8 for SQL_C_CHAR) | ||
| // and on Windows when the database uses a UTF-8 collation. Without | ||
| // this, data at exact column boundary with multi-byte chars (e.g., | ||
| // CP1252 é in VARCHAR(10)) causes truncation and corruption. | ||
| uint64_t fetchBufferSize = columnSize * 4 + 1 /* null-termination */; | ||
| std::vector<SQLCHAR> dataBuffer(fetchBufferSize); | ||
| SQLLEN dataLen; | ||
| ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(), | ||
|
|
@@ -2952,20 +2974,23 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p | |
| uint64_t numCharsInData = dataLen / sizeof(SQLCHAR); | ||
| if (numCharsInData < dataBuffer.size()) { | ||
| // SQLGetData will null-terminate the data | ||
| // Use Python's codec system to decode bytes with specified encoding | ||
| // Use Python's codec system to decode bytes. | ||
| const std::string decodeEncoding = | ||
| GetEffectiveCharDecoding(charEncoding); | ||
| py::bytes raw_bytes(reinterpret_cast<char*>(dataBuffer.data()), | ||
| static_cast<size_t>(dataLen)); | ||
| try { | ||
| py::object decoded = | ||
| raw_bytes.attr("decode")(charEncoding, "strict"); | ||
| raw_bytes.attr("decode")(decodeEncoding, "strict"); | ||
| row.append(decoded); | ||
| LOG("SQLGetData: CHAR column %d decoded with '%s', %zu bytes " | ||
| "-> %zu chars", | ||
| i, charEncoding.c_str(), (size_t)dataLen, py::len(decoded)); | ||
| i, decodeEncoding.c_str(), (size_t)dataLen, | ||
| py::len(decoded)); | ||
| } catch (const py::error_already_set& e) { | ||
| LOG_ERROR( | ||
| "SQLGetData: Failed to decode CHAR column %d with '%s': %s", | ||
| i, charEncoding.c_str(), e.what()); | ||
| i, decodeEncoding.c_str(), e.what()); | ||
| // Return raw bytes as fallback | ||
| row.append(raw_bytes); | ||
| } | ||
|
|
@@ -3451,7 +3476,14 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column | |
| // TODO: handle variable length data correctly. This logic wont | ||
| // suffice | ||
| HandleZeroColumnSizeAtFetch(columnSize); | ||
| // Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8 | ||
| // expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where | ||
| // each character can be up to 4 bytes. | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's already a TODO comment in the code acknowledging this (line 3487, below), which we added some time back. I'd suggest adding an upper-bound check: if columnSize * 4 exceeds a reasonable threshold (e.g., SQL_MAX_LOB_SIZE or some configurable limit), fall back to LOB streaming instead of pre-allocating. Do you think this would be a good idea? |
||
| #else | ||
| uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; | ||
| #endif | ||
| // TODO: For LONGVARCHAR/BINARY types, columnSize is returned as | ||
| // 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB. | ||
| // fetchSize=1 if columnSize>1GB. So we'll allocate a vector of | ||
|
|
@@ -3598,7 +3630,8 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column | |
| // TODO: Move to anonymous namespace, since it is not used outside this file | ||
| SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& columnNames, | ||
| py::list& rows, SQLUSMALLINT numCols, SQLULEN& numRowsFetched, | ||
| const std::vector<SQLUSMALLINT>& lobColumns) { | ||
| const std::vector<SQLUSMALLINT>& lobColumns, | ||
| const std::string& charEncoding = "utf-8") { | ||
| LOG("FetchBatchData: Fetching data in batches"); | ||
| SQLRETURN ret = SQLFetchScroll_ptr(hStmt, SQL_FETCH_NEXT, 0); | ||
| if (ret == SQL_NO_DATA) { | ||
|
|
@@ -3628,8 +3661,22 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum | |
| std::find(lobColumns.begin(), lobColumns.end(), col + 1) != lobColumns.end(); | ||
| columnInfos[col].processedColumnSize = columnInfos[col].columnSize; | ||
| HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize); | ||
| // On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where | ||
| // each character can be up to 4 bytes. Must match SQLBindColums buffer. | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| SQLSMALLINT dt = columnInfos[col].dataType; | ||
| bool isCharType = (dt == SQL_CHAR || dt == SQL_VARCHAR || dt == SQL_LONGVARCHAR); | ||
| if (isCharType) { | ||
| columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 + | ||
| 1; // *4 for UTF-8, +1 for null terminator | ||
| } else { | ||
| columnInfos[col].fetchBufferSize = | ||
| columnInfos[col].processedColumnSize + 1; // +1 for null terminator | ||
| } | ||
| #else | ||
| columnInfos[col].fetchBufferSize = | ||
| columnInfos[col].processedColumnSize + 1; // +1 for null terminator | ||
| #endif | ||
| } | ||
|
|
||
| // Performance: Build function pointer dispatch table (once per batch) | ||
|
|
@@ -3639,13 +3686,17 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum | |
| std::vector<ColumnProcessor> columnProcessors(numCols); | ||
| std::vector<ColumnInfoExt> columnInfosExt(numCols); | ||
|
|
||
| // Compute effective char encoding once for the batch (same for all columns) | ||
| const std::string effectiveCharEnc = GetEffectiveCharDecoding(charEncoding); | ||
|
|
||
| for (SQLUSMALLINT col = 0; col < numCols; col++) { | ||
| // Populate extended column info for processors that need it | ||
| columnInfosExt[col].dataType = columnInfos[col].dataType; | ||
| columnInfosExt[col].columnSize = columnInfos[col].columnSize; | ||
| columnInfosExt[col].processedColumnSize = columnInfos[col].processedColumnSize; | ||
| columnInfosExt[col].fetchBufferSize = columnInfos[col].fetchBufferSize; | ||
| columnInfosExt[col].isLob = columnInfos[col].isLob; | ||
| columnInfosExt[col].charEncoding = effectiveCharEnc; | ||
|
|
||
| // Map data type to processor function (switch executed once per column, | ||
| // not per cell) | ||
|
|
@@ -4085,7 +4136,8 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch | |
| SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)(intptr_t)fetchSize, 0); | ||
| SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0); | ||
|
|
||
| ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns); | ||
| ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns, | ||
| charEncoding); | ||
| if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) { | ||
| LOG("FetchMany_wrap: Error when fetching data - SQLRETURN=%d", ret); | ||
| return ret; | ||
|
|
@@ -4094,10 +4146,10 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch | |
| // Reset attributes before returning to avoid using stack pointers later | ||
| SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)1, 0); | ||
| SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, NULL, 0); | ||
|
|
||
| // Unbind columns to allow subsequent fetchone() calls to use SQLGetData | ||
| SQLFreeStmt_ptr(hStmt, SQL_UNBIND); | ||
|
|
||
| return ret; | ||
| } | ||
|
|
||
|
|
@@ -4221,8 +4273,8 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows, | |
| SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0); | ||
|
|
||
| while (ret != SQL_NO_DATA) { | ||
| ret = | ||
| FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns); | ||
| ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns, | ||
| charEncoding); | ||
| if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) { | ||
| LOG("FetchAll_wrap: Error when fetching data - SQLRETURN=%d", ret); | ||
| return ret; | ||
|
|
@@ -4232,7 +4284,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows, | |
| // Reset attributes before returning to avoid using stack pointers later | ||
| SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)1, 0); | ||
| SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, NULL, 0); | ||
|
|
||
| // Unbind columns to allow subsequent fetchone() calls to use SQLGetData | ||
| SQLFreeStmt_ptr(hStmt, SQL_UNBIND); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -667,6 +667,7 @@ struct ColumnInfoExt { | |
| SQLULEN processedColumnSize; | ||
| uint64_t fetchBufferSize; | ||
| bool isLob; | ||
| std::string charEncoding; // Effective decoding encoding for SQL_C_CHAR data | ||
| }; | ||
|
|
||
| // Forward declare FetchLobColumnData (defined in ddbc_bindings.cpp) - MUST be | ||
|
|
@@ -811,21 +812,48 @@ inline void ProcessChar(PyObject* row, ColumnBuffers& buffers, const void* colIn | |
| // fetchBufferSize includes null-terminator, numCharsInData doesn't. Hence | ||
| // '<' | ||
| if (!colInfo->isLob && numCharsInData < colInfo->fetchBufferSize) { | ||
| // Performance: Direct Python C API call - create string from buffer | ||
| PyObject* pyStr = PyUnicode_FromStringAndSize( | ||
| reinterpret_cast<char*>( | ||
| &buffers.charBuffers[col - 1][rowIdx * colInfo->fetchBufferSize]), | ||
| numCharsInData); | ||
| const char* dataPtr = reinterpret_cast<char*>( | ||
| &buffers.charBuffers[col - 1][rowIdx * colInfo->fetchBufferSize]); | ||
| PyObject* pyStr = nullptr; | ||
| #if defined(__APPLE__) || defined(__linux__) | ||
| // On Linux/macOS, ODBC driver returns UTF-8 — PyUnicode_FromStringAndSize | ||
| // expects UTF-8, so this is correct and fast. | ||
| pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData); | ||
| #else | ||
| // On Windows, ODBC driver returns bytes in the server's native encoding. | ||
| // For UTF-8, use the direct C API (PyUnicode_FromStringAndSize) which | ||
| // bypasses the codec registry for maximum reliability. For non-UTF-8 | ||
| // encodings (e.g., CP1252), use PyUnicode_Decode with the codec registry. | ||
| if (colInfo->charEncoding == "utf-8") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This string comparison happens per-cell in the batch fetch hot loop (called for every row × every CHAR column). While small, it's unnecessary overhead. Can we mitigate this, please? |
||
| pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData); | ||
| } else { | ||
| pyStr = | ||
| PyUnicode_Decode(dataPtr, numCharsInData, colInfo->charEncoding.c_str(), "strict"); | ||
| } | ||
| #endif | ||
| if (!pyStr) { | ||
| Py_INCREF(Py_None); | ||
| PyList_SET_ITEM(row, col - 1, Py_None); | ||
| // Decode failed — fall back to returning raw bytes (consistent with | ||
| // FetchLobColumnData and SQLGetData_wrap which also return raw bytes | ||
| // on decode failure instead of silently converting to None). | ||
| PyErr_Clear(); | ||
| PyObject* pyBytes = PyBytes_FromStringAndSize(dataPtr, numCharsInData); | ||
| if (pyBytes) { | ||
| PyList_SET_ITEM(row, col - 1, pyBytes); | ||
| } else { | ||
| PyErr_Clear(); | ||
| Py_INCREF(Py_None); | ||
| PyList_SET_ITEM(row, col - 1, Py_None); | ||
| } | ||
| } else { | ||
| PyList_SET_ITEM(row, col - 1, pyStr); | ||
| } | ||
| } else { | ||
| // Slow path: LOB data requires separate fetch call | ||
| PyList_SET_ITEM(row, col - 1, | ||
| FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false).release().ptr()); | ||
| PyList_SET_ITEM( | ||
| row, col - 1, | ||
| FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false, colInfo->charEncoding) | ||
| .release() | ||
| .ptr()); | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In
`SQLGetData_wrap`, the "× 4" multiplier is applied on all platforms (no #if guard), while `SQLBindColums` and `FetchBatchData` apply "× 4" only on Linux/macOS. The comment says "and on Windows when the database uses a UTF-8 collation", but this is different behavior from the other two sites. This inconsistency means:
On Windows with CP1252, SQLGetData_wrap allocates 4× the needed buffer (wasteful)
On Linux/macOS, SQLBindColums and FetchBatchData also allocate 4× (correct)
Can you document why `SQLGetData_wrap` intentionally differs from the other two functions?