Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 69 additions & 17 deletions mssql_python/pybind/ddbc_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,19 @@
#define DAE_CHUNK_SIZE 8192
#define SQL_MAX_LOB_SIZE 8000

// Resolve the encoding actually used to decode SQL_C_CHAR column bytes.
// On Linux and macOS the ODBC driver has already transcoded the server's
// native encoding (e.g., CP1252) into UTF-8 before handing us SQL_C_CHAR
// data, so the caller-supplied encoding is ignored there. On Windows the
// driver hands back bytes in the server's native encoding, so the caller's
// choice is honored as-is.
inline std::string GetEffectiveCharDecoding(const std::string& userEncoding) {
#if defined(__APPLE__) || defined(__linux__)
    // Driver output is always UTF-8 on these platforms; the requested
    // encoding is deliberately unused.
    static_cast<void>(userEncoding);
    return std::string{"utf-8"};
#else
    return userEncoding;
#endif
}

//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Logging Infrastructure:
Expand Down Expand Up @@ -1153,7 +1166,8 @@ void SqlHandle::markImplicitlyFreed() {
// Log error but don't throw - we're likely in cleanup/destructor path
LOG_ERROR("SAFETY VIOLATION: Attempted to mark non-STMT handle as implicitly freed. "
"Handle type=%d. This will cause handle leak. Only STMT handles are "
"automatically freed by parent DBC handles.", _type);
"automatically freed by parent DBC handles.",
_type);
return; // Refuse to mark - let normal free() handle it
}
_implicitly_freed = true;
Expand Down Expand Up @@ -2875,17 +2889,18 @@ py::object FetchLobColumnData(SQLHSTMT hStmt, SQLUSMALLINT colIndex, SQLSMALLINT
return py::bytes(buffer.data(), buffer.size());
}

// For SQL_C_CHAR data, decode using the specified encoding
// For SQL_C_CHAR data, decode using the appropriate encoding.
const std::string effectiveCharEncoding = GetEffectiveCharDecoding(charEncoding);
py::bytes raw_bytes(buffer.data(), buffer.size());
try {
py::object decoded = raw_bytes.attr("decode")(charEncoding, "strict");
py::object decoded = raw_bytes.attr("decode")(effectiveCharEncoding, "strict");
LOG("FetchLobColumnData: Decoded narrow string with '%s' - %zu bytes -> %zu chars for "
"column %d",
charEncoding.c_str(), buffer.size(), py::len(decoded), colIndex);
effectiveCharEncoding.c_str(), buffer.size(), py::len(decoded), colIndex);
return decoded;
} catch (const py::error_already_set& e) {
LOG_ERROR("FetchLobColumnData: Failed to decode with '%s' for column %d: %s",
charEncoding.c_str(), colIndex, e.what());
effectiveCharEncoding.c_str(), colIndex, e.what());
// Return raw bytes as fallback
return raw_bytes;
}
Expand Down Expand Up @@ -2941,7 +2956,14 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
row.append(
FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding));
} else {
uint64_t fetchBufferSize = columnSize + 1 /* null-termination */;
// Use columnSize * 4 + 1 to accommodate worst-case UTF-8 expansion.
// columnSize is in characters, but the ODBC driver may return UTF-8
// encoded bytes where each character can be up to 4 bytes. This
// applies on Linux/macOS (driver always returns UTF-8 for SQL_C_CHAR)
// and on Windows when the database uses a UTF-8 collation. Without
// this, data at exact column boundary with multi-byte chars (e.g.,
// CP1252 é in VARCHAR(10)) causes truncation and corruption.
uint64_t fetchBufferSize = columnSize * 4 + 1 /* null-termination */;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In SQLGetData_wrap "× 4" is applied on all platforms (no #if guard), while SQLBindColums and FetchBatchData use "× 4" only on Linux/macOS. The comment says "and on Windows when the database uses a UTF-8 collation" but this is a different behavior than the other two sites. This inconsistency means:

On Windows with CP1252, SQLGetData_wrap allocates 4× the needed buffer (wasteful)
On Linux/macOS, SQLBindColums and FetchBatchData also allocate 4× (correct)

Can you document why SQLGetData_wrap intentionally differs from the other two functions?

std::vector<SQLCHAR> dataBuffer(fetchBufferSize);
SQLLEN dataLen;
ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(),
Expand All @@ -2952,20 +2974,23 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
uint64_t numCharsInData = dataLen / sizeof(SQLCHAR);
if (numCharsInData < dataBuffer.size()) {
// SQLGetData will null-terminate the data
// Use Python's codec system to decode bytes with specified encoding
// Use Python's codec system to decode bytes.
const std::string decodeEncoding =
GetEffectiveCharDecoding(charEncoding);
py::bytes raw_bytes(reinterpret_cast<char*>(dataBuffer.data()),
static_cast<size_t>(dataLen));
try {
py::object decoded =
raw_bytes.attr("decode")(charEncoding, "strict");
raw_bytes.attr("decode")(decodeEncoding, "strict");
row.append(decoded);
LOG("SQLGetData: CHAR column %d decoded with '%s', %zu bytes "
"-> %zu chars",
i, charEncoding.c_str(), (size_t)dataLen, py::len(decoded));
i, decodeEncoding.c_str(), (size_t)dataLen,
py::len(decoded));
} catch (const py::error_already_set& e) {
LOG_ERROR(
"SQLGetData: Failed to decode CHAR column %d with '%s': %s",
i, charEncoding.c_str(), e.what());
i, decodeEncoding.c_str(), e.what());
// Return raw bytes as fallback
row.append(raw_bytes);
}
Expand Down Expand Up @@ -3451,7 +3476,14 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
// TODO: handle variable length data correctly. This logic wont
// suffice
HandleZeroColumnSizeAtFetch(columnSize);
// Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8
// expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where
// each character can be up to 4 bytes.
#if defined(__APPLE__) || defined(__linux__)
uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The columnSize is an SQLULEN. Multiplying by 4 can overflow for very large column sizes (e.g., LONGVARCHAR where columnSize can be 2GB-1 = 2147483647). 2147483647 * 4 = 8589934588 bytes (~8 GB), which may cause an out-of-memory crash or excessive memory allocation.

There's already a TODO comment in the code acknowledging this (line 3487) below, which we put sometime back. I'd suggest adding an upper-bound check: if columnSize * 4 exceeds a reasonable threshold (e.g., SQL_MAX_LOB_SIZE or some configurable limit), fall back to LOB streaming instead of pre-allocating.

Do you think this would be a good idea?

#else
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
#endif
// TODO: For LONGVARCHAR/BINARY types, columnSize is returned as
// 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB.
// fetchSize=1 if columnSize>1GB. So we'll allocate a vector of
Expand Down Expand Up @@ -3598,7 +3630,8 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
// TODO: Move to anonymous namespace, since it is not used outside this file
SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& columnNames,
py::list& rows, SQLUSMALLINT numCols, SQLULEN& numRowsFetched,
const std::vector<SQLUSMALLINT>& lobColumns) {
const std::vector<SQLUSMALLINT>& lobColumns,
const std::string& charEncoding = "utf-8") {
LOG("FetchBatchData: Fetching data in batches");
SQLRETURN ret = SQLFetchScroll_ptr(hStmt, SQL_FETCH_NEXT, 0);
if (ret == SQL_NO_DATA) {
Expand Down Expand Up @@ -3628,8 +3661,22 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
std::find(lobColumns.begin(), lobColumns.end(), col + 1) != lobColumns.end();
columnInfos[col].processedColumnSize = columnInfos[col].columnSize;
HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize);
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where
// each character can be up to 4 bytes. Must match SQLBindColums buffer.
#if defined(__APPLE__) || defined(__linux__)
SQLSMALLINT dt = columnInfos[col].dataType;
bool isCharType = (dt == SQL_CHAR || dt == SQL_VARCHAR || dt == SQL_LONGVARCHAR);
if (isCharType) {
columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 +
1; // *4 for UTF-8, +1 for null terminator
} else {
columnInfos[col].fetchBufferSize =
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
}
#else
columnInfos[col].fetchBufferSize =
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
#endif
}

// Performance: Build function pointer dispatch table (once per batch)
Expand All @@ -3639,13 +3686,17 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
std::vector<ColumnProcessor> columnProcessors(numCols);
std::vector<ColumnInfoExt> columnInfosExt(numCols);

// Compute effective char encoding once for the batch (same for all columns)
const std::string effectiveCharEnc = GetEffectiveCharDecoding(charEncoding);

for (SQLUSMALLINT col = 0; col < numCols; col++) {
// Populate extended column info for processors that need it
columnInfosExt[col].dataType = columnInfos[col].dataType;
columnInfosExt[col].columnSize = columnInfos[col].columnSize;
columnInfosExt[col].processedColumnSize = columnInfos[col].processedColumnSize;
columnInfosExt[col].fetchBufferSize = columnInfos[col].fetchBufferSize;
columnInfosExt[col].isLob = columnInfos[col].isLob;
columnInfosExt[col].charEncoding = effectiveCharEnc;

// Map data type to processor function (switch executed once per column,
// not per cell)
Expand Down Expand Up @@ -4085,7 +4136,8 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)(intptr_t)fetchSize, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);

ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns,
charEncoding);
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
LOG("FetchMany_wrap: Error when fetching data - SQLRETURN=%d", ret);
return ret;
Expand All @@ -4094,10 +4146,10 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
// Reset attributes before returning to avoid using stack pointers later
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)1, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, NULL, 0);

// Unbind columns to allow subsequent fetchone() calls to use SQLGetData
SQLFreeStmt_ptr(hStmt, SQL_UNBIND);

return ret;
}

Expand Down Expand Up @@ -4221,8 +4273,8 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);

while (ret != SQL_NO_DATA) {
ret =
FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns,
charEncoding);
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
LOG("FetchAll_wrap: Error when fetching data - SQLRETURN=%d", ret);
return ret;
Expand All @@ -4232,7 +4284,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,
// Reset attributes before returning to avoid using stack pointers later
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)1, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, NULL, 0);

// Unbind columns to allow subsequent fetchone() calls to use SQLGetData
SQLFreeStmt_ptr(hStmt, SQL_UNBIND);

Expand Down
46 changes: 37 additions & 9 deletions mssql_python/pybind/ddbc_bindings.h
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,7 @@ struct ColumnInfoExt {
SQLULEN processedColumnSize;
uint64_t fetchBufferSize;
bool isLob;
std::string charEncoding; // Effective decoding encoding for SQL_C_CHAR data
};

// Forward declare FetchLobColumnData (defined in ddbc_bindings.cpp) - MUST be
Expand Down Expand Up @@ -811,21 +812,48 @@ inline void ProcessChar(PyObject* row, ColumnBuffers& buffers, const void* colIn
// fetchBufferSize includes null-terminator, numCharsInData doesn't. Hence
// '<'
if (!colInfo->isLob && numCharsInData < colInfo->fetchBufferSize) {
// Performance: Direct Python C API call - create string from buffer
PyObject* pyStr = PyUnicode_FromStringAndSize(
reinterpret_cast<char*>(
&buffers.charBuffers[col - 1][rowIdx * colInfo->fetchBufferSize]),
numCharsInData);
const char* dataPtr = reinterpret_cast<char*>(
&buffers.charBuffers[col - 1][rowIdx * colInfo->fetchBufferSize]);
PyObject* pyStr = nullptr;
#if defined(__APPLE__) || defined(__linux__)
// On Linux/macOS, ODBC driver returns UTF-8 — PyUnicode_FromStringAndSize
// expects UTF-8, so this is correct and fast.
pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData);
#else
// On Windows, ODBC driver returns bytes in the server's native encoding.
// For UTF-8, use the direct C API (PyUnicode_FromStringAndSize) which
// bypasses the codec registry for maximum reliability. For non-UTF-8
// encodings (e.g., CP1252), use PyUnicode_Decode with the codec registry.
if (colInfo->charEncoding == "utf-8") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This string comparison happens per-cell in the batch fetch hot loop (called for every row × every CHAR column). While small, it's unnecessary overhead. Can we mitigate this please?

pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData);
} else {
pyStr =
PyUnicode_Decode(dataPtr, numCharsInData, colInfo->charEncoding.c_str(), "strict");
}
#endif
if (!pyStr) {
Py_INCREF(Py_None);
PyList_SET_ITEM(row, col - 1, Py_None);
// Decode failed — fall back to returning raw bytes (consistent with
// FetchLobColumnData and SQLGetData_wrap which also return raw bytes
// on decode failure instead of silently converting to None).
PyErr_Clear();
PyObject* pyBytes = PyBytes_FromStringAndSize(dataPtr, numCharsInData);
if (pyBytes) {
PyList_SET_ITEM(row, col - 1, pyBytes);
} else {
PyErr_Clear();
Py_INCREF(Py_None);
PyList_SET_ITEM(row, col - 1, Py_None);
}
} else {
PyList_SET_ITEM(row, col - 1, pyStr);
}
} else {
// Slow path: LOB data requires separate fetch call
PyList_SET_ITEM(row, col - 1,
FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false).release().ptr());
PyList_SET_ITEM(
row, col - 1,
FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false, colInfo->charEncoding)
.release()
.ptr());
}
}

Expand Down
1 change: 1 addition & 0 deletions tests/test_013_encoding_decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5697,6 +5697,7 @@ def test_default_encoding_behavior_validation(conn_str):
def test_encoding_with_bytes_and_bytearray_parameters(db_connection):
"""Test encoding with bytes and bytearray parameters (SQL_C_CHAR path)."""
db_connection.setencoding(encoding="utf-8", ctype=mssql_python.SQL_CHAR)
db_connection.setdecoding(mssql_python.SQL_CHAR, encoding="utf-8")

cursor = db_connection.cursor()
try:
Expand Down
Loading
Loading