Skip to content

to_sql into MSSQL datetimeoffset column drops offset for DataFrame with one row #30884

Open
@gordthompson

Description

@gordthompson

Problem description

(Prompted by this Stack Overflow question.)

When using to_sql with mssql+pyodbc and fast_executemany=True, uploading a DataFrame with a single row containing a tz-aware datetime into a datetimeoffset column causes the timezone offset to be lost. Doing the same thing with a DataFrame containing more than one row produces the correct result.

from pprint import pprint
import sys
import urllib

import pandas as pd
import sqlalchemy as sa

print(sys.version)
# 3.8.1 (tags/v3.8.1:1b293b6, Dec 18 2019, 22:39:24) [MSC v.1916 32 bit (Intel)]

print(f'SQLAlchemy {sa.__version__}, pandas {pd.__version__}, pyodbc {pyodbc.version}')
# SQLAlchemy 1.3.12, pandas 0.25.3, pyodbc 4.0.28

connection_string = (
    r'DRIVER=ODBC Driver 17 for SQL Server;'
    r'SERVER=localhost,49242;'
    r'DATABASE=myDb;'
    r'Trusted_Connection=Yes;'
    r'UseFMTONLY=Yes;'
)
connection_uri = 'mssql+pyodbc:///?odbc_connect=' + urllib.parse.quote_plus(connection_string)
engine = sa.create_engine(connection_uri, fast_executemany=True)

# test environment
table_name = 'DateTimeOffset_Test'
engine.execute(sa.text(f"DROP TABLE IF EXISTS [{table_name}]"))
engine.execute(sa.text(f"CREATE TABLE [{table_name}] (id int primary key, dto datetimeoffset)"))

# test data
my_tz = datetime.timezone(datetime.timedelta(hours=-7))
dto_value = datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=my_tz)
print(dto_value)  # 2020-01-01 00:00:00-07:00
#                                        ^

# test code
num_rows = 1
row_data = [(x, dto_value) for x in range(num_rows)]
df = pd.DataFrame(row_data, columns=['id', 'dto'])
print(df)
#    id                       dto
# 0   0 2020-01-01 00:00:00-07:00
#                            ^ - good
df.to_sql(table_name, engine, if_exists='append', index=False)
result = engine.execute(sa.text(f"SELECT id, CAST(dto as varchar(50)) AS foo FROM [{table_name}]")).fetchall()
pprint(result)
# [(1, '2020-01-01 00:00:00.0000000 +00:00')]
#                                     ^ - bad

Simply changing num_rows = 1 to num_rows = 2 produces correct results.

# test code
num_rows = 2
row_data = [(x, dto_value) for x in range(num_rows)]
df = pd.DataFrame(row_data, columns=['id', 'dto'])
print(df)
#    id                       dto
# 0   0 2020-01-01 00:00:00-07:00
# 1   1 2020-01-01 00:00:00-07:00
#                            ^ - good
df.to_sql(table_name, engine, if_exists='append', index=False)
result = engine.execute(sa.text(f"SELECT id, CAST(dto as varchar(50)) AS foo FROM [{table_name}]")).fetchall()
pprint(result)
# [(0, '2020-01-01 00:00:00.0000000 -07:00'),
#  (1, '2020-01-01 00:00:00.0000000 -07:00')]
#                                     ^ - good

Output of pd.show_versions()

INSTALLED VERSIONS

commit : None
python : 3.8.1.final.0
python-bits : 32
OS : Windows
OS-release : 8.1
machine : AMD64
processor : AMD64 Family 18 Model 1 Stepping 0, AuthenticAMD
byteorder : little
LC_ALL : None
LANG : None
LOCALE : English_United States.1252

pandas : 0.25.3
numpy : 1.18.1
pytz : 2019.3
dateutil : 2.8.1
pip : 19.3.1
setuptools : 44.0.0
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : 0.9.3
psycopg2 : 2.8.4 (dt dec pq3 ext)
jinja2 : None
IPython : None
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
s3fs : None
scipy : None
sqlalchemy : 1.3.12
tables : None
xarray : None
xlrd : None
xlwt : None
xlsxwriter : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    BugIO SQLto_sql, read_sql, read_sql_query

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions