33import inspect
44from io import BufferedIOBase , BytesIO , RawIOBase
55import os
6+ from pathlib import Path
67from textwrap import fill
7- from typing import Any , Dict , Mapping , Union , cast
8+ from typing import Any , Dict , Mapping , Union , cast , BinaryIO
89import warnings
10+ from zipfile import ZipFile
911
1012from pandas ._config import config
1113
@@ -888,32 +890,73 @@ def close(self):
888890 return content
889891
890892
891- def _is_ods_stream (stream : Union [BufferedIOBase , RawIOBase ] ) -> bool :
893+ def _peek (stream : Union [BufferedIOBase , RawIOBase , BinaryIO ], size : int = 20 ) -> bytes :
892894 """
893- Check if the stream is an OpenDocument Spreadsheet (.ods) file
894-
895- It uses magic values inside the stream
895+ Return the specified number of bytes from the start of the stream
896+ and seek back to the start of the stream afterwards.
896897
897898 Parameters
898899 ----------
899900 stream : Union[BufferedIOBase, RawIOBase]
900- IO stream with data which might be an ODS file
901901
902902 Returns
903903 -------
904- is_ods : bool
905- Boolean indication that this is indeed an ODS file or not
904+ content : bytes
905+ The bytes founds.
906906 """
907907 stream .seek (0 )
908- is_ods = False
909- if stream .read (4 ) == b"PK\003 \004 " :
910- stream .seek (30 )
911- is_ods = (
912- stream .read (54 ) == b"mimetype"
913- b"application/vnd.oasis.opendocument.spreadsheet"
914- )
908+ content = stream .read (size )
915909 stream .seek (0 )
916- return is_ods
910+ return content
911+
912+
913+ _XLS_SIGNATURE = b"\xD0 \xCF \x11 \xE0 \xA1 \xB1 \x1A \xE1 "
914+ _ZIP_SIGNATURE = b"PK\x03 \x04 "
915+ _PEEK_SIZE = max (len (_XLS_SIGNATURE ), len (_ZIP_SIGNATURE ))
916+
917+
def _engine_from_content(
    stream: Union[BufferedIOBase, RawIOBase, BinaryIO]
) -> Union[str, None]:
    """
    Use the content of a stream to try and figure out which engine to use.

    It uses magic values at the start of the stream and, for ZIP
    containers, the names of the archive members.

    Parameters
    ----------
    stream : Union[BufferedIOBase, RawIOBase, BinaryIO]
        Seekable IO stream with data which might contain spreadsheet data.
        The stream is rewound to its start before this function returns
        or raises.

    Returns
    -------
    engine : str or None
        The engine name if it can be confidently inferred, otherwise None.

    Raises
    ------
    zipfile.BadZipFile
        If the stream starts with the ZIP signature but ``ZipFile`` cannot
        read it as an archive.
    """
    engine = None
    peek = _peek(stream, _PEEK_SIZE)

    if peek.startswith(_XLS_SIGNATURE):
        engine = "xlrd"

    elif peek.startswith(_ZIP_SIGNATURE):
        try:
            # ZipFile does not close a file object it was handed, so leaving
            # the ``with`` block only releases the archive's own state.
            with ZipFile(stream) as zf:
                # Workaround for some third party files that use backslashes
                # and mixed-case names: normalise every member name to
                # forward slashes and lower case before looking it up.
                component_names = {
                    name.replace("\\", "/").lower() for name in zf.namelist()
                }
        finally:
            # Always hand the stream back at position 0, even on failure.
            stream.seek(0)

        # Independent checks on purpose: when several markers are present a
        # later match overrides an earlier one (odf > pyxlsb > openpyxl).
        if "xl/workbook.xml" in component_names:
            engine = "openpyxl"
        if "xl/workbook.bin" in component_names:
            engine = "pyxlsb"
        if "content.xml" in component_names:
            engine = "odf"

    return engine
917960
918961
919962class ExcelFile :
@@ -970,21 +1013,39 @@ class ExcelFile:
9701013 "pyxlsb" : PyxlsbReader ,
9711014 }
9721015
1016+ _ext_to_engine : Mapping [str , str ] = {
1017+ ".ods" : "odf" ,
1018+ ".xls" : "xlrd" ,
1019+ ".xlsx" : "openpyxl" ,
1020+ }
1021+
9731022 def __init__ (
9741023 self , path_or_buffer , engine = None , storage_options : StorageOptions = None
9751024 ):
9761025 if engine is None :
977- # Determine ext and use odf for ods stream/file
1026+
1027+ ext = peek = None
1028+
1029+ if isinstance (path_or_buffer , bytes ):
1030+ path_or_buffer = BytesIO (path_or_buffer )
1031+
9781032 if isinstance (path_or_buffer , (BufferedIOBase , RawIOBase )):
979- ext = None
980- if _is_ods_stream (path_or_buffer ):
981- engine = "odf"
982- else :
1033+ engine = _engine_from_content ( path_or_buffer )
1034+ peek = _peek (path_or_buffer )
1035+
1036+ elif isinstance ( path_or_buffer , ( str , os . PathLike )) :
9831037 ext = os .path .splitext (str (path_or_buffer ))[- 1 ]
984- if ext == ".ods" :
985- engine = "odf"
1038+ handles = get_handle (
1039+ stringify_path (path_or_buffer ),
1040+ "rb" ,
1041+ storage_options = storage_options ,
1042+ is_text = False ,
1043+ )
1044+ with handles :
1045+ engine = _engine_from_content (handles .handle )
1046+ peek = _peek (handles .handle )
9861047
987- if (
1048+ elif (
9881049 import_optional_dependency (
9891050 "xlrd" , raise_on_missing = False , on_version = "ignore"
9901051 )
@@ -995,38 +1056,16 @@ def __init__(
9951056 if isinstance (path_or_buffer , Book ):
9961057 engine = "xlrd"
9971058
998- # GH 35029 - Prefer openpyxl except for xls files
1059+ # Couldn't tell for definite, so guess based on extension:
9991060 if engine is None :
1000- if ext is None or isinstance (path_or_buffer , bytes ) or ext == ".xls" :
1001- engine = "xlrd"
1002- elif (
1003- import_optional_dependency (
1004- "openpyxl" , raise_on_missing = False , on_version = "ignore"
1005- )
1006- is not None
1007- ):
1008- engine = "openpyxl"
1009- else :
1010- caller = inspect .stack ()[1 ]
1011- if (
1012- caller .filename .endswith ("pandas/io/excel/_base.py" )
1013- and caller .function == "read_excel"
1014- ):
1015- stacklevel = 4
1016- else :
1017- stacklevel = 2
1018- warnings .warn (
1019- "The xlrd engine is no longer maintained and is not "
1020- "supported when using pandas with python >= 3.9. However, "
1021- "the engine xlrd will continue to be allowed for the "
1022- "indefinite future. Beginning with pandas 1.2.0, the "
1023- "openpyxl engine will be used if it is installed and the "
1024- "engine argument is not specified. Either install openpyxl "
1025- "or specify engine='xlrd' to silence this warning." ,
1026- FutureWarning ,
1027- stacklevel = stacklevel ,
1028- )
1029- engine = "xlrd"
1061+ engine = self ._ext_to_engine .get (ext )
1062+
1063+ if engine is None :
1064+ raise ValueError (
1065+ f"Could not find engine for { path_or_buffer !r} , content was "
1066+ f"{ peek !r} "
1067+ )
1068+
10301069 if engine not in self ._engines :
10311070 raise ValueError (f"Unknown engine: { engine } " )
10321071
0 commit comments