diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 3f8a3a759545..69b3b2f17028 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -520,17 +520,20 @@ def func(iterator):
             raise TypeError("path can be only string, list or RDD")

     @since(1.5)
-    def orc(self, path, recursiveFileLookup=None):
+    def orc(self, path, mergeSchema=None, recursiveFileLookup=None):
         """Loads ORC files, returning the result as a :class:`DataFrame`.

+        :param mergeSchema: sets whether we should merge schemas collected from all
+            ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
+            The default value is specified in ``spark.sql.orc.mergeSchema``.
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
-            disables `partition discovery`_.
+            disables `partition discovery`_.

         >>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
         >>> df.dtypes
         [('a', 'bigint'), ('b', 'int'), ('c', 'int')]
         """
-        self._set_opts(recursiveFileLookup=recursiveFileLookup)
+        self._set_opts(mergeSchema=mergeSchema, recursiveFileLookup=recursiveFileLookup)
         if isinstance(path, basestring):
             path = [path]
         return self._df(self._jreader.orc(_to_seq(self._spark._sc, path)))
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 93b4c7895386..459314732461 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -514,13 +514,16 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
             raise TypeError("path can be only a single string")

     @since(2.3)
-    def orc(self, path, recursiveFileLookup=None):
+    def orc(self, path, mergeSchema=None, recursiveFileLookup=None):
         """Loads a ORC file stream, returning the result as a :class:`DataFrame`.

         .. note:: Evolving.

+        :param mergeSchema: sets whether we should merge schemas collected from all
+            ORC part-files. This will override ``spark.sql.orc.mergeSchema``.
+            The default value is specified in ``spark.sql.orc.mergeSchema``.
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
-            disables `partition discovery`_.
+            disables `partition discovery`_.

         >>> orc_sdf = spark.readStream.schema(sdf_schema).orc(tempfile.mkdtemp())
         >>> orc_sdf.isStreaming
@@ -528,7 +531,7 @@ def orc(self, path, recursiveFileLookup=None):
         >>> orc_sdf.schema == sdf_schema
         True
         """
-        self._set_opts(recursiveFileLookup=recursiveFileLookup)
+        self._set_opts(mergeSchema=mergeSchema, recursiveFileLookup=recursiveFileLookup)
         if isinstance(path, basestring):
             return self._df(self._jreader.orc(path))
         else:
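
For reference, a minimal sketch of how the new `mergeSchema` keyword would be used once this change lands. The directory layout and column names below are hypothetical; per the diff, the keyword is just a convenience for setting the `mergeSchema` read option, overriding `spark.sql.orc.mergeSchema` for that read only:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("orc-merge-schema-demo").getOrCreate()

# Hypothetical setup: two sets of ORC part-files with different (but
# compatible) schemas under the same root directory.
spark.range(5).selectExpr("id AS a") \
    .write.mode("overwrite").orc("/tmp/orc_demo/key=1")
spark.range(5).selectExpr("id AS a", "id * 2 AS b") \
    .write.mode("overwrite").orc("/tmp/orc_demo/key=2")

# Without mergeSchema, the reader infers the schema from a single part-file,
# so column `b` may be missing. With mergeSchema=True, schemas from all
# part-files are merged, overriding spark.sql.orc.mergeSchema for this read.
df = spark.read.orc("/tmp/orc_demo", mergeSchema=True)
df.printSchema()

# Equivalent spelling through the generic option API:
df2 = spark.read.option("mergeSchema", "true").orc("/tmp/orc_demo")

# The streaming reader gains the same keyword (file stream sources still
# need an explicit schema up front):
stream_df = spark.readStream.schema(df.schema).orc("/tmp/orc_demo", mergeSchema=True)
```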