From d1d3e4fe82d476575af9528b1a3dce1fd2ff978d Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 11 Apr 2021 12:53:58 -0500 Subject: [PATCH 1/8] ENH: Add IO support for R data files with pandas.read_rdata and DataFrame.to_rdata --- doc/source/getting_started/install.rst | 2 + doc/source/user_guide/io.rst | 300 +++ doc/source/whatsnew/v1.3.0.rst | 103 + environment.yml | 1 + pandas/__init__.py | 1 + pandas/core/frame.py | 225 ++ pandas/io/api.py | 1 + pandas/io/rdata.py | 1826 +++++++++++++++++ .../io/data/rdata/climate_non_utf8_df.rda | Bin 0 -> 423 bytes .../io/data/rdata/climate_non_utf8_df.rds | Bin 0 -> 400 bytes pandas/tests/io/data/rdata/env_data_dfs.rda | Bin 0 -> 7259 bytes .../tests/io/data/rdata/env_data_non_dfs.rda | Bin 0 -> 8948 bytes pandas/tests/io/data/rdata/env_data_objs.rda | Bin 0 -> 13735 bytes pandas/tests/io/data/rdata/ghg_df.rds | Bin 0 -> 1475 bytes pandas/tests/io/data/rdata/ghg_t_tests.rds | Bin 0 -> 1136 bytes pandas/tests/io/data/rdata/plants_arry.rds | Bin 0 -> 584 bytes pandas/tests/io/data/rdata/plants_df.rds | Bin 0 -> 325 bytes pandas/tests/io/data/rdata/ppm_df.csv | 757 +++++++ pandas/tests/io/data/rdata/ppm_ts.rds | Bin 0 -> 9004 bytes pandas/tests/io/data/rdata/sea_ice_df.rds | Bin 0 -> 5374 bytes pandas/tests/io/data/rdata/species_mtx.rds | Bin 0 -> 1907 bytes pandas/tests/io/rdata/test_pyreadr.py | 596 ++++++ pandas/tests/io/rdata/test_rscript.py | 972 +++++++++ requirements-dev.txt | 1 + 24 files changed, 4785 insertions(+) create mode 100644 pandas/io/rdata.py create mode 100644 pandas/tests/io/data/rdata/climate_non_utf8_df.rda create mode 100644 pandas/tests/io/data/rdata/climate_non_utf8_df.rds create mode 100644 pandas/tests/io/data/rdata/env_data_dfs.rda create mode 100644 pandas/tests/io/data/rdata/env_data_non_dfs.rda create mode 100644 pandas/tests/io/data/rdata/env_data_objs.rda create mode 100644 pandas/tests/io/data/rdata/ghg_df.rds create mode 100644 pandas/tests/io/data/rdata/ghg_t_tests.rds create mode 100644 pandas/tests/io/data/rdata/plants_arry.rds create mode 100644 pandas/tests/io/data/rdata/plants_df.rds create mode 100644 pandas/tests/io/data/rdata/ppm_df.csv create mode 100644 pandas/tests/io/data/rdata/ppm_ts.rds create mode 100644 pandas/tests/io/data/rdata/sea_ice_df.rds create mode 100644 pandas/tests/io/data/rdata/species_mtx.rds create mode 100644 pandas/tests/io/rdata/test_pyreadr.py create mode 100644 pandas/tests/io/rdata/test_rscript.py diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index a9c3d637a41e3..99ebe01b0e53f 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -358,6 +358,8 @@ zlib Compression for HDF5 fastparquet 0.4.0 Parquet reading / writing pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pyreadstat SPSS files (.sav) reading +pyreadr R files (.RData, .rda, .rds) reading / writing +Rscript R files (.RData, .rda, .rds) reading / writing ========================= ================== ============================================================= Access data in the cloud diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3b7a6037a9715..75a3626ef80b5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -31,6 +31,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` binary;`ORC Format `__;:ref:`read_orc`; binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` 
+    binary;`R `__;:ref:`read_rdata`;:ref:`to_rdata`
    binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata`
    binary;`SAS `__;:ref:`read_sas`;
    binary;`SPSS `__;:ref:`read_spss`;
@@ -5903,6 +5904,304 @@ respective functions from ``pandas-gbq``.
 Full documentation can be found `here `__.
 
+
+.. _io.rdata:
+
+R data format
+-------------
+
+.. _io.rdata_reader:
+
+Reading R data
+''''''''''''''
+
+.. versionadded:: 1.3.0
+
+The top-level function ``read_rdata`` reads the native serialization types
+of the R language and environment. For .RData and its shorthand synonym .rda,
+which can hold multiple R objects, the method returns a ``dict`` of
+``DataFrames``. For the .rds type, which contains only a single R object, the
+method returns a single ``DataFrame``.
+
+.. note::
+
+    Since *any* R object can be saved in these types, this method will only return
+    data.frame objects or objects coercible to data.frames, including matrices,
+    tibbles, data.tables, and even 3D arrays. Depending on the engine used,
+    non-data.frame objects either raise an error or are ignored.
+
+For example, consider the following data.frames generated in R using samples from
+US EPA, UK BGCI, and NOAA public data:
+
+.. code-block:: r
+
+    ghg_df <- data.frame(
+        gas = c("Carbon dioxide", "Methane", "Nitrous oxide",
+                "Fluorinated gases", "Total"),
+        year = c(2018, 2018, 2018, 2018, 2018),
+        emissions = c(5424.88150213288, 634.457127078267, 434.528555376666,
+                      182.782432461777, 6676.64961704959),
+        row.names = c(141:145),
+        stringsAsFactors = FALSE
+    )
+
+    saveRDS(ghg_df, file="ghg_df.rds")
+
+    plants_df <- data.frame(
+        plant_group = c("Pteridophytes", "Pteridophytes", "Pteridophytes",
+                        "Pteridophytes", "Pteridophytes"),
+        status = c("Data Deficient", "Extinct", "Not Threatened",
+                   "Possibly Threatened", "Threatened"),
+        count = c(398, 65, 1294, 408, 1275),
+        row.names = c(16:20),
+        stringsAsFactors = FALSE
+    )
+
+    saveRDS(plants_df, file="plants_df.rds")
+
+    sea_ice_df <- data.frame(
+        year = c(2016, 2017, 2018, 2019, 2020),
+        mo = c(12, 12, 12, 12, 12),
+        data.type = c("Goddard", "Goddard", "Goddard", "Goddard", "NRTSI-G"),
+        region = c("S", "S", "S", "S", "S"),
+        extent = c(8.28, 9.48, 9.19, 9.41, 10.44),
+        area = c(5.51, 6.23, 5.59, 6.59, 6.5),
+        row.names = c(1012:1016),
+        stringsAsFactors = FALSE
+    )
+
+    saveRDS(sea_ice_df, file="sea_ice_df.rds")
+
+    save(ghg_df, plants_df, sea_ice_df, file="env_data_dfs.rda")
+
+Then in pandas you can read the .rds or .rda files:
+
+.. ipython:: python
+   :suppress:
+
+   rel_path = os.path.join("..", "pandas", "tests", "io", "data", "rdata")
+   file_path = os.path.abspath(rel_path)
+
+.. ipython:: python
+
+   rds_file = os.path.join(file_path, "ghg_df.rds")
+   ghg_df = pd.read_rdata(rds_file).tail()
+   ghg_df
+
+   rda_file = os.path.join(file_path, "env_data_dfs.rda")
+   env_dfs = pd.read_rdata(rda_file)
+   env_dfs
+
+To ignore the rownames of the data.frame, use the option ``rownames=False``:
+
+.. ipython:: python
+
+   rds_file = os.path.join(file_path, "plants_df.rds")
+   plants_df = pd.read_rdata(rds_file, rownames=False).tail()
+   plants_df
+
+
+To select specific objects in an .rda file, pass a list of names into
+``select_frames``:
+
+.. ipython:: python
+
+   rda_file = os.path.join(file_path, "env_data_dfs.rda")
+   env_dfs = pd.read_rdata(rda_file, select_frames=["sea_ice_df"])
+   env_dfs
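+As a quick illustration, the returned ``dict`` can be handled like any other
+mapping; a minimal sketch, reusing ``env_dfs`` from above:
+
+.. code-block:: python
+
+    # iterate over R object names and their DataFrames
+    for name, df in env_dfs.items():
+        print(name, df.shape)
+
+To read from a URL, pass the link directly into the method: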
+.. ipython:: python
+
+   url = ("https://github.com/hadley/nycflights13/"
+          "blob/master/data/airlines.rda?raw=true")
+
+   airlines = pd.read_rdata(url, file_format="rda")
+   airlines
+
+To read from a file-like object, pass the object into the ``path_or_buffer``
+argument:
+
+.. ipython:: python
+
+   rds_file = os.path.join(file_path, "sea_ice_df.rds")
+   with open(rds_file, "rb") as f:
+       sea_ice_df = pd.read_rdata(f.read(), file_format="rds")
+
+   sea_ice_df
+
+With ``rscript`` as the ``engine``, a direct command line call to Rscript reads
+the data natively in R and transfers the content using one of several options
+of ``mode``.
+
+.. note::
+
+    If you do not have R installed and attempt to use the ``rscript`` ``engine``,
+    a ``FileNotFoundError`` will be raised. Note that Rscript must be recognized
+    as a top-level command on the machine; hence, R's bin folder must be included
+    in the Path environment variable of the OS. If Rscript is not recognized even
+    though R is installed, you will receive the same error.
+
+- For the ``csv`` mode (default), no other R package is required.
+  Data types are preserved in this text-based data exchange.
+
+- For the ``feather`` mode, the ``arrow`` package in R must be installed, along
+  with its counterpart ``pyarrow`` package in Python. This binary approach
+  allows faster data exchange than the text approach.
+
+- For the ``parquet`` mode, the ``arrow`` package in R and the ``pyarrow``
+  package in Python must likewise be installed. Similarly, this binary
+  approach allows faster data exchange than the text approach.
+
+- For the ``sqlite`` mode, the ``RSQLite`` package in R (part of the DBI family
+  of database APIs) must be installed; no additional Python package is needed.
+  This database approach helps ensure data type integrity.
+
+.. ipython:: python
+
+   rds_file = os.path.join(file_path, "plants_df.rds")
+   plants_df = pd.read_rdata(rds_file, engine="rscript", mode="sqlite").tail()
+   plants_df
+
+.. note::
+
+    The selected ``mode`` does not generate output in that format; it is only
+    used under the hood to transfer data on disk between R and Python.
+
+
+.. _io.rdata_writer:
+
+Writing R data
+''''''''''''''
+
+.. versionadded:: 1.3.0
+
+The method :func:`~pandas.core.frame.DataFrame.to_rdata` will write a DataFrame
+or multiple DataFrames into R data files (.RData, .rda, and .rds).
+
+For a single object in the rds format:
+
+.. ipython:: python
+
+   plants_df.to_rdata("plants_df.rds")
+
+For multiple objects in the RData or rda formats using the ``rscript`` engine,
+use the ``other_frames`` argument and be sure to provide ``rda_names`` for all
+DataFrames:
+
+.. ipython:: python
+
+   plants_df.to_rdata(
+       "env_dfs.rda",
+       engine="rscript",
+       other_frames=[ghg_df, sea_ice_df],
+       rda_names=["plants_df", "ghg_df", "sea_ice_df"]
+   )
+
+With either engine, the pandas index will not map to R rownames. The default
+``index=True`` outputs the index as a separate column (or multiple columns for
+a MultiIndex).
+
+.. ipython:: python
+
+   (ghg_df.rename_axis(None)
+          .to_rdata("ghg_df.rds", engine="rscript")
+   )
+   pd.read_rdata("ghg_df.rds").tail()
+
+Otherwise, use ``index=False``:
+
+.. ipython:: python
+
+   (ghg_df.rename_axis(None)
+          .to_rdata("ghg_df.rds", engine="rscript", index=False)
+   )
+   pd.read_rdata("ghg_df.rds").tail()
+
+With both engines, the default compression of R data files will be ``gzip``.
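+For instance, to compare compression settings, the same frame can be written
+several times; a minimal sketch using the ``rscript`` engine options listed
+above:
+
+.. code-block:: python
+
+    # write the same DataFrame under each supported compression setting
+    for comp in ["gzip", "bzip2", "xz", False]:
+        plants_df.to_rdata(f"plants_{comp}.rds", engine="rscript", compress=comp)
+
+Notice the different sizes of compressed and uncompressed files:
+
+.. 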
ipython:: python + + plants_df.to_rdata("plants_df_uncomp.rds", compress=False) + + os.stat("plants_df.rds").st_size + os.stat("plants_df_uncomp.rds").st_size + +The ``rscript`` engine supports all listed compression types including: +``gzip``, ``bzip2``, and ``xz``. + +Additionally, with ``rscript`` engine, data files can be written in ascii (text) +rather than default binary with ``ascii`` argument: + +.. ipython:: python + + sea_ice_df.to_rdata("sea_ice_df_ascii.rda", engine="rscript", + ascii=True, compress=False) + + with open("sea_ice_df_ascii.rda", "r") as f: + for i in range(10): + line = next(f).strip() + print(line) + +.. ipython:: python + :suppress: + + os.remove("ghg_df.rds") + os.remove("plants_df.rds") + os.remove("env_dfs.rda") + os.remove("plants_df_uncomp.rds") + os.remove("sea_ice_df_ascii.rda") + +Once exported, the single DataFrame can be read back in R or multiple DataFrames +loaded in R: + +.. code-block:: r + + plants_df <- readRDS("plants_df.rds") + tail(plants_df, 5) + plant_group status count + 16 Pteridophytes Data Deficient 398 + 17 Pteridophytes Extinct 65 + 18 Pteridophytes Not Threatened 1294 + 19 Pteridophytes Possibly Threatened 408 + 20 Pteridophytes Threatened 1275 + + + load("env_dfs.rda") + eapply(.GlobalEnv, tail, 5) + $plants_df + plant_group status count + 16 Pteridophytes Data Deficient 398 + 17 Pteridophytes Extinct 65 + 18 Pteridophytes Not Threatened 1294 + 19 Pteridophytes Possibly Threatened 408 + 20 Pteridophytes Threatened 1275 + + $sea_ice_df + year mo data.type region extent area + 1012 2016 12 Goddard S 8.28 5.51 + 1013 2017 12 Goddard S 9.48 6.23 + 1014 2018 12 Goddard S 9.19 5.59 + 1015 2019 12 Goddard S 9.41 6.59 + 1016 2020 12 NRTSI-G S 10.44 6.50 + + $ghg_df + gas year emissions + 141 Carbon dioxide 2018 5424.8815 + 142 Methane 2018 634.4571 + 143 Nitrous oxide 2018 434.5286 + 144 Fluorinated gases 2018 182.7824 + 145 Total 2018 6676.6496 + +For more information of ``pyreadr`` engine, see main page of `pyreadr`_ package for +further notes on support and limitations. For more information of R serialization +data types, see docs on `rds`_ and `rda`_ data files. + +.. _pyreadr: https://github.com/ofajardo/pyreadr + +.. _rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/readRDS + +.. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/save + + .. _io.stata: Stata format @@ -5958,6 +6257,7 @@ outside of this range, the variable is cast to ``int16``. 115 dta file format. Attempting to write *Stata* dta files with strings longer than 244 characters raises a ``ValueError``. + .. _io.stata_reader: Reading from Stata format diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 92efb225682b7..b85a773014c59 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -110,6 +110,109 @@ both XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) For more, see :ref:`io.xml` in the user guide on IO tools. +.. _whatsnew_130.read_to_rdata: + +Read and write R data files +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We added I/O support to read and write R data files (.rda, .Rdata, .rds) using +:func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Equipped with two engines, +`pyreadr`_ and command line caller, `rscript`_, these methods will maintain fast and +durable support for open source data migration between R and Python. (:issue:`40287`) + +.. _pyreadr: https://github.com/ofajardo/pyreadr +.. 
_rscript: https://www.rdocumentation.org/packages/utils/versions/3.6.2/topics/Rscript
+
+Consider the following data frame and matrix generated in R:
+
+.. code-block:: r
+
+    In [1]: carbon_ppm_df <- data.frame(
+       ...:     year = c(2020, 2020, 2020, 2021, 2021),
+       ...:     month = c(10, 11, 12, 1, 2),
+       ...:     monthly_average = c(411.51, 413.11, 414.25, 415.52, 416.75),
+       ...:     num_days = c(30, 27, 30, 29, 28),
+       ...:     st_dev_of_days = c(0.22, 0.8, 0.48, 0.44, 1.01),
+       ...:     unc_mon_mean = c(0.08, 0.29, 0.17, 0.16, 0.36)
+       ...: )
+
+    In [2]: iucn_species_mtx <- matrix(
+       ...:     c(102, 79, 159, 63, 30, 13, 267, 35, 85,
+       ...:       30, 10, 5, 1, 2, 7, 14, 2, 2,
+       ...:       409, 121, 22, 75, 40, 78, 134, 146, 28,
+       ...:       29, 6, 0, 0, 0, 12, 2, 1, 0,
+       ...:       3770, 627, 223, 365, 332, 699, 604, 663, 225,
+       ...:       6972, 989, 460, 730, 588, 1302, 518, 1060, 542,
+       ...:       7089, 1219, 798, 831, 538, 1051, 975, 719, 556,
+       ...:       2990, 4251, 52, 2819, 1220, 914, 1648, 1184, 845,
+       ...:       43885, 20685, 11158, 10865, 8492, 8192, 7326, 7212, 5940
+       ...:     ),
+       ...:     ncol=9, nrow=9,
+       ...:     dimnames = list(
+       ...:         c("MAGNOLIOPSIDA", "ACTINOPTERYGII", "AVES",
+       ...:           "INSECTA", "REPTILIA", "LILIOPSIDA",
+       ...:           "GASTROPODA", "AMPHIBIA", "MAMMALIA"),
+       ...:         c("EX", "EW", "CR(PE)", "CR(PEW)", "CR",
+       ...:           "EN", "VU", "DD", "Total")
+       ...:     )
+       ...: )
+
+    In [3]: saveRDS(carbon_ppm_df, "ppm_df_r.rds")
+    In [4]: save(carbon_ppm_df, iucn_species_mtx, file="env_objs_r.rda")
+
+These can then be read in pandas with either engine:
+
+.. code-block:: ipython
+
+    In [1]: ppm_df = pd.read_rdata("ppm_df_r.rds", engine="pyreadr")
+    In [2]: ppm_df
+    Out[2]:
+       year  month  monthly_average  num_days  st_dev_of_days  unc_mon_mean
+    0  2020     10           411.51        30            0.22          0.08
+    1  2020     11           413.11        27            0.80          0.29
+    2  2020     12           414.25        30            0.48          0.17
+    3  2021      1           415.52        29            0.44          0.16
+    4  2021      2           416.75        28            1.01          0.36
+
+    In [3]: env_objs = pd.read_rdata("env_objs_r.rda", engine="rscript")
+    In [4]: env_objs
+    Out[4]:
+    {'carbon_ppm_df':
+       year  month  monthly_average  num_days  st_dev_of_days  unc_mon_mean
+    0  2020     10           411.51        30            0.22          0.08
+    1  2020     11           413.11        27            0.80          0.29
+    2  2020     12           414.25        30            0.48          0.17
+    3  2021      1           415.52        29            0.44          0.16
+    4  2021      2           416.75        28            1.01          0.36
+
+    [5 rows x 6 columns],
+     'iucn_species_mtx':
+                     EX  EW  CR(PE)  CR(PEW)    CR    EN    VU    DD  Total
+    rownames
+    MAGNOLIOPSIDA   102  30     409       29  3770  6972  7089  2990  43885
+    ACTINOPTERYGII   79  10     121        6   627   989  1219  4251  20685
+    AVES            159   5      22        0   223   460   798    52  11158
+    INSECTA          63   1      75        0   365   730   831  2819  10865
+    REPTILIA         30   2      40        0   332   588   538  1220   8492
+    LILIOPSIDA       13   7      78       12   699  1302  1051   914   8192
+    GASTROPODA      267  14     134        2   604   518   975  1648   7326
+    AMPHIBIA         35   2     146        1   663  1060   719  1184   7212
+    MAMMALIA         85   2      28        0   225   542   556   845   5940
+
+    [9 rows x 9 columns]}
+
+These can even be exported back out to R data files:
+
+.. code-block:: ipython
+
+    In [5]: ppm_df.to_rdata("ppm_df_py.rds")
+    In [6]: ppm_df.to_rdata(
+       ...:     "env_objs_py.rda",
+       ...:     engine="rscript",
+       ...:     other_frames=[env_objs["iucn_species_mtx"]],
+       ...:     rda_names=["ppm_df", "species_mtx"]
+       ...: )
+
+For more, see :ref:`io.rdata` in the user guide on IO tools.
+
+.. 
_whatsnew_130.enhancements.other: Other enhancements diff --git a/environment.yml b/environment.yml index 1259d0dd4ae44..88df4ee035da0 100644 --- a/environment.yml +++ b/environment.yml @@ -110,6 +110,7 @@ dependencies: - xarray # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss + - pyreadr # pandas.read_rdata, DataFrame.to_rdata - tabulate>=0.8.3 # DataFrame.to_markdown - natsort # DataFrame.sort_values - pip: diff --git a/pandas/__init__.py b/pandas/__init__.py index 7cad3eded0585..5c18d6072b9a3 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -172,6 +172,7 @@ read_stata, read_sas, read_spss, + read_rdata, ) from pandas.io.json import _json_normalize as json_normalize diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 62341045413a7..46aecd6d3a087 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2270,6 +2270,231 @@ def _from_arrays( ) return cls(mgr) + @doc(storage_options=generic._shared_docs["storage_options"]) + def to_rdata( + self, + path_or_buffer: FilePathOrBuffer, + file_format: str = "infer", + engine: str = "pyreadr", + mode: str = "csv", + other_frames: Optional[List[DataFrame]] = None, + rda_names: List[str] = ["pandas_dataframe"], + index: bool = True, + ascii: bool = False, + compress: Union[bool, str] = "gzip", + encoding: str = "utf-8", + storage_options: StorageOptions = None, + ) -> None: + """ + Render one or more DataFrames to R data (.rda, .Rdata, .rds). + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : a valid str, path object or file-like object + Any valid string path is acceptable. + + file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + R serialization type generated from native commands: base::save + (that saves multiple objects) or base::saveRDS (that saves a + single object to disk). Default 'infer' will use extension in file + name to determine the format type. + + engine : {{'pyreadr', 'rscript'}}, default 'pyreadr' + Engine used to write to R data files. Currently, two types are + supported: ``pyreadr`` which requires the pyreadr package to be + installed and ``rscript`` which requires R to be installed on machine. + For ``rscript``, be sure the R bin installation folder is included in + the system Path environment variable. The ``pyreadr`` is the faster + parser to handle most needs but ``rscript`` engine provides fuller + support of rda and rds formats since it calls native R commands. + + mode : {{'csv', 'parquet', 'feather'}}, default 'csv' + Python and R I/O transfer mode that only applies to ``rscript`` + engine (ignored for ``pyreadr``). Using ``csv`` (text approach), no + additional packages are required. Using ``parquet`` or ``feather`` + (binary approach) requires pyarrow installed in Python and arrow + package installed in R. Using ``sqlite`` (database approach) requires + RSQLite package installed in R. Binary will usually be faster to process + than text data. Database usually ensures data type integrity. + + other_frames : list, optional + Other DataFrames to be included in rda (not rds) files that can + contain multiple objects. Ignored ``pyreadr`` engine that currently + supports only a single DataFrame written to rda files. + + rda_names : list, default ["pandas_dataframe"] + Names for current and other DataFrames in rda file. The number of names + should equal the number of current DataFrame and ``other_frames``. 
+            For the ``pyreadr`` engine, which can only write one DataFrame to an
+            rda file, only the first name in the list will be used.
+
+        index : bool, default True
+            Include index or MultiIndex in output as separate columns. Since
+            DataFrame indexes can include multiple columns and R rownames can only
+            include one column, neither ``pyreadr`` nor ``rscript`` engines will
+            map DataFrame index to R data.frame rownames.
+
+        ascii : bool, default False
+            Write data into ASCII (text) representation. Only supported with
+            the ``rscript`` engine.
+
+        compress : bool or {{'gzip', 'bzip2', 'xz'}}, default 'gzip'
+            Compression types for R data files. Use False for uncompressed
+            files. For the ``pyreadr`` engine, only False and 'gzip' are
+            supported.
+
+        encoding : str, optional, default 'utf-8'
+            Encoding of R data.
+
+        {storage_options}
+
+        Returns
+        -------
+        None
+            Returns ``None``; a ``ValueError`` is raised for invalid parameters.
+
+        See Also
+        --------
+        to_stata : Convert DataFrame to a Stata dataset.
+        to_parquet : Convert DataFrame to parquet format.
+        to_feather : Convert DataFrame to feather format.
+
+        Examples
+        --------
+        To save an .rds file, which only contains a single DataFrame:
+
+        >>> ghg_df = pd.DataFrame(
+        ...     {{'gas': ['Carbon dioxide', 'Methane',
+        ...              'Nitrous oxide',
+        ...              'Fluorinated gases',
+        ...              'Total'],
+        ...      'year': [2018, 2018, 2018, 2018, 2018],
+        ...      'emissions': [5424.88, 634.46, 434.53,
+        ...                    182.78, 6676.65]
+        ...      }})
+        >>> ghg_df.to_rdata("ghg_df.rds")
+
+        >>> R_code = '''
+        ... ghg_df <- readRDS("ghg_df.rds")
+        ... ghg_df
+        ...   index               gas year emissions
+        ... 1     0    Carbon dioxide 2018   5424.88
+        ... 2     1           Methane 2018    634.46
+        ... 3     2     Nitrous oxide 2018    434.53
+        ... 4     3 Fluorinated gases 2018    182.78
+        ... 5     4             Total 2018   6676.65
+        ... '''
+
+        To save an .rda or .RData file, which can contain one or more
+        DataFrames:
+
+        >>> plants_df = pd.DataFrame(
+        ...     {{'plant_group': ['Pteridophytes',
+        ...                      'Pteridophytes',
+        ...                      'Pteridophytes',
+        ...                      'Pteridophytes',
+        ...                      'Pteridophytes'],
+        ...      'status': ['Data Deficient',
+        ...                 'Extinct',
+        ...                 'Not Threatened',
+        ...                 'Possibly Threatened',
+        ...                 'Threatened'],
+        ...      'count': [398, 65, 1294, 408, 1275]
+        ...      }})
+        >>> sea_ice_df = pd.DataFrame(
+        ...     {{'year': [2016, 2017, 2018, 2019, 2020],
+        ...      'mo': [12, 12, 12, 12, 12],
+        ...      'data.type': ['Goddard',
+        ...                    'Goddard',
+        ...                    'Goddard',
+        ...                    'Goddard',
+        ...                    'NRTSI-G'],
+        ...      'region': ['S', 'S', 'S', 'S', 'S'],
+        ...      'extent': [8.28, 9.48, 9.19, 9.41, 10.44],
+        ...      'area': [5.51, 6.23, 5.59, 6.59, 6.5]
+        ...      }})
+        >>> ghg_df.to_rdata(
+        ...     "env_data_df.rda",
+        ...     engine="rscript",
+        ...     other_frames=[plants_df, sea_ice_df],
+        ...     rda_names=["ghg_df", "plants_df", "sea_ice_df"]
+        ... )  # doctest: +SKIP
+
+        >>> R_code = '''
+        ... load("env_data_df.rda")
+        ...
+        ... mget(ls())
+        ... $ghg_df
+        ...   index               gas year emissions
+        ... 1     0    Carbon dioxide 2018   5424.88
+        ... 2     1           Methane 2018    634.46
+        ... 3     2     Nitrous oxide 2018    434.53
+        ... 4     3 Fluorinated gases 2018    182.78
+        ... 5     4             Total 2018   6676.65
+        ...
+        ... $plants_df
+        ...   index   plant_group              status count
+        ... 1     0 Pteridophytes      Data Deficient   398
+        ... 2     1 Pteridophytes             Extinct    65
+        ... 3     2 Pteridophytes      Not Threatened  1294
+        ... 4     3 Pteridophytes Possibly Threatened   408
+        ... 5     4 Pteridophytes          Threatened  1275
+        ...
+        ... $sea_ice_df
+        ...   index year mo data.type region extent area
+        ... 1     0 2016 12   Goddard      S   8.28 5.51
+        ... 2     1 2017 12   Goddard      S   9.48 6.23
+        ... 3     2 2018 12   Goddard      S   9.19 5.59
+        ... 4     3 2019 12   Goddard      S   9.41 6.59
+        ... 5     4 2020 12   NRTSI-G      S  10.44 6.50
+        ... 
''' + """ + from pandas.io.rdata import ( + RSCRIPT_EXISTS, + PyReadrWriter, + RscriptWriter, + ) + + pyreadr = import_optional_dependency("pyreadr", errors="ignore") + pyarrow = import_optional_dependency("pyarrow", errors="ignore") + + RDataWriter: Union[Type[PyReadrWriter], Type[RscriptWriter]] + + if engine == "pyreadr": + if pyreadr is None: + raise ImportError("pyreadr not found, please install for this engine.") + RDataWriter = PyReadrWriter + + elif engine == "rscript": + if RSCRIPT_EXISTS is None: + raise FileNotFoundError( + "R is either not installed on this system or its " + "bin folder is not in Path environment variable." + ) + if pyarrow is None and mode in ["parquet", "feather"]: + raise ImportError("pyarrow not found, please install for this mode.") + RDataWriter = RscriptWriter + else: + raise ValueError(f"{engine} is not a supported engine.") + + rdata_writer = RDataWriter( + self, + path_or_buffer=path_or_buffer, + file_format=file_format, + engine=engine, + mode=mode, + other_frames=other_frames, + rda_names=rda_names, + index=index, + ascii=ascii, + compress=compress, + encoding=encoding, + storage_options=storage_options, + ) + + return rdata_writer.write_data() + @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( diff --git a/pandas/io/api.py b/pandas/io/api.py index 5926f2166ee9d..9cacb014e7dd0 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -29,6 +29,7 @@ HDFStore, read_hdf, ) +from pandas.io.rdata import read_rdata from pandas.io.sas import read_sas from pandas.io.spss import read_spss from pandas.io.sql import ( diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py new file mode 100644 index 0000000000000..ffd726e8cfbff --- /dev/null +++ b/pandas/io/rdata.py @@ -0,0 +1,1826 @@ +from datetime import datetime +import io +import os +import platform +import subprocess +from tempfile import TemporaryDirectory +from typing import ( + Dict, + List, + Optional, + Type, + Union, +) + +from pandas._typing import ( + Buffer, + FilePathOrBuffer, + StorageOptions, +) +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + AbstractMethodError, + ParserError, +) +from pandas.util._decorators import doc + +from pandas.core.dtypes.common import is_list_like + +from pandas.core.frame import DataFrame +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + file_exists, + get_handle, + is_fsspec_url, + is_url, + stringify_path, +) +from pandas.io.feather_format import read_feather +from pandas.io.parquet import read_parquet +from pandas.io.parsers import read_csv +from pandas.io.sql import read_sql + + +class RScriptError(Exception): + """ + Exception raises when command line call to RScript throws a non-empty + error message. Message will capture verbatim R output in console. + """ + + pass + + +def _executable_exists(name) -> bool: + """ + Internal method to check if R exists on system. + + This method will return True if R is installed for Rscript command + line call and if machine recognizes Rscript in Path env variable. 
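+
+    Examples
+    --------
+    A quick check (result depends on the local R installation):
+
+    >>> _executable_exists("Rscript")  # doctest: +SKIP
+    True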
+ """ + + WHICH_CMD = "where" if platform.system() == "Windows" else "which" + + return ( + subprocess.call( + [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + == 0 + ) + + +RSCRIPT_EXISTS = _executable_exists("Rscript") + + +@doc(storage_options=_shared_docs["storage_options"]) +def read_rdata( + path_or_buffer: FilePathOrBuffer, + file_format: str = "infer", + engine: str = "pyreadr", + mode: str = "csv", + select_frames: Optional[List[str]] = None, + rownames: bool = True, + encoding: str = "utf-8", + storage_options: StorageOptions = None, +) -> Union[DataFrame, Dict[str, DataFrame]]: + r""" + Read R data (.RData, .rda, .rds) into DataFrame or ``dict`` of DataFrames. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : str, path object, or file-like object + Any valid file path is acceptable. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. + + file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + R serialization type as output from R's base::save or base::saveRDS + commands. Default 'infer' will use extension in file name to + to determine the format type. + + engine : {{'pyreadr'. 'rscript'}}, default 'pyreadr' + Engine used to parse or read R data. Currently, two types are + supported: ``pyreadr`` which requires the pyreadr package to be + installed and ``rscript`` which requires R to be installed on machine. + For ``rscript``, be sure the R bin installation folder is included in + the system Path environment variable. The ``pyreadr`` is the faster + parser to handle most needs but ``rscript`` engine provides fuller + support of rda and rds formats since it calls native R commands. + + mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv' + Python and R I/O transfer mode that only applies to ``rscript`` + engine (ignored for ``pyreadr``). Using ``csv`` (text approach), no + additional packages are required. Using ``parquet`` or ``feather`` + (binary approach) requires pyarrow installed in Python and arrow + package installed in R. Using ``sqlite`` (database approach) requires + RSQLite package installed in R. Binary will usually be faster to process + than text data. Database usually ensures data type integrity. + + select_frames : list, default None + Selected names of DataFrames to return from R rda and RData types that + can contain multiple objects. + + rownames : bool, default True + Include original rownames in R data frames to map into a DataFrame index. + + encoding : str, optional, default 'utf-8' + Encoding of R data. Currently, ``pyreadr`` engine only supports utf-8 + encoded data. + + {storage_options} + + Returns + ------- + DataFrame or dict of DataFrames + Depends on R data type where rds formats returns a single DataFrame and + rda or RData formats return ``dict`` of DataFrames. + + See Also + -------- + read_sas : Read SAS datasets into DataFrame. + read_stata : Read Stata datasets into DataFrame. + read_spss : Read SPSS datasets into DataFrame. + + Notes + ----- + For ``pyreadr`` engine, any R data file that contains a non-data.frame object + may raise parsing errors. For ``rscript`` engine, such objects will be + ignored. Both methods will or attempt to return data.frame objects or any + object that is coercible to R's data.frame such as matrix, tibble, + and data.table. For arrays, method will attempt to convert to 2D + structure and may not reproduce original R object representation. 
+
+    If the object in an rds file, or all objects in an rda or RData file, are
+    not data frames, this method will raise an error and will not return None
+    or an empty dictionary.
+
+    For the ``pyreadr`` engine, ``select_frames`` above is synonymous with
+    ``use_objects`` in the package's ``read_r`` method. Also, the ``timezone``
+    argument defaults to the current system regional timezone in order to
+    correspond to the original date/times in R.
+
+    Examples
+    --------
+    To read an .rds file, which only contains a single object, below returns a
+    DataFrame:
+
+    >>> R_code = '''
+    ... ghg_df <- data.frame(
+    ...     gas = c('Carbon dioxide',
+    ...             'Methane',
+    ...             'Nitrous oxide',
+    ...             'Fluorinated gases',
+    ...             'Total'),
+    ...     year = c(2018,
+    ...              2018,
+    ...              2018,
+    ...              2018,
+    ...              2018),
+    ...     emissions = c(5424.88,
+    ...                   634.46,
+    ...                   434.53,
+    ...                   182.78,
+    ...                   6676.65)
+    ... )
+    ... saveRDS(ghg_df, file="ghg_df.rds")
+    ... '''
+
+    >>> ghg_df = pd.read_rdata("ghg_df.rds")  # doctest: +SKIP
+    >>> ghg_df  # doctest: +SKIP
+                            gas  year  emissions
+    rownames
+    1            Carbon dioxide  2018    5424.88
+    2                   Methane  2018     634.46
+    3             Nitrous oxide  2018     434.53
+    4         Fluorinated gases  2018     182.78
+    5                     Total  2018    6676.65
+
+    To read an .rda or .RData file, which can contain multiple objects, below
+    returns a ``dict`` of DataFrames:
+
+    >>> R_code = '''
+    ... plants_df <- data.frame(
+    ...     plant_group = c('Pteridophytes',
+    ...                     'Pteridophytes',
+    ...                     'Pteridophytes',
+    ...                     'Pteridophytes',
+    ...                     'Pteridophytes'),
+    ...     status = c('Data Deficient',
+    ...                'Extinct',
+    ...                'Not Threatened',
+    ...                'Possibly Threatened',
+    ...                'Threatened'),
+    ...     count = c(398, 65, 1294, 408, 1275)
+    ... )
+    ... sea_ice_df <- data.frame(
+    ...     year = c(2016, 2017, 2018, 2019, 2020),
+    ...     mo = c(12, 12, 12, 12, 12),
+    ...     data.type = c('Goddard',
+    ...                   'Goddard',
+    ...                   'Goddard',
+    ...                   'Goddard',
+    ...                   'NRTSI-G'),
+    ...     region = c('S', 'S', 'S', 'S', 'S'),
+    ...     extent = c(8.28, 9.48, 9.19, 9.41, 10.44),
+    ...     area = c(5.51, 6.23, 5.59, 6.59, 6.5)
+    ... )
+    ... save(ghg_df, plants_df, sea_ice_df, file="env_data_dfs.rda")
+    ... '''
+
+    >>> env_dfs = pd.read_rdata("env_data_dfs.rda")  # doctest: +SKIP
+    >>> env_dfs  # doctest: +SKIP
+    {{'ghg_df':
+                            gas  year  emissions
+    rownames
+    1            Carbon dioxide  2018    5424.88
+    2                   Methane  2018     634.46
+    3             Nitrous oxide  2018     434.53
+    4         Fluorinated gases  2018     182.78
+    5                     Total  2018    6676.65,
+    'plants_df':
+                plant_group               status  count
+    rownames
+    1         Pteridophytes       Data Deficient    398
+    2         Pteridophytes              Extinct     65
+    3         Pteridophytes       Not Threatened   1294
+    4         Pteridophytes  Possibly Threatened    408
+    5         Pteridophytes           Threatened   1275,
+    'sea_ice_df':
+              year  mo data.type region  extent  area
+    rownames
+    1         2016  12   Goddard      S    8.28  5.51
+    2         2017  12   Goddard      S    9.48  6.23
+    3         2018  12   Goddard      S    9.19  5.59
+    4         2019  12   Goddard      S    9.41  6.59
+    5         2020  12   NRTSI-G      S   10.44  6.50}}
+    """
+
+    return _parse(
+        path_or_buffer=path_or_buffer,
+        file_format=file_format,
+        engine=engine,
+        mode=mode,
+        select_frames=select_frames,
+        rownames=rownames,
+        encoding=encoding,
+        storage_options=storage_options,
+    )
+
+
+def _parse(
+    path_or_buffer,
+    file_format,
+    engine,
+    mode,
+    select_frames,
+    rownames,
+    encoding,
+    storage_options,
+    **kwargs,
+) -> Union[DataFrame, Dict[str, DataFrame]]:
+    """
+    Call internal parser classes.
+
+    This method will conditionally call internal parsers:
+    _PyReadrParser or _RscriptParser.
+
+    Raises
+    ------
+    FileNotFoundError
+        * If the Rscript executable is not installed or not found on the machine.
+ + ImportError + * If pyreadr for engine and pyarrow for mode is not installed. + + ValueError + * If engine is neither pyreadr or rscript. + """ + pyreadr = import_optional_dependency("pyreadr", errors="ignore") + pyarrow = import_optional_dependency("pyarrow", errors="ignore") + + RDataReader: Union[Type[_PyReadrParser], Type[_RscriptParser]] + + if engine == "pyreadr": + if pyreadr is None: + raise ImportError("pyreadr not found, please install for this engine.") + + RDataReader = _PyReadrParser + + elif engine == "rscript": + if RSCRIPT_EXISTS is None: + raise FileNotFoundError( + "R is either not installed on this system or its " + "bin folder is not in Path environment variable." + ) + + if pyarrow is None and mode in ["parquet", "feather"]: + raise ImportError("pyarrow not found, please install for this mode.") + + RDataReader = _RscriptParser + else: + raise ValueError(f"{engine} is not a supported engine.") + + rdr = RDataReader( + path_or_buffer, + file_format, + engine, + mode, + select_frames, + rownames, + encoding, + storage_options, + ) + + return rdr.parse_data() + + +def _get_data_from_filepath( + filepath_or_buffer, + encoding, + compression, + storage_options, +) -> Union[str, bytes, Buffer]: + """ + Extract raw R data. + + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, BytesIO) + 3. R data file in ascii or binary content + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. + """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + + if ( + not isinstance(filepath_or_buffer, str) + or is_url(filepath_or_buffer) + or is_fsspec_url(filepath_or_buffer) + or file_exists(filepath_or_buffer) + ): + with get_handle( + filepath_or_buffer, + "rb", + encoding=encoding, + compression=compression, + storage_options=storage_options, + is_text=False, + ) as handle_obj: + filepath_or_buffer = ( + handle_obj.handle.read() + if hasattr(handle_obj.handle, "read") + else handle_obj.handle + ) + else: + raise FileNotFoundError(f"{filepath_or_buffer} file cannot be found.") + + return filepath_or_buffer + + +def _preprocess_data(data) -> Union[io.StringIO, io.BytesIO]: + """ + Convert extracted raw data. + + This method will return underlying data of extracted R data formats. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is bytes that represents the R data. + """ + + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + + +class _RDataReader: + """ + Internal subclass to parse R data files into dict of DataFrames. + + Parameters + ---------- + path_or_buffer : a valid str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. + + file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + R serialization type. + + engine : {{'pyreadr', 'rscript'}}, default 'pyreadr' + Engine used to parse or read R data. + + mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv' + Python and R i/o transfer mode. + + select_frames : list, default None + Selected names of DataFrames to return from R data. + + rownames : bool, default True + Include original rownames in R data frames. + + encoding : str, optional, default 'utf-8' + Encoding of R data. 
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection,
+        e.g. host, port, username, password, etc.
+
+    See also
+    --------
+    pandas.io.rdata._PyReadrParser
+    pandas.io.rdata._RscriptParser
+
+    Notes
+    -----
+    To subclass this class effectively you must override the following methods:
+
+    * :func:`handle_rownames`
+    * :func:`parse_data`
+
+    See each method's respective documentation for details on their
+    functionality.
+    """
+
+    def __init__(
+        self,
+        path_or_buffer,
+        file_format,
+        engine,
+        mode,
+        select_frames,
+        rownames,
+        encoding,
+        storage_options,
+    ) -> None:
+        self.path_or_buffer = path_or_buffer
+        self.file_format = file_format.lower()
+        self.engine = engine
+        self.mode = mode
+        self.select_frames = select_frames
+        self.rownames = rownames
+        self.encoding = encoding
+        self.storage_options = storage_options
+
+    def verify_params(self) -> None:
+        """
+        Verify user entries of parameters.
+
+        This method will check the values and types of select parameters
+        and raise appropriate errors.
+        """
+
+        if self.file_format not in ["infer", "rda", "rdata", "rds"]:
+            raise ValueError(
+                f"'{self.file_format}' is not a valid value for file_format"
+            )
+
+        if (
+            self.file_format == "infer"
+            and isinstance(self.path_or_buffer, str)
+            and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds"))
+        ) or (self.file_format == "infer" and not isinstance(self.path_or_buffer, str)):
+            raise ValueError(
+                f"Unable to infer file format from file name: {self.path_or_buffer}. "
+                "Please use known R data type (.rda, .rdata, .rds)."
+            )
+
+        if self.file_format == "infer":
+            self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:]
+
+        if self.mode is not None and self.mode not in [
+            "csv",
+            "feather",
+            "parquet",
+            "sqlite",
+        ]:
+            raise ValueError(f"'{self.mode}' is not a supported value for mode.")
+
+        if self.select_frames is not None and not is_list_like(self.select_frames):
+            raise TypeError(
+                f"{type(self.select_frames).__name__} is "
+                "not a valid type for select_frames"
+            )
+
+    def buffer_to_disk(self, tmp_dir: str) -> str:
+        """
+        Convert path or buffer to disk file.
+
+        This method will convert path_or_buffer to a temp file
+        for pyreadr to parse and rscript to import.
+        """
+
+        r_temp = os.path.join(tmp_dir, "rdata.rda")
+
+        handle_data = _get_data_from_filepath(
+            filepath_or_buffer=self.path_or_buffer,
+            encoding=self.encoding,
+            compression=None,
+            storage_options=self.storage_options,
+        )
+
+        with _preprocess_data(handle_data) as r_data:
+            mode = "wb" if isinstance(r_data, io.BytesIO) else "w"
+            with open(r_temp, mode) as f:
+                f.write(r_data.read())
+
+        return r_temp
+
+    def handle_rownames(self) -> DataFrame:
+        """
+        Migrate R rownames to DataFrame index.
+
+        This method will conditionally adjust the index to reflect
+        the original R rownames.
+        """
+
+        raise AbstractMethodError(self)
+
+    def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]:
+        """
+        Parse R data files.
+
+        This method will run engine methods to return a single DataFrame
+        for the rds type or a dictionary of DataFrames for RData or rda types.
+        """
+
+        raise AbstractMethodError(self)
+
+
+class _PyReadrParser(_RDataReader):
+    """
+    Internal class to parse R data types using the third-party
+    package, pyreadr.
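+
+    Notes
+    -----
+    Datetime (POSIXct) columns are localized to the current system timezone,
+    mirroring how the original values display in R.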
+ """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.verify_params() + + def handle_rownames(self, df) -> DataFrame: + if not self.rownames: + df = df.reset_index(drop=True) + df.index.name = None + + if self.rownames and df.index.name != "rownames": + df.index.name = "rownames" + if df.index[0] == 0: + df.index += 1 + + return df + + def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: + from pyreadr import read_r + + tz = datetime.now().astimezone().tzinfo + with TemporaryDirectory() as tmp_dir: + r_temp = self.buffer_to_disk(tmp_dir) + rdata = read_r(r_temp, use_objects=self.select_frames, timezone=tz) + + rdata = {k: self.handle_rownames(df) for k, df in rdata.items()} + rdata = rdata[None] if self.file_format == "rds" else dict(rdata) + + return rdata + + +class _RscriptParser(_RDataReader): + """ + Internal class to parse R data types using temp script and data + files and command line call to installed Rscript executable. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.verify_params() + + def handle_rownames(self, df) -> DataFrame: + if self.rownames: + df = df.set_index("rownames") + else: + df = df.drop(["rownames"], axis=1) + + return df + + def run_rscript(self, tmp_dir, r_batch, cmds) -> str: + """ + Run R script at command line. + + This method will call subprocess.Popen to run R script that + saves temp data and meta files and returns R's console output. + """ + + with open(cmds[1], "w") as f: + f.write(r_batch) + + p = subprocess.Popen( + cmds, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=tmp_dir, + ) + output, error = p.communicate() + if len(error) != 0: + raise RScriptError(error.decode(self.encoding)) + + return output.decode(self.encoding) + + def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: + self.r_to_py_types = { + "logical": "bool", + "integer": "int64", + "numeric": "float64", + "factor": "category", + "character": "str", + "Date": "date", + "POSIXct": "date", + } + + switch_board = { + "rda": { + "csv": self.read_rdata_csv, + "feather": self.read_rdata_feather, + "parquet": self.read_rdata_parquet, + "sqlite": self.read_rdata_sqlite, + }, + "rdata": { + "csv": self.read_rdata_csv, + "feather": self.read_rdata_feather, + "parquet": self.read_rdata_parquet, + "sqlite": self.read_rdata_sqlite, + }, + "rds": { + "csv": self.read_rds_csv, + "feather": self.read_rds_feather, + "parquet": self.read_rds_parquet, + "sqlite": self.read_rds_sqlite, + }, + } + + rdata: Union[DataFrame, Dict[str, DataFrame], None] + rdata = switch_board[self.file_format][self.mode]() + + rdata = ( + {k: v for k, v in rdata.items() if k in self.select_frames} + if self.select_frames + else rdata + ) + rdata = {k: self.handle_rownames(df) for k, df in rdata.items()} + + rdata = rdata or None + rdata = ( + rdata["r_df"] + if (self.file_format == "rds" and rdata is not None) + else rdata + ) + + if rdata is None: + raise ValueError( + "No actual data frame or coercible data frames found in R data file." + ) + return rdata + + def read_rdata_csv(self) -> Dict[str, DataFrame]: + """ + Read R rda data via IO csv. + + This method will call `load` and `write.csv` in R to export all + data frames and metadata into temp csv files for pandas `read_csv`. . 
+ """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + load(args[1], temp_env <- new.env()) + + env_list <- as.list.environment(temp_env) + rm(temp_env) + + output_data_meta <- function(obj, nm) { + df <- tryCatch(data.frame(obj, + check.names=FALSE, + stringsAsFactors=FALSE + ), error=function(e) NULL) + + if (!is.null(df)) { + cat(nm, "\n", sep="") + + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + writeLines( + c(paste0(colnames(df), collapse=","), + paste0(sapply(df, + function(x) class(x)[1]), collapse=",")), + con=paste0("meta_", nm, ".txt") + ) + + write.csv(df, paste0("data_", nm, ".csv"), + row.names=FALSE, na="") + } + } + + output <- mapply(output_data_meta, env_list, names(env_list)) + """ + + with TemporaryDirectory() as tmp_dir: + r_file = os.path.join(tmp_dir, "r_batch.R") + rda_file = self.buffer_to_disk(tmp_dir) + + output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + + oline: str + dfs: Dict[str, DataFrame] = {} + for oline in filter(None, output.strip().split("\n")): + with open( + os.path.join(tmp_dir, f"meta_{oline}.txt"), + encoding=self.encoding, + ) as f: + flines = [fline.strip() for fline in f] + + r_hdrs: List[List[str]] = [h.split(",") for h in flines] + py_types = {n: self.r_to_py_types[d] for n, d in zip(*r_hdrs)} + + dt_cols = [col for col, d in py_types.items() if d == "date"] + py_types = {k: v for k, v in py_types.items() if v != "date"} + + try: + dfs[oline] = read_csv( + os.path.join(tmp_dir, f"data_{oline}.csv"), + dtype=py_types, # type: ignore[arg-type] + parse_dates=dt_cols, + encoding=self.encoding, + ) + except (ParserError, ValueError): + dfs[oline] = read_csv( + os.path.join(tmp_dir, f"data_{oline}.csv"), + encoding=self.encoding, + ) + + return dfs + + def read_rdata_feather(self) -> Dict[str, DataFrame]: + """ + Read R rda data via IO feather. + + This method will call `readRDS` and `write_feather` in R to export all + data frames into temp feather files for pandas `read_feather`. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + + args <- commandArgs(trailingOnly=TRUE) + + load(args[1], temp_env <- new.env()) + env_list <- as.list.environment(temp_env) + rm(temp_env) + + output_data_meta <- function(obj, nm) { + df <- tryCatch(data.frame(obj, + check.names=FALSE, + stringsAsFactors=FALSE + ), error=function(e) NULL) + + if (!is.null(df)) { + cat(nm, "\n", sep="") + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + arrow::write_feather(df, paste0("data_", nm, ".feather")) + } + } + + output <- mapply(output_data_meta, env_list, names(env_list)) + """ + + with TemporaryDirectory() as tmp_dir: + r_file = os.path.join(tmp_dir, "r_batch.R") + rda_file = self.buffer_to_disk(tmp_dir) + + output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + + oline: str + dfs: Dict[str, DataFrame] = { + oline: read_feather(os.path.join(tmp_dir, f"data_{oline}.feather")) + for oline in filter(None, output.strip().split("\n")) + } + + return dfs + + def read_rdata_parquet(self) -> Dict[str, DataFrame]: + """ + Read R rda data via IO parquet. + + This method will call `load` and `write_parquet` in R to export all + data frames into temp parquet files for pandas `read_parquet`. 
+ """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + + args <- commandArgs(trailingOnly=TRUE) + + load(args[1], temp_env <- new.env()) + env_list <- as.list.environment(temp_env) + rm(temp_env) + + output_data_meta <- function(obj, nm) { + df <- tryCatch(data.frame(obj, + check.names=FALSE, + stringsAsFactors=FALSE + ), error=function(e) NULL) + + if (!is.null(df)) { + cat(nm, "\n", sep="") + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + arrow::write_parquet(df, paste0("data_", nm, ".parquet")) + } + } + + output <- mapply(output_data_meta, env_list, names(env_list)) + """ + + with TemporaryDirectory() as tmp_dir: + r_file = os.path.join(tmp_dir, "r_batch.R") + rda_file = self.buffer_to_disk(tmp_dir) + + output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + + oline: str + dfs: Dict[str, DataFrame] = { + oline: read_parquet(os.path.join(tmp_dir, f"data_{oline}.parquet")) + for oline in filter(None, output.strip().split("\n")) + } + + return dfs + + def read_rdata_sqlite(self) -> Dict[str, DataFrame]: + """ + Read R rda data via IO sql. + + This method will call `load` and `dbWriteTable` in R to export all + data frames into a temp SQLite database for pandas `read_sql`. + """ + import sqlite3 + + r_batch = """ + suppressPackageStartupMessages(library(RSQLite)) + + args <- commandArgs(trailingOnly=TRUE) + + load(args[1], temp_env <- new.env()) + env_list <- as.list.environment(temp_env) + rm(temp_env) + + conn <- dbConnect(RSQLite::SQLite(), "r_data.db") + output_data_meta <- function(obj, nm) { + df <- tryCatch(data.frame(obj, + check.names=FALSE, + stringsAsFactors=FALSE + ), error=function(e) NULL) + + if (!is.null(df)) { + cat(nm, "\n", sep="") + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + dbWriteTable(conn, paste0("data_", nm), df, row.names=FALSE) + } + } + + output <- mapply(output_data_meta, env_list, names(env_list)) + dbDisconnect(conn) + """ + + with TemporaryDirectory() as tmp_dir: + r_db = os.path.join(tmp_dir, "r_data.db") + r_file = os.path.join(tmp_dir, "r_batch.R") + rda_file = self.buffer_to_disk(tmp_dir) + + output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + + oline: str + conn = sqlite3.connect(r_db) + dfs: Dict[str, DataFrame] = { + oline: read_sql(f"SELECT * FROM data_{oline}", conn) + for oline in filter(None, output.strip().split("\n")) + } + conn.close() + + return dfs + + def read_rds_csv(self) -> Dict[str, DataFrame]: + """ + Read R rds data via IO csv. + + This method will call `readRDS` and `write.csv` in R to export single + data frame and metadata into temp csv files for pandas `read_csv`. 
+ """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + + raw <- readRDS(args[1]) + df <- tryCatch(data.frame(raw, + check.names=FALSE, + stringsAsFactors=FALSE + ), error = function(e) NULL) + + if(!is.null(df)) { + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + write.csv(df, file=args[2], row.names=FALSE) + + cat(paste0(colnames(df), collapse=","),"|", + paste0(sapply(df, function(x) + class(x)[1]), collapse=","), + sep="") + } + """ + + dfs: Dict[str, DataFrame] = {} + with TemporaryDirectory() as tmp_dir: + r_data = os.path.join(tmp_dir, "r_data.csv") + r_file = os.path.join(tmp_dir, "r_batch.R") + + rds_file = self.buffer_to_disk(tmp_dir) + output = self.run_rscript( + tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data] + ) + + if os.path.isfile(r_data): + r_hdrs = [h.split(",") for h in output.split("|")] + n: str + py_types = {n: self.r_to_py_types[d] for n, d in zip(*r_hdrs)} + + dt_cols = [col for col, d in py_types.items() if d == "date"] + py_types = {k: v for k, v in py_types.items() if v != "date"} + + try: + dfs["r_df"] = read_csv( + r_data, + dtype=py_types, # type: ignore[arg-type] + parse_dates=dt_cols, + encoding=self.encoding, + ) + except (ParserError, ValueError): + dfs["r_df"] = read_csv(r_data) + + return dfs + + def read_rds_feather(self) -> Dict[str, DataFrame]: + """ + Read R rds data via IO feather. + + This method will call `readRDS` and `write_feather` in R to export single + data frame into a temp feather file for pandas `read_feather`. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + raw <- readRDS(args[1]) + df <- tryCatch(data.frame(raw, + check.names=FALSE, + stringsAsFactors=FALSE + ), error = function(e) NULL) + + if(!is.null(df)) { + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + arrow::write_feather(df, args[2]) + } + """ + + with TemporaryDirectory() as tmp_dir: + r_data = os.path.join(tmp_dir, "r_data.feather") + r_file = os.path.join(tmp_dir, "r_batch.R") + + rds_file = self.buffer_to_disk(tmp_dir) + self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) + + dfs: Dict[str, DataFrame] = ( + {"r_df": read_feather(r_data)} if os.path.isfile(r_data) else {} + ) + + return dfs + + def read_rds_parquet(self) -> Dict[str, DataFrame]: + """ + Read R rds data via IO parquet. + + This method will call `readRDS` and `write_parquet` in R to export + single data frame into a temp parquet file for pandas `read_parquet`. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + raw <- readRDS(args[1]) + df <- tryCatch(data.frame(raw, + check.names=FALSE, + stringsAsFactors=FALSE + ), error = function(e) NULL) + + if(!is.null(df)) { + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + arrow::write_parquet(df, args[2]) + } + """ + + with TemporaryDirectory() as tmp_dir: + r_data = os.path.join(tmp_dir, "r_data.parquet") + r_file = os.path.join(tmp_dir, "r_batch.R") + + rds_file = self.buffer_to_disk(tmp_dir) + self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) + + dfs: Dict[str, DataFrame] = ( + {"r_df": read_parquet(r_data, engine="pyarrow")} + if os.path.isfile(r_data) + else {} + ) + + return dfs + + def read_rds_sqlite(self) -> Dict[str, DataFrame]: + """ + Read R rds data via IO sql. 
+
+        This method will call `readRDS` and `dbWriteTable` in R to export a
+        single data frame into a temp SQLite database for pandas `read_sql`.
+        """
+        import sqlite3
+
+        r_batch = """
+            suppressPackageStartupMessages(library(RSQLite))
+            args <- commandArgs(trailingOnly=TRUE)
+
+            raw <- readRDS(args[1])
+            df <- tryCatch(data.frame(raw,
+                                      check.names=FALSE,
+                                      stringsAsFactors=FALSE
+                           ), error = function(e) NULL)
+
+            if(!is.null(df)) {
+                conn <- dbConnect(RSQLite::SQLite(), args[2])
+                df <- data.frame(rownames = row.names(df), df,
+                                 check.names=FALSE,
+                                 stringsAsFactors=FALSE)
+                dbWriteTable(conn, "rdata", df, row.names=FALSE)
+                dbDisconnect(conn)
+            }
+        """
+
+        dfs: Dict[str, DataFrame] = {}
+        with TemporaryDirectory() as tmp_dir:
+            r_data = os.path.join(tmp_dir, "r_data.db")
+            r_file = os.path.join(tmp_dir, "r_batch.R")
+
+            rds_file = self.buffer_to_disk(tmp_dir)
+            self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data])
+
+            if os.path.isfile(r_data):
+                conn = sqlite3.connect(r_data)
+                dfs["r_df"] = read_sql("SELECT * FROM rdata", conn)
+                conn.close()
+
+        return dfs
+
+
+class RDataWriter:
+    """
+    Base class to write pandas DataFrames into R data files.
+
+    Parameters
+    ----------
+    path_or_buffer : a valid str, path object or file-like object
+        Any valid string path is acceptable.
+
+    file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer'
+        R serialization type.
+
+    engine : {{'pyreadr', 'rscript'}}, default 'rscript'
+        Engine used to write R data.
+
+    mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv'
+        Python and R I/O transfer mode.
+
+    other_frames : list, optional
+        Other DataFrames to be included in rda (not rds) files
+        that can contain multiple objects.
+
+    rda_names : list, default ["pandas_dataframe"]
+        Names for all exported objects in rda file.
+
+    index : bool, default True
+        Include index or MultiIndex in output as separate columns.
+
+    ascii : bool, default False
+        Write data in ASCII representation.
+
+    compress : bool or {{'gzip', 'bzip2', 'xz'}}, default 'gzip'
+        Compression types for R data. For the pyreadr engine, only gzip
+        is supported. Use False for uncompressed files.
+
+    encoding : str, optional, default 'utf-8'
+        Encoding of R data.
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection,
+        e.g. host, port, username, password, etc.
+
+    See also
+    --------
+    pandas.io.rdata.PyReadrWriter
+    pandas.io.rdata.RscriptWriter
+
+    Notes
+    -----
+    To subclass this class effectively you must override the following methods:
+
+    * :func:`write_data`
+
+    See each method's respective documentation for details on their
+    functionality.
+    """
+
+    def __init__(
+        self,
+        frame: DataFrame,
+        path_or_buffer: FilePathOrBuffer,
+        file_format: str = "infer",
+        engine: str = "rscript",
+        mode: str = "csv",
+        other_frames: Optional[List[DataFrame]] = None,
+        rda_names: List[str] = ["pandas_dataframe"],
+        index: bool = True,
+        ascii: bool = False,
+        compress: Union[bool, str] = "gzip",
+        encoding: str = "utf-8",
+        storage_options: StorageOptions = None,
+    ) -> None:
+        self.frame = frame
+        self.path_or_buffer = path_or_buffer
+        self.file_format = file_format.lower()
+        self.engine = engine
+        self.mode = mode
+        self.other_frames = other_frames
+        self.rda_names = rda_names
+        self.index = index
+        self.ascii = ascii
+        self.compress = compress
+        self.encoding = encoding
+        self.storage_options = storage_options
+
+    def verify_params(self) -> None:
+        """
+        Verify user entries of parameters.
+
+        This method will check the values and types of select parameters
+        and raise appropriate errors.
+        """
+
+        if self.file_format not in ["infer", "rda", "rdata", "rds"]:
+            raise ValueError(
+                f"{self.file_format} is not a valid value for file_format."
+            )
+
+        if (
+            self.file_format == "infer"
+            and isinstance(self.path_or_buffer, str)
+            and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds"))
+        ):
+            raise ValueError(
+                f"Unable to infer file format from file name: {self.path_or_buffer}. "
+                "Please use a known R data type (.rda, .rdata, .rds)."
+            )
+
+        if self.file_format == "infer" and isinstance(self.path_or_buffer, str):
+            self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:]
+
+        if self.mode is not None and self.mode not in [
+            "csv",
+            "feather",
+            "parquet",
+            "sqlite",
+        ]:
+            raise ValueError(f"{self.mode} is not a supported value for mode.")
+
+        if self.other_frames is not None and not is_list_like(self.other_frames):
+            raise TypeError(
+                f"{type(self.other_frames).__name__} is not "
+                "a valid type for other_frames."
+            )
+        elif self.other_frames is not None:
+            for df in self.other_frames:
+                if not isinstance(df, DataFrame):
+                    raise TypeError(
+                        "One or more of the objects in "
+                        "other_frames is not a DataFrame."
+                    )
+
+        if self.rda_names is not None and not is_list_like(self.rda_names):
+            raise TypeError(
+                f"{type(self.rda_names).__name__} is not a valid type for rda_names."
+            )
+
+        if self.compress is not None and self.compress not in [
+            True,
+            False,
+            "gzip",
+            "bzip2",
+            "xz",
+        ]:
+            raise ValueError(f"{self.compress} is not a supported value for compress.")
+
+    def disk_to_buffer(self, r_file: str) -> None:
+        """
+        Save temp file to path or buffer.
+
+        This method will copy the written R data to path_or_buffer.
+        """
+
+        with open(r_file, "rb") as rdata:
+            with get_handle(
+                self.path_or_buffer,
+                "wb",
+                compression=None,
+                storage_options=self.storage_options,
+                is_text=False,
+            ) as handles:
+                handles.handle.write(rdata.read())  # type: ignore[arg-type]
+
+        return None
+
+    def write_data(self) -> None:
+        """
+        Write DataFrames to R data files.
+
+        This method will run engine methods to export DataFrames
+        to R data files.
+        """
+
+        raise AbstractMethodError(self)
+
+
+class PyReadrWriter(RDataWriter):
+    """
+    Main class called in `pandas.core.frame` to write DataFrame to R
+    data types using the third-party package, pyreadr.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.verify_params()
+
+    def write_data(self) -> None:
+        from pyreadr import (
+            write_rdata,
+            write_rds,
+        )
+
+        self.frame = (
+            self.frame.reset_index()
+            if self.index
+            else self.frame.reset_index(drop=True)
+        )
+
+        with TemporaryDirectory() as tmp_dir:
+            r_temp = os.path.join(tmp_dir, "rdata.rda")
+
+            if self.file_format in ["rda", "rdata"]:
+                write_rdata(
+                    path=r_temp,
+                    df=self.frame,
+                    df_name=self.rda_names[0],
+                    compress=self.compress,
+                )
+            elif self.file_format == "rds":
+                write_rds(path=r_temp, df=self.frame, compress=self.compress)
+
+            self.disk_to_buffer(r_temp)
+
+        return None
+
+
+class RscriptWriter(RDataWriter):
+    """
+    Main class called in `pandas.core.frame` to write DataFrame(s) to R
+    data types by calling Rscript at the command line.
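+
+    Notes
+    -----
+    This engine shells out to the ``Rscript`` executable, so R must be
+    installed and on the system path; non-csv modes additionally require
+    the R packages ``arrow`` or ``RSQLite``. A minimal sketch of direct
+    usage, with ``df`` standing in for any DataFrame (this class is
+    normally reached through ``DataFrame.to_rdata``)::
+
+        RscriptWriter(df, "data.rds", file_format="rds").write_data()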
+ """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.verify_params() + self.handle_objects() + + def handle_objects(self) -> None: + + self.all_frames = ( + [self.frame] + self.other_frames if self.other_frames else [self.frame] + ) + + if len(self.rda_names) != len(self.all_frames): + raise ValueError( + f"Length of {self.rda_names} does not match number " + "of current DataFrame and other_frames" + ) + + return None + + def run_rscript(self, tmp_dir, r_batch, cmds) -> None: + """ + Run R script at command line. + + This method will call subprocess.Popen to run R script + and return only non-empty error R output in console. + """ + + with open(cmds[1], "w") as f: + f.write(r_batch) + + a = subprocess.Popen( + cmds, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=tmp_dir, + ) + output, error = a.communicate() + if len(error) != 0: + raise RScriptError(error.decode(self.encoding)) + + return None + + def write_data(self) -> None: + self.py_to_r_types = { + "int32": "integer", + "int64": "integer", + "float64": "numeric", + "category": "factor", + "object": "character", + "bool": "logical", + "datetime64[ns]": "POSIXct", + } + + switch_board = { + "rda": { + "csv": self.write_rdata_csv, + "feather": self.write_rdata_feather, + "parquet": self.write_rdata_parquet, + "sqlite": self.write_rdata_sqlite, + }, + "rdata": { + "csv": self.write_rdata_csv, + "feather": self.write_rdata_feather, + "parquet": self.write_rdata_parquet, + "sqlite": self.write_rdata_sqlite, + }, + "rds": { + "csv": self.write_rds_csv, + "feather": self.write_rds_feather, + "parquet": self.write_rds_parquet, + "sqlite": self.write_rds_sqlite, + }, + } + + switch_board[self.file_format][self.mode]() + + return None + + def write_rdata_csv(self) -> None: + """ + Write R rda data via IO csv. + + This method will export one or more DataFrames into temp data + and metadata csv files and call `read.csv` and `save` in R. + """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + + py_names <- strsplit(args[1], ",")[[1]] + + for(obj in py_names) { + meta <- paste0("meta_", obj, ".txt") + r_types <- strsplit(readLines(meta, n=-1, + warn=FALSE), ",")[[1]] + + data <- paste0("data_", obj, ".csv") + df <- tryCatch( + read.csv(data, colClasses=r_types), + error = function(e) read.csv(data) + ) + assign(obj, df) + rm(df) + } + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + dfs <- names(Filter(is.data.frame, mget(ls()))) + save(list=dfs, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + for nm, df in zip(self.rda_names, self.all_frames): + + data_file = os.path.join(tmp_dir, f"data_{nm}.csv") + meta_file = os.path.join(tmp_dir, f"meta_{nm}.txt") + r_code = os.path.join(tmp_dir, "rbatch.R") + r_temp = os.path.join(tmp_dir, "rdata.rda") + + df = df.reset_index() if self.index else df + df.to_csv(data_file, index=False) + + with open(meta_file, "w") as f: + f.write( + ",".join( + self.py_to_r_types[p] + for p in df.dtypes.astype(str).tolist() + ) + ) + + cmds = [ + "Rscript", + r_code, + ",".join(self.rda_names), + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + return None + + def write_rdata_feather(self) -> None: + """ + Write R rda data via IO feather. 
+ + This method will export one or more DataFrames into temp + feather files and call `read_feather` and `save` in R. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + py_names <- strsplit(args[1], ",")[[1]] + + for(obj in py_names) { + data <- paste0("data_", obj, ".feather") + df <- arrow::read_feather(data) + assign(obj, df) + rm(df) + } + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + dfs <- names(Filter(is.data.frame, mget(ls()))) + save(list=dfs, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + for nm, df in zip(self.rda_names, self.all_frames): + + data_file = os.path.join(tmp_dir, f"data_{nm}.feather") + r_code = os.path.join(tmp_dir, "rbatch.R") + r_temp = os.path.join(tmp_dir, "rdata.rda") + + df = df.reset_index() if self.index else df.reset_index(drop=True) + df.to_feather(data_file) + + cmds = [ + "Rscript", + r_code, + ",".join(self.rda_names), + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rdata_parquet(self) -> None: + """ + Write R rda data via IO parquet. + + This method will export one or more DataFrames into temp + parquet files and call `read_parquet` and `save` in R. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + py_names <- strsplit(args[1], ",")[[1]] + + for(obj in py_names) { + data <- paste0("data_", obj, ".parquet") + df <- arrow::read_parquet(data) + assign(obj, df) + rm(df) + } + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + dfs <- names(Filter(is.data.frame, mget(ls()))) + save(list=dfs, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + for nm, df in zip(self.rda_names, self.all_frames): + + data_file = os.path.join(tmp_dir, f"data_{nm}.parquet") + r_code = os.path.join(tmp_dir, "rbatch.R") + r_temp = os.path.join(tmp_dir, "rdata.rda") + + df = df.reset_index() if self.index else df + df.to_parquet(data_file, index=False) + + cmds = [ + "Rscript", + r_code, + ",".join(self.rda_names), + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rdata_sqlite(self) -> None: + """ + Write R rda data via IO sql. + + This method will export one or more DataFrames into a temp + SQLite database and call `dbReadTable` and `save` in R. 
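+
+        Requires the R package ``RSQLite``. On the Python side each frame
+        is staged roughly as below, where ``r_db`` is the temp database
+        path and ``nm`` the exported object name used in this method::
+
+            conn = sqlite3.connect(r_db)
+            df.to_sql(f"data_{nm}", conn, index=False)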
+ """ + import sqlite3 + + r_batch = """ + suppressPackageStartupMessages(library(RSQLite)) + args <- commandArgs(trailingOnly=TRUE) + + conn <- dbConnect(RSQLite::SQLite(), args[1]) + py_names <- strsplit(args[2], ",")[[1]] + + for(obj in py_names) { + data <- paste0("data_", obj) + df <- dbReadTable(conn, data) + assign(obj, df) + rm(df) + } + dbDisconnect(conn) + + r_ascii <- as.logical(args[4]) + r_compress <- ifelse(args[5] %in% c("True", "False"), + as.logical(args[5]), + args[5]) + + dfs <- names(Filter(is.data.frame, mget(ls()))) + save(list=dfs, file=args[3], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + r_db = os.path.join(tmp_dir, "rdata.db") + conn = sqlite3.connect(r_db) + + for nm, df in zip(self.rda_names, self.all_frames): + r_code = os.path.join(tmp_dir, "rbatch.R") + r_temp = os.path.join(tmp_dir, "rdata.rda") + + df = df.reset_index() if self.index else df + df.to_sql(f"data_{nm}", conn, index=False) + + conn.close() + cmds = [ + "Rscript", + r_code, + r_db, + ",".join(self.rda_names), + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rds_csv(self) -> None: + """ + Write R rds data via IO csv. + + This method will export a single DataFrame into temp csv + data and call `read.csv` and `saveRDS` in R. + """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + py_data <- args[1] + r_types <- strsplit(args[2], ",")[[1]] + + df <- tryCatch( + read.csv(py_data, colClasses=r_types), + error = function(e) read.csv(py_data) + ) + + r_ascii <- as.logical(args[4]) + r_compress <- ifelse(args[5] %in% c("True", "False"), + as.logical(args[5]), + args[5]) + + saveRDS(df, file=args[3], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + r_code = os.path.join(tmp_dir, "rbatch.R") + py_data = os.path.join(tmp_dir, "pydata.csv") + r_temp = os.path.join(tmp_dir, "rdata.rds") + + py_df = self.frame.reset_index() if self.index else self.frame + r_types = ",".join(py_df.dtypes.astype(str).replace(self.py_to_r_types)) + + py_df.to_csv(py_data, index=False) + + cmds = [ + "Rscript", + r_code, + py_data, + r_types, + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + return None + + def write_rds_feather(self) -> None: + """ + Write R rds data via IO feather. + + This method will export a single DataFrame into a temp + feather file to call `read_feather` and `saveRDS` in R. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + df <- arrow::read_feather(args[1]) + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + saveRDS(df, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + r_code = os.path.join(tmp_dir, "rbatch.R") + py_data = os.path.join(tmp_dir, "pydata.feather") + r_temp = os.path.join(tmp_dir, "rdata.rds") + + py_df = ( + self.frame.reset_index() + if self.index + else self.frame.reset_index(drop=True) + ) + + py_df.to_feather(py_data) + + cmds = [ + "Rscript", + r_code, + py_data, + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rds_parquet(self) -> None: + """ + Write R rds data via IO parquet. 
+
+        This method will export a single DataFrame into a temp
+        parquet file for `read_parquet` and `saveRDS` in R.
+        """
+
+        r_batch = """
+        suppressPackageStartupMessages(library(arrow))
+        args <- commandArgs(trailingOnly=TRUE)
+
+        df <- arrow::read_parquet(args[1])
+
+        r_ascii <- as.logical(args[3])
+        r_compress <- ifelse(args[4] %in% c("True", "False"),
+                             as.logical(args[4]),
+                             args[4])
+
+        saveRDS(df, file=args[2],
+                ascii=r_ascii, compress=r_compress)
+        """
+
+        with TemporaryDirectory() as tmp_dir:
+            r_code = os.path.join(tmp_dir, "rbatch.R")
+            py_data = os.path.join(tmp_dir, "pydata.parquet")
+            r_temp = os.path.join(tmp_dir, "rdata.rds")
+
+            py_df = self.frame.reset_index() if self.index else self.frame
+
+            py_df.to_parquet(py_data, index=False)
+
+            cmds = [
+                "Rscript",
+                r_code,
+                py_data,
+                r_temp,
+                str(self.ascii),
+                str(self.compress),
+            ]
+            self.run_rscript(tmp_dir, r_batch, cmds)
+
+            self.disk_to_buffer(r_temp)
+
+    def write_rds_sqlite(self) -> None:
+        """
+        Write R rds data via IO sql.
+
+        This method will export a single DataFrame into a temp
+        SQLite database for `dbReadTable` and `saveRDS` in R.
+        """
+        import sqlite3
+
+        r_batch = """
+        suppressPackageStartupMessages(library(RSQLite))
+        args <- commandArgs(trailingOnly=TRUE)
+
+        conn <- dbConnect(RSQLite::SQLite(), args[1])
+        df <- dbReadTable(conn, "pydata")
+
+        r_ascii <- as.logical(args[3])
+        r_compress <- ifelse(args[4] %in% c("True", "False"),
+                             as.logical(args[4]),
+                             args[4])
+
+        saveRDS(df, file=args[2],
+                ascii=r_ascii, compress=r_compress)
+        dbDisconnect(conn)
+        """
+
+        with TemporaryDirectory() as tmp_dir:
+            r_code = os.path.join(tmp_dir, "rbatch.R")
+            py_data = os.path.join(tmp_dir, "pydata.db")
+            r_temp = os.path.join(tmp_dir, "rdata.rds")
+
+            py_df = self.frame.reset_index() if self.index else self.frame
+
+            conn = sqlite3.connect(py_data)
+            py_df.to_sql("pydata", conn, index=False)
+            conn.close()
+
+            cmds = [
+                "Rscript",
+                r_code,
+                py_data,
+                r_temp,
+                str(self.ascii),
+                str(self.compress),
+            ]
+            self.run_rscript(tmp_dir, r_batch, cmds)
+
+            self.disk_to_buffer(r_temp)
diff --git a/pandas/tests/io/data/rdata/climate_non_utf8_df.rda b/pandas/tests/io/data/rdata/climate_non_utf8_df.rda
new file mode 100644
index 0000000000000000000000000000000000000000..a506806405f5e57cb8bb8a0e57788a8f1384f856
GIT binary patch
literal 423
zcmV;Y0a*SYiwFP!000001C3HmOB_KEZTCa73ql}Q!M>PVgy2OG50XXkAS{c5w@`N1
zBnADksp(PJlYh;h;%^Xg$icOGrgtNoOAfU)UG?hK+p3yPZjZ*dW2IDI4b)Kehw`b0
zH`nK1j+Gkpcha!b}D8mh~s0
zizRjz(laNLcF`EJ7!tTbvUk-WCoI4k-92dVuy_e%pkNNZfxn}c%*P-pjX^(EEQt4n
z-Kh4b$Q~+=b<$adWk|7ZYcWBH1*YHil{7G;p$c1|iTqdJKfODpm%-9X-`D>e#05tT
z4%)&ZLJH>&7T`(#LTsme7wcaQiep@h_&h_~n8NJ}qX(wY)jV-ti{3bzVE~VH*?X^0
zZHGUy*3}gD=1uNvq&t@_cb7)BkDGMHLA|0|AEt!_hZRew4fu%T!{JInzSQ2M{4AHt
R-{M}gy#a()t7QNK0084$%B=tZ

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/rdata/climate_non_utf8_df.rds b/pandas/tests/io/data/rdata/climate_non_utf8_df.rds
new file mode 100644
index 0000000000000000000000000000000000000000..85a65550ad80f68eb9272937fddec42ae81e182c
GIT binary patch
literal 400
zcmV;B0dM{viwFP!000001C3HWO9Md=J@cgo1i?1oHrgrJ31Sfx#X<<8U_05H=)iu&
zxxEOf{A>9s{szIq!Z>@`iy<}^?{;VQ&6{^S``FF0teLg4cGhgmHfulJUmcxFZcl=S
zU`G&18!2uIT9GK|2sQ;2zL2EdSm%263mnpl?xG<~?3nUrX>R~DIa@(zq7
zjp_&U?gD9_B+f6fAQ$k7mxWOZ#+B+bq46o#cj27_NjtAK<>&&KB+|Prkpm`Tfo1{<
zEKENfX-J%d3$R0~#H@F6q|m9^h=s8#*s93;17r&$_C-+9Ma~d>QPpAq=Mzn>^p#^^
zMolBEh}!e7zdyYfPH$XJwZD`9UBoqebOx%zWI_tZ2Bu(1e!*9_JoLp&iLx**O&s1~
zUh1T6lA;ABx2TzM9E+0aH>&5bP;Wc1Sfsf(l2pfi$yn=K=#9(RS4($JQ`Uwo1S+gw
um(vD{TCPrZa|m!eV(GL--)8@MTsw#_wecpO`F#E<%l8kzoFhox0ssJiCc9|> literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/env_data_dfs.rda b/pandas/tests/io/data/rdata/env_data_dfs.rda new file mode 100644 index 0000000000000000000000000000000000000000..07fbef3ecb00decb344bf9f64777d67fdd609630 GIT binary patch literal 7259 zcmaJ@dpHw}+eV2}D5oS_rAU&zlCxDRZ+Wxil;e^j#B!XCZ73;XBqU}@QclBiJ`Cl2 z8gf2t&SMT6Gn;Mpy}$4K=lkQkzURKK=eh3dzMp@d>$#rm0mg|7{deqJ!@`4Xhw9s) zi^3x5W{VjaMoZHflW`T^EzixpHEezy^42`?UdZCkvtzE2`f~TmO@H>?Ee{Si{n2G; z;&lDCoZ7|mI&Wof zLs{F?dnWS+rY(7)!&1=Id54S$u+-oC&K3rMLp*AdlTgly4U3l^Rz33bgdv`cwbTCr z`|l#uiI#?SIZ#(9rZUiKqwE&e#keP+r$9(Ebdfb6sL>enhfcU4w%Jbsdn$w>_ zZPL)P3#p$x)F)-owj&voInxZAsinKHFNG^KNRXp9{-pw9GZ?&exUD*?&e#2COxrV% zVva8`hadEg)qZ@<)BUHi`%l4%9Deva7WMdAsrI;5>oW|dKkImoiNWnK?{};&?eRhP zpOEw^W3LR>C2Vf(bkx8 zHhpZO=h2gqthVW@j&Ar`J}*+uPgiMQ7SBSD-!#ej1HB3?TRR&#^HvKsyvIg&$Hw(Y z6?FA!t4#+y-BF)FKf20WFKN=ti|!sb8*XUW{fYY|;kmdW_U)0c<6zyIHI6voZI%N$zv#f-XVebtK4Afws{jU3xr%FrC^!q-3EJwe#h(OX~M+aqRVZ!J2z z*6w=gu%pM&t^}XdbUkiT=j7C~|Jz*rQwU3efNGl1gS_as)Cnr~iZ3kIeo63{Ea5YL z-81JhB^qR0+$y?@Cs?&3WIY1b!LlB82I)a`w)&fB-QRQ-T*qO8ZI3m9 zbInS}l=334uaKeX;^38vB?^Sg$?4B)DzX~Naya`5{%b`}dY9n+g|+85GV?lPyE#um z-DtNNUyl-K8%s~JjvVB>Y~;4^li&Rf@8TRzPX=9~UG;nRaJySJaJn|diCn#9?o|sq z%eHHU{Ila#*`r_0)Fmuetp18Y1-)`u+HlyqC9n*fcIq=Jr7+aGDN}o2{`@xjXJ_0& zYU;B4w=o;j%Gu>)yp9jlWOmhF!PTRe+d4sT?Y%oe-d@P#qZa)nTKp4vB!9aa~KURy2 zaah(&npj=Ao}7ay)~W5^n(VAt8r@&em`T=;lYXs5Cy}Mre%Q zcz2+Q3VQj-2qBC~Irp#eFjEjL-`Ql{8*wm(b@Ab!3F6lRsJ{S2v6^v1c7>%~bT%bz z-vOmxMsjAetk|F?^IL6_LED-uJpEoUYo<+3d-=`J=6ds>V69KY?**RMW=4~`4#;nJ z+rtLBr-e5d65N`BYF_vIokG};ImoIJayI)%#cBL;f$V@N9l z%m0n)%Kqw(++f_WAavZUO{VA)_J0tjgq`z}QU?7T-w6k|>U*cpeSaFCoN8j}*sJeV zJ@<9tUjnjpY|{6Np8I<5xVi7FzBi5qSYO{+M{WG?Bw*VaZ)Wvz-YnN+vC2p+E`|skRaIXXDp-hYSW~bLj={e#YbzV!1Xjo>_l);@7lVA%gD-Pu}`oc z3_`83TMBV02(0Da7%4Zb{EuZytTrt7XiW(ugN%X02Jcq*L06zj{6{Pgm)!$A``R-{svzhF0zWqgUz8ZK(eH z2(WnxcZ^xP_kX>1P;Hp0no*L;!?EwO*SJK!oDAA)cWpt#^{-OU6uqn23!=j~=L`g( zcyJk578FT|3LK0uQQhON&wykEEO8+jnaSmMf-yQ-QU>zrH2~H2TV}a!5UQt5Zy6vB zoWBU@!TcbvHDX+@7#EU$e^$9V)-)23X@-_XI2|W9SIM(|6k8olpMCi?H8*`(ljPn8 zoFk3>6+GXU5Er7q`6eMzNn5k$*M5BITn^Oe#W?IvvU=olHeS37Vv zbUJo&T-A{7*fhAJ$RY55S6hNkvq?kamoJvwPL9Cs4?0J3FIaTx(r3QcpNdSbn#wj3 zHXbuuT|X-NarB5Jk&$`yCU99P)SpT~;6vDi-J3-1)y9>;Py2|X27lxI$3*!RW4*kJ z8|H$d7=r*CVo@h&sxuy5MDIVuAFvQXIL?6i`w`Eh@pkOL4;zH5eI8jk!dIne2osA) zDOJhjNF_A6l&Bw25du21w}}}B&gH6g=tNE)8%3V1Da<-c9+Ip(9wOi7T5c*s)na76 z)QWU%u@`|)EKzk&8zoPj!i?Tan4BBMZfnq!uW4-pwdp>{w#WkVTBu1HNE$>MiZXUD zG^*`Q3VY|187$id{w*0{y88DE&6AuXRHz%j9egt>V;e{siJEkZP^(S2;Tbo4aQDsh z1|<#R9Z5dVk``^?Im*Is^}{4KF;o~86YIy=$G76>&3i7n;g8bI6Wwq2l`0tlxbfG_ z_lH|iZYRO^!=uv`nx|FTO%9{oDA7so`?-=nrkj$4OU>=%^p#4BM_7QKu?jbMJ5a12K>ES!l#A5@Yyc^O+QjcN6Y^fVi>Co#uYayg4$^IHW z$wIn%e~A(Zafl)&LV7xL4Fg0k##{n%zXKcgjk@b>`n-mKHW`G53{f#Y=QBaEo;&kP zO`lqH5Xnw@DOH1P|JAk;HTMz#4BAah2Gc)x#Z4+DnahMwTc+z8y{_SotOmMLcOHwm zF6t%JAezWRlZfG%wC%C;yyiG0ymaYy>ySJC1UaJNM8CZX-;rr8g^j1y1FCf|Hyq#~ zZYd!A8d0=HYq3nkyz?(ael*c>cZgEtHf1MNc?fAK43!J9>Zp1!Zzx)4{h4sAO70y# zBJ1vI!%2fwod!wX09cv5a!zWuoLzf@RgIt~yRJ0Zt06wxAHi-r7xc%_->e@f)@{!_ zP;DvfryD-f2$q}M4ohJ_ahBtyJ`&4qEQ)COYcZvHwL#_}kKONs#*imIOP7737^{Ny-Z*1VU_tcD32B}~e ztAGquB){Q52)wD^-z&RmIVzNwInjrj`(%gR&dps&1dF3(9=S@_ z>|^Ih2Mm}>M`V3w$*k?$lxya^CzZ3Yhg7Y_2OuY}_Aw(Id?j6Ayqr}dE9%>3@J;`NtZzGY;+t=-#T3VCor#i~I!b-uEF%Ja$T&=w z8+lc-PpPd$5Z&){93EW?R`NU#5Ox(06K?{GZoY@8M?z?@2>aK_v_`aOfEJ8h5uo{W zste+UKL4xlAWD&S-@u!A{0TUXfqu@|mSk^m2&^4?H=FPYl}icGjPAQTfB&s% zLHTqNk;gbYZ_~}YLprVBY#&hHEU-?yu1iKSAx12XHDE2^D`t;K31_I&^I!Hw(?3@8HK1mt* 
zy`DbBO`H8B#9H)3Z)r3LgT;}rev1WhnSeaDG+bTs%`y_gHQ+w8;g+zF3!u+xU+P|N z9_fTW0*zPZ)a~2~g_Ogk7VJOdHA#MH*Xaz+5VJce?BcfIBTM|O*JM5U1$8*Q)1pNd z{TU}euWUX25B&Oq1Ig(j=Q#DC2JYqLM2Xo#q;HlD=iUO&L7}MI;=I-&7CREtBB0srj%N%9%cXTL`WF|l>i{Z7XOANRW+6n;yCz(op5$~4q3ODM;VtCuEZCZBG;((LZmJNFYw^@D;O?0`+09Bi ziI|mF95R2dVJ&b~cRN4uo?mnyS$`|J5Mm+5I%V*UE}_oE-Z1@vttmtu#Qyp`m+-QBytt+ZNnq^3x4I?IaRyO)5r!QR}?`*-k%P)=G8 zkcYyD!R})DJ{=&RhHb(oY;&i*Y1fKl)Nf%t6q5L>k_S_wb?o{N6DW@Q)Pf^nwn*#X zj92s=@KEJh;DUO1Ea-z+j3jRMz-}x!DpuiS!&=z0#9(qEZ$E9MuVJ}OaDJmO^lal^ zvA#J2EW{D}AZAsU%eI_rq?rSI3Y8X&Y+&N?v6WHm>{B7Cre>e=^ey0ju_Q8LUTQ6L zb!?3qHq43)8Qfs{nusE9kd$WO;$bS#M+*ftRyR5FEmoa@=Ox`y`#GaLQJ(sG>?yPp zK62x7i2QK)ZoZT{`5W&wwhxsX{-y`_88!rMZni+F|K^=S>3^+xziB|v+f7K>$0|U4 zD1x1ac8TrAo=I@N1@5u zWN;HRc~+}sEVeJ#enD;xF1dOkc50NDLs{Y9WDR!?;UmeOvp4x=4GWnyEsJz!MWhk9E5ZT7rpVh$6^kEWC*EQo!m zOd%g~hr01_MPe97vfZWk#8q&B40bfG++uN}rewHarSc1|>hznseTcqWeb-k{e@6V+ z+@eU7n2L-ODPZoyYaEA}$cE26EbCUQXtvgO{wFDOowMPAG`I?aWoCnF^s%Ymm`+%#!Jwy6}A$9uo>B;QB(x~ELg)$5q( zUp2NSJzX`MXT+S+Y+U2{-{g0_8OixecG)o}Q23eZm7IKyk!zksD{Y-pQRds_>LhC| zLz0hf$Z2+`$~Z53(}z8Af^MS@>d;@2^f(m$quB8+(9V?LyzSL$>$Z{7N$}31yady` z((`;2nJJIZ3-nyXZFN+sEHcH2{(SOl&7EAzY1oi|3gn`Ua;HL|ZM{|aa`g%zfAFDU zlDRs<8hK+o?ils{)OEPv@w|?rq3VmCUw*NX)=|7zcKv-M_2auN2^-*jmjlN6GU|OU zXE<(ZodpU7>steDwA*q#*VduMq=y`1+Nga7rF=G zM~AL|UAi!D(@`G9IM;|z#6DAw&u?STV-|34_3GHcxL&go!S@DAG+A{~mu8ZPY+UT` zFJK7|?fcX;@eVg)I{KB5srqryz#AxI3fvs2PEd^>L5T=h&NAW8AX|@duCocMs}6HE zc!hB67Rhu*l(jI(Z^|f*nO(*mM$s>03z&Pcko&x2 zHkqw^MS%ga$s*J{jLi5A*^$qBSe*D9t@<4q0jE@>1ui^viwV%$lZC3g+BhG`@)&}W z2+EkcU`uzAfMm3-%fsL&93>8P$X1Qi26d|LBz|u&JX+t0@tjkz#awJ&x83M@TDLNk zFR>(S&ou)Ce8AbeR9y@Dk`~TR#9{v?N~5;Y?f2u77nXuWp{ASN2mB<7OG0`;qwW?< z6UnJHpkYu2nnEAj%U0Q7O(pOvHUg+#0%?5!b#A;tY1f$27h#FXQIpv4ozI9g)j7F4 z4l@XNE{NER>8r{+$jwTrT~njAkK3w- z(3(u2w#hN0!L*x ze&PEHV>mx5bh5?VmDG60q0Fp2qzhAoXvx*%zA}Lvo;IBS zuHTb|m&MizR-AJcc&_!HwEB~Vw=lD{hVyoD#wSPxy*gK4jx6qEJ1vQ>?cHs-wghYz*B}EAV0!S7KR9{J7Il|nIR9lst=SMC!N0I^@^!Izzyp9Ie7c1C z0FbhDl~};&+)(cmM;@J*wcWGlfSatQKk2Lg`)^f7yJqyAOd$^MXt^yY_P^ve+@4F> zum)IdMgkVNA7Xxz=N_qPQ)9L9qT$-~-=EFM{FV=G=DoYqB-d<}Z{gbhj}9Z4*o@7g z-G;xHZoC4M?~HG}3oYgaKmTx&S^qKjWvCU|c9~mue2%GX8|9xr62Ay0%d|YqV{tDG zdmV34*q@>t@nZTzz7B+T zjm%l*1xV~U?)Jl=EUu0V{F2eoj%UUCbzaVfb`kQR7OEqwOCCk-H}-+H52BU|$hTrv z@6eMnR&{y}Q}=Az26ii#@IQ-7REG3|{Y>}QyRko@CHvEGspNjHqjZf8&L+#16OR2Q z`Ne)no$2xs3A}|ly{1&SZ|JQVyZRJr0tIe06nC@FhpZq+hb{J-6AQ9Vp`~y~xBGMc zJ>xkOJMQ$co#*UQvc-8^q4CjOJHYs2bk&7Rv%EZ{)X>iT-2`*;Aa!zCmRW3UuT1-Q zW1lSGgPvy9i&Q!?J}z$6$+Hcijz#(h*}K!sXyEi{YA|HfY^9x`B^ zaDhiMpLimS)5DBa1o+o#X;RySG*~dxA6H$KRvXou5Sj41ar2yuar+w0vE+NDd42KI zeJ&>ZMEd(jhGruA00$sf4tqajg|SvUVOgFBeGdd4GM5h83BI^~cPB)uQXuLZt$O_O Iu#nJy0pnhx_y7O^ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/env_data_non_dfs.rda b/pandas/tests/io/data/rdata/env_data_non_dfs.rda new file mode 100644 index 0000000000000000000000000000000000000000..e1b6bbb5e117ea72764d8dcc41593b9f24806b3b GIT binary patch literal 8948 zcmVnitj$_}qEGbJQ>86s*l&G}Wqa+<1OL1_{ zp_Chv7Sf(JlvL76X+bqpB<+f_l{HJ2>}vuiu}`|Y#t3Jik8ImY(iq{~WSOvPr|9)>prtkLW-5hP!Ru8URI|)>{v|kF)&+;1 zkqP_{;8gxm#>_jwm*K@RD$VWlT+^6;E1;SaB%e|S{{i$r2NdhdEySKTCXNlz#Y0^t z&(>;=4ba(1&L#avOeT%MhGUZF7{2enjrT1FnG1UF{3Sp$`jiu^@X=kF=LQ&892g<*}YtHu4Gd}*FcWh9CTJ8ILl8%lMN)3|}lN~||zX3)M>KU(- zy%zdkp~iJk6~+YR@Nx5BZ0Ub+r2OCRnf~-^jdmyKF}6!3caaEB9{t zr?S7~FaLZK|5JbY-`>5K1%f1fMbUp~{2ORoFd*=;J#_onXf$EmJ**qSJ;ryQy+sv^ zyW|&rqmglax6ypM4ffY3E|=}d`b&QD&v)-J`pN&*J}%Rqtfm{R_Ya?c1D`94^%ZTm z9~kQoM&&j9@-N27r`mL`1*=5kzVVr!HGcJGXoqh-HD873(Jw$H-(rNuIS#)fjj14&U020hit1n9QiU_8_M0G)`~S 
z7ko28>M!}sKWB1`KJ&lMrQzkC%H!sdi6NHA1_4&?`G)jb@BQkPpC^0N2`bG8+X8ilf`(e_4Hc`Ky9MLah z_z#tgHY=-&t(% zN2`r&jl@nUPS~ScSmO(oOT*p>w6Nhgm8>57Om(PgmZoO&{5_NxRiNBBvll8B)muIi zl!l7*EiR{Di9u;<(coQ~OsKL>|K!C39Qn8@CC0XIpu9f0(tcee9IsBb%QjsKg}j+N z>)g|!Tv5puKXL(-(_AUJ&_5L_%a2oN_(dH`rJCP|4QWuyD9JV0tQN}D-d=tpqY{e0 zw{%@qcor(3&()TW4~5DPC3hm|0;M%J9&`9K11f2%3t1F!K?#vV1t-?%L8au9$|+Vhy45*S8#o;_N;8!5I*e+Eb=j=cPdL zB`(jL&s;)}>W}GFAN>gVp_Ttu!3an;z47Vzyq!=~B;?t+4Ghq`&9G^w;0MsbbENyw z^cv79X`{8%J0HAC|8T6?Bp&jo^zDh~mVknTLb?WB_Mp#e#+_(KHl!3>kUmkv1sZOu z*hg8tf)uF;7lWqtkWZf5U+}Rq6hTv|Gn&?ezT?IBT;~~s&X*b3zvvFAr>c=ilL_eR zTar>%_(90Ex$W#sFA+#|yXmTRJQF?RZoSg-NE&ia9a^p@bQts8Nyr}`;|yMGo$-;! z$Q-?Uy+G1Cu>&-2y}hPsjwPye^yBF{zYKNCmCGazOhL8MiynDIokc^E1*ypT4(JNF zJYS#Z3TUU^AhZ<&VUZ0#>1=umKxYlmCjrkgD=g2OK00}c?)F;n6iC&ZYu!%t{ z_}F=J)9I%RQM=X0Mi<5XsP}}^%l({Lr*=-MMU7GCTDe8*aYp*-3m$FCia;(|2g{&~I zNc0o!3JT`s7VC2ujK;jH*wG!UWidrJVXkRgGWv`yJaKVg4fN)wp5lUlyTgPiJcUy zQtyqJCnhRTpQj27o2EHvO%6bv1}~gxoiV6-ckzu~MjJ7&u$I*Y=x)V?mbGSW+?J3~x9$ma-{u?o>R8W#nB=N@_B(oN0F7rHKYuph0=d3E^9qPsda6 zQ~3rgwXeJJZ1y_LpL9Jr^*+LK_N_W|lQcLytO?E+;=+`7#xd>f4wzrg!S=2BJ1o(d z_EIL*2}=%b1)kfUVX1NZl-8UCSmtWXv}wkAnC$vGG~{#+=C_k43`rfuQhL$LmyEZ; zJm%E3s=GGgah;UylQMg;_>v`Jt2UZoF^8@IvxFmERA}l$BcO{nX(^TkuoQ?%)Yt9Ao-NC|}ulbn-Z^Ckt!449+C$WGj86F`bm01@d}Z#5XAD zfY#DABK#faA@4eCeHSmxZVDFJNE#UG3Ln`wRJXd`UUQ%~erVcqnffV}- zPEB=Cxc9Ym%~AKN?ezZQ$BiqxM`0nog4G--!+k|?lm}o zS-D{<5DP8{HMN*~4Y9L}jUz7#fnpbnD}CpL!9yR`P5WdiFwiH>m7s78^0p3`qU|RMkl*&khtBJvkbiD;VgTt1=ylbZ zX_eQA+Tz=WVhvxTL5!%SL<1LB&b<$k1XymBz=Z zM^=_VuKNlJCFjKfyH+Waeu1NZw|au_uUe4n(Y&ed29Y40*X2^Jj5OwHT+$1x<$@^p6y>|DD7Z)6>AG?03edy-YkkW5YRLCC;@0^tUMT+DNXr&cAb&wv zmzDW^&|PW4DhZp8N@brt7u&{#I*k1zP6ex?E|r#6*W~A*)@(vl0Gm(olDc z!`9McF(5m!Xw3;yH|S9T_u`t|pisUc>Fo46&|?>I-q1Ks-bn{^ ztj5nZj@p1W3HQ<{(-o-uD(WQD@-ctqR(4Kt6edL+5%;~e15+6F4eu(>pgw*Bp4^*F zpzFQcxg|PF1VA5&+ z9R(;)V5#Tr3)?`K#j#|(H5l_0&Y6-Rn}*7KCLLlv7y|dm_e55CzXM(Fs`=Y&b0BGB z#hkO-k3#~pF~+3o3}}}%WtWug!=&rY`5_b6V4mmfGiw7Dpn>%d8z(09ppIih+INkv zgD#zeis?6#FpqK-ulcw{OkliCaxQ%a>il!*R(EnB_d7n>xWlfH&p<1yrD-lC4D`w5 zFET+53iX$pRogK+B#USJ#o1WULg(P-S`kcEaC4g%`37}}ZGJDKy$CcZr<>&_nPIM! zU)p?zR$(q$((#@QLo|fuo?2i`0fl>WT5Rd(LBsfzG>a>pU_emm^{b&7sOK=%y!x&< z7V@;*x#^l8mb^sMcw)ynU(HfnRMnS)xmLZBy!mk-%Jy3kxmEu$78O-#pB1c%DUYN* zjuuOx-ohD|m5w?g_B3+FN>&A^mF{i3YV#4jjFWZ<-fV&SLL?rhb2t-`amyYpXVLb7D8=VgMI6cS7Hfm_l1kjbJqEV<~OF3jWHqmLPA}? 
z6{^oHA?})?4vJYSA+tLqQLA3~`|%zVQQI%Co2S+J$kJDd@t!T zD6!C3FoiONijDm&ONFCQS7X0u%6cjmkE1m=zV*gZS($0^q_I_*Fm{OHrj?26{z>r0=p#Z5-ea_dw`f; zf+Yfur(W8dfaO*?U1K_7ELpR}=*4UUOtM_{AmNxbdS|FRZkd!27B!ZVQj)I0LLm3l zD$P#REg#5}=9q>aoYyaZJ#P@S(Q;&7cr}AJTU~@J6ZRo?b5rJxvjo&vF^kkQqZJF+ zew_ULi3sM=xHWZ;j6EhRbU?>h zSLmJ8jqGzz6|gA(G4C~Td{|_0&g|IoA-5Y z6Vf7sF`sTyrtvaAEUGs@x!}GHmLx2>ml3Z3Ox zEa;=>>AZeSdGUUG;`JO%5eR4qalC;Vx9~=8<(xM%R<0Pby;O#BO6I!&=ZDAM>JJQWtfbo>Gq$F2!8kHL=~C7ekq?0WR4~#$lz?DaCakGcYCL zUXAe?C)697+7j**0`hl^cja}6$9y;MKYMiiFc#SK`1usK4%BOZ!h&J_LVZZRc9oy+xe#>m3*iC$x$44YH z{F55U1hVI@KfQs4q^`%9*2ZIELT*=o;9FExy@hsl{(VqwjqH~?8++TI-6$XZsg}{4|SaiU+E& zhyrQOMWYv(61rvIURD(PsAzEix~?RsUp0qvVv9K>+*%abU)>1WRMuZFrSpTcAL91M z<)~mm``0`aq?oW_QNJbs9)SQ#K)}T|DOMQLvcJRPJHT6!6Ar^BfbY{W;%pEoONh8=5 zlU_NpN}LCuojaXDq^mWcqnxR~QKW<*m7T zwIAqw-7|NVRV(CHba;={#)1AJ2G`oye$@HG?Z~mseW+(%ge6BxIc9VVzXb}b-B;+dv7NCJeYVGJ>$Fk)`GGG^sh3x7em63 z9IrBA0^c)8JY#Qef0_uI8wXabY1)r^#p&@a#2sj8$hd}_U4!bKoKSPu%z@vb*1 zRp1j-;zL@xCF&e+NYvbA$a!B#IM=acJ}B3Ko;fcgKzsHD!v2^6$U_dWdHB%>a`F0- z?XN8cH4j|XYg0r~$G&*gTPp~hbEt;d%Wb@<^*9?{v=BlU{F!u}sCv+`x)=FR1|e}9kT=oU4eFYYPF8DC z0T&z6rfI&ez=SoS+UH_9<5YVqSiAThXy3MLPw|c#$V({F)-m6QK1?gN_MWYXGJTd( z3oKP2VVkEwnwlZ#sJUBpfTRY+ZlyOyR+Am@DSSg8KMG#k9SxL9VIUKi@V zIX?T%jOVEMO46XF#ALv3uhPzq7=%J`Ue1xamq2;2q9?u44BbB1+|;!CASm5ipyj;& zDx^$ynGiHH0P-v3Y}NTV6^dysCo%U40`~aUzWXApP`j29nO1QJlkPCK7w7e%hAQb8 z>U}@Z-6_(MFI5DE7phY?+r9wRg2iU0n~0!hoXpKs(Q}}6-s+;1BmR(bsxupRB|~W) z+a96(cTj9mq}sjN2(Swdba;Qb1nQ{s4OhraLhYjC`_>hA#vFSn*tIxaHS`*kJj$$p*M1m|<9(&KB0B)eDBP2; ziZq0xZxbI%mz9E!vNY1I^R}SE)H`s3egWj(cyq?hcrPewR3KEbLhMi~t%TRXB!_xEMC6vsZuew2Z z78EjzD;jj%47&VmBBb_oKrTz_q%_;zP_VW$=-ss)P=Q*YkYwf!`O0Q2aeuKG3WV?r z{z58*5hF#-h2n4b9^XTn4W&|4B9_@D zLIq2urcY#dsA?(@zjsXxRNnt$$F7aVP^!&Pdr7_ll&HD9;=qGjP?*KMuIw5GMVAd3 zI(A?v*EGEtWYpv4IC=Eh{-!8_v;r5e+&mYiA_#qvZ z8EG?;AGwd0#K~M@eTR?FgHh`XT}xk7Lu&&Y{g5Ut9Oad)>|Y{<@@mNn`@_aTgZHK` zI|zZ$S!;@@5NGq7kmK{qhK;{+@>6#aK>#?InN84oQiiEPZ1&uEC8#)kn!~Qsf>2S& z1AJ2Q!iwq%5-UL&o+SA+EV6OI$l&JeJCQG%p4B=hruyM5&%E7p4$#tO57@1o`A|ur zbH1L$SEEY~5q^3@DfPMhXE%}*%|#;@Y+8TkZ6v?xsQY^xmYc4MDTCqrEdYR}@vAFzE^IcbLE8OJ0ty%=pSkSt)5$r%fj0z{twvY`5&|UUqb1JX{L-QbftQG z|LjVd%%Cy7+ybcHBL#x4zBG4tFIO)b$H$GV9jdn%$5W2Pw(u~$Jbb9%MLvzF(q;Mw zP#NFkgs5(7bw?KpxQ~w2;q(}&8$f4rhWI&7=%>*#0?Ys(A1WhA_jB~i?=Q9eys31S zv#X~U(|z^EGvYU(=wxujz)0uc_s5`N3iRL7`vM15IBQ z{CmD%cs`XyRhds4hHHc^(+w;yy6Z?*c$qIt#nzKS}+}kH~M9fHqXm~jz`*4!XJ%?r^6;{ zQX913z9|(h*5b+Vw1Yqjbz&u4Sg~N<_9T6%)!llNde#;)&G*yKXI+77w4~%zWn5P>bS1qq0_d*0zu!K7DmX9EPg38i0{SLjffe;1q4rSEIRm4EQ00={%4nkj zOmfX-C%ITbvfb6)yQ8mQ1>eV2DT!<>&UN~IS^f^ptJOB~0e2CWHqvuGWH1r(cB#5X zul9tp?)i6!drYC$j)OgpDjA^jA^$>8%R*GGJ(sAPB#mY1KBfxxQK+GP>&a9Z!T-M# zqaSzKINo77R#e~*P%hdIC12+!CM|OY+4hP1&i1{5(nW*TJM#}gT~Uo*US=~?4c>9& zN^&X|VDD-_yMF@|ythPhNWKD(S1fO*P4LFz7djMvy|okb`ll~g6t9dG*L}*8>r;i~ zNlQ(69j8F$ac83?-Adr3a{p&`CW4@AWrazy;8av!#tM89B!m@oJ!?%H!%>@`{Ds*Q zs{S8N%n!R7d1M>>s{yos80|WICh>J)(inl?9*v*2!H?$3`j-FQvEy@iErZ4!L$EAi O@c#gnnX0jnMF0RrD%-mN literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/env_data_objs.rda b/pandas/tests/io/data/rdata/env_data_objs.rda new file mode 100644 index 0000000000000000000000000000000000000000..61731d7774e4544850a6cb232ab1ac07625a06ba GIT binary patch literal 13735 zcmY+~do&d9;|K5}gir|~gd~woD&(@2q*7FpOR=8_No>e{FD^+Eo0P3wHc66La=$K- zJ4-I>zKdnucP(r0J3oE@`_1c|_sp3y=b3q)XU@!=c^t<*6enM`X3dd-e-G*BH7k^Xk>-?@q;zH%r_tb7Li94htO@l7Dr; zYVVz!2mZu_Vls&5x}Th4!im&crUf(HqLotR4dBOou@&QEvW$mJDCRHt&-BwH#`}lB z;^l*m)`frHPALQj81J8T(6Vjn_;H<)j6Hbj*YA7s-w%HI!5A7ybPUdP)(VeX`?4SH 
zecskh_0ikX&&Q0gj#&dm3TfmTv6|ehl}+Q)K*^0?B3AvWoMw7;o~4n;t|b2* zlI_qmCuCCfJ5D711?z;Kuh9PeFqkfp=;~Z?*hl0VL9cR#-2E&4fU)F(C+RxnjiX=? z_}avvkmuaY{?Mj?_e=6s1tPtH3Uf8)74jm&9E#j4@zla(jY0iDicEWFaLtpmS5+Nv zU7cF*9Z1Z~(lgh;ZF9}M{(5Z0lbV{`QT@>D8Mz_&C;A5)b>Uz4ZP6-^O@6qh)t-;m zsz5E7m>(~b*m!1q&-ub9xNJ5e-Re!j{~r1#=n#<9OJxudtPxXRghy}O*VDAJp8vGTGD_I#W}(O(NCTm4ho z*;mrMqOux)WhJD2Xf~6u)xA5NCfjqkuw}GuRJ$hpu@3^$BVT^4mECFWyWzRIKHK(= zT_?f1HKaj2BBtBbhIY97;l>xYBhvcJyHUB0v)9BJhf0gX81+pTJz6!@h}^!n)=7$} zim)dGaprfN%WDKuF*b*oKCK ztyvADPU^?R$V}NB^`K$aqk*af)6bphzDDDSlH=3B5gz6_kV?bs1Qh?pM;b#Ve4!G2e`;re4~rFeO%~s zge)C;QE;Ojr^q(a1iZ+=9yZugfPVu7k%2>Ou(be00z`PYC;&|Z=tqMFiaLAPsCPpK zUQc{BhG}`pKlZ~US$AoLqN{?jnjb zNWv6Y{)&Kolm!Ql*)bh%rb>!3mZdt)3Lw#Ehm26U787tWXj$ znkq(=X;9I|Q066bT%owhO!)v6r`(rB1UXfun(40$CQG z1davvAP(|QvL4VNjmVwb6>+3Nnvf!3A;d1;ZWfe;iv~mmrYtH67po=$mOvcf$S30&7o?;{~D5 zq!H&>izM$T%ua+T>m$iK1|yD;XE~F+qcKv5v#e1PB?=>qkPefj9VOjpOph@Vfl6fU z{0E`WswPolFk%Q*Ryc_gi;+QCgc;JjNIi||u|{H0=_#Q)kuXVG8tFvSy_iZNsK}H= z-I*|7+AztlaXhMWCvFE%h$h(x+J)Q8yGToH1PS4G*PRHvN2?$y#6tEU3|LwubQDAg zp~K20p<^nAp<+|gb>?A_v@McKqiIy=4(jfyeRY~)+B92IW}|6rs3>*Mlt`Uqm;$Yc zwo;Clu4phvJ>KKyQm^l zl64AU^8a%-Y+IG&gvgo@RccDP&L~X!fAqutQT>NCM?%L!q^HE{WWv;Fu>TRF{d4i3 zr$$xE6H;!w{y~Va&V<6%PIhF&-F&ZL;IKvtu5uzZ%2&pi|e^`fU7f7*oNecUne>2m{C~trvW?nN5E+Cyi$OAqBJMy4 zvo6sRNU}}PUATR`%e3@Hs1Qz^=S53zf{J2xS%|~-vrdpIqliKlim+3xhcq3MePdn} zQ39dMDkN1#6GaiqtUI)Pl6@0Y45z{ip_w*Pci@D1wlvcw>Mo3^g%nJQTCW;`8 zSd*mCSfVgOhGkA-#trTV~^-x zA~4sgJ8Awrv)ne8w-j~!$MDk2X}F(buW!=PgHzn>+V`B;AK%A3!@NOtGmY=Qy$IdM z81hQ-4FezYLhPXq>)eocTd$wyd!@Orq+VUeCM52=AsLVq8P}@D`i==C^qCllAqXN*3pB&+nYg zMgc}0xJJ%IsrpW9(^WqMm^aJNf_iL#=D_(E*pEX~p-Nj^S&rXA>CAEao6TH(mMClG z0xG9{v3PSm*X3D5kX|JBQ?yaZ%0fXbOzapzgA%bh9)pzbqeA5~%QlAM@Zq;zBDlRJ zV_c=bYhoiNkVfWS+9o#|+T1SksWl(UXs-s&z+-eV!V))l7L}n(L>wkf0hk$@R4nkx`J&O0Sygh4Y>js@ zUW0|)J(<@Y+V_Roc5iEgP(H<-4F@#o@TmJi>&*KrrJn>1Eo)*WZJ;y^PvvnD%Kjw@F9-TaX9U*eJ0q8K>zjy8O>09Mtn8+(w1v_E3LeF=uTYkh>4m- zH%`#sPN|&f+-`zM(R$2hK&KD&IV<*1HLX^{!fmrAjv1;+@L1S53Pb;0yFreO*^+&O z+<~Z7+HTe5de+>76_(j5F`M@$P?q-*#KT5|qj(o|lX(bUur@ENRT|ru72bwNwBl64 zw;lj0W06~a6|m3;mCl>myy4r3UFO`Kz&@L)^Jal+TKWyGS>2(6(S?muFtnw11l1A` zKEc^BYj_?e_UT@P*e9oe{Z}-lneG5;x>hmrWD(!GFyeD`uH)yDIYMLgT_C#aXGrvQ zj@FuxA`MW+t$mq@oxt6r3J0I47os?YNEauIj!KQ+LfoMih1}C8{I<;6>oT zl%cASUrMc5#5)b}vR9)oTl?hmQ_Ad5Umz!J1n2!&#i%`HmAnH_*Bd@WD6ZDMlWi0( zW0IJOC%9ru_XLusFHgy{3Y9k<|3f#BG#5cphf-52Uqyg9%hHgo{OFB-Y@(b|v__}I z3f@e<>LVxio!$9RnJrIqnUns)=AXO15Th4DRRT+iyH92b8>V>dNXf z-!=ZXxqCou_Qc6VR=QZj+pP>sq4Wi&tgf4oO`}?qzwCFrMhH`by0OWWN6Z}B|6i8) ziWQ2QkW)D53=zg#hAQ)@)BGninc`XR&1Qi)tsvL>WY1g8uo|sU+JV+4Fl`@)`BJ&J zF#1GG2Ph5E`v9F2$oeJ@jz}Ti`#=zg;4zOSRCP+?RV(rszYD%k*73F<|#3+1*LYFeOKjoYQ zl*T2d-`d+E;W`!z5JBg8ZKz6-LQCur(x&C+9V83_G-HhUyJDD2S- zxk~k6ophotSs-fBrYD#lIAQUCuJ>RIqP}IC-P2zA0de;?MRKL9PAR!^EG|{g43;*B zKDn;ACNlXaEn|9s#Lpj+)eS`gB0VRBfGrJkW4V5YfM8IsG-Yn2c_^vN(U-1 zYvXO@df?PXLr3K1W}w@8Z@l0Hy>KHO_onVUnL)X??l%BBOs(yHOZay0vfKNO_e?lP z9WvvoLMzg+yOH&~OVHQL^(U*6g05|uxY2Fh=x3%F@6^Nt7rb`~wQ zaUqafYnKGrH&B`IRh75EQ6wM-YtT^Ro(S=R|DwunX#l9cRWaZr$8D1@Z4#Ho=&n_O z&k0C-5z`q^c1#ib zi2f+^xvV&|Dm1(py-@;6;2og&LzMU_cFf!qZJQ{IkGx0bbT6rPVy48@-WtPpo2;u1 zYu8yeZomP$r}qWqS+>crj~eAaqBFVnR3lzXX5@L@=Lz2ol-W;8LmIhjtPjL}YKkFB z)Za_N`AHGawvT6-{pZaJdCN152);y>rg?(by1o+EMhukef8|pSLyE|MN@CMPR%+pK z|AgC7Jmm<24b?M_S(X81W;Pa(1mAc!W=E_rkad) z!=2+{1b5;s8uq;4)`-XHO^@R7W1DK$Ca(ps1?Q`9J^obzWKG#W3d{{8`n7}D3;81q z|CPh+hr^ac&CO=>Z&XFHH-s9Q5*8*Mj@gO)fKc57FbjVHZxn_{5G!AXBo?=6kmX8u9U7HKHpaV9mSYzgROa3Fjv2V=ix+E@|0L_{a32#O5pDL z)pO4#^2W3_vZyjFK~g3nubw;rT*&eud&{;121HJ3d%>R+fIKm0r;XGOhs~QKEtk5H 
z0>gG@0D_6JMyws8W4kQ{UYi3kJc<=vm3NyJd;~Nqer{fy+Yl90hY4@Tn}5SZT7tuO z<1V=U(%vL%^6J~MZy}q^3q&DuwGe{5%2TC}cUudh2RrcWj~JftG(insq=7QqQTx2ju;?yXI!4!R4Ag)HnD$+b~E0|~%6C*aJ716C7;>PpYRH#9JmF5?aGL~1*ZA1aVdkexoAj*}?;{XZQm8_R1#pLT z3L(4#5n3@6A{z(b^BQ7MuL$aK=xJbsMi1XM68!3Rc1xB=`an^MvY3E zWG)P(p5rE;cYM0W_R9!RIr&Cg8DT;WopX!kUpcH$quU2Y)osj?Z@IP)#c!WasG9uV zgeuw|ziXsz7w3)G)*Ir9R(-Oy5dTdjjy$kvO{Wn0fSVjeAut`ZWgg|K0XR(=LUzFd z!BO`K_7sG{$$gxwEKdpg4NLvCt`#2R*-9@}UI^7p*|);lMYnjtdrL*`A+?ed$a*rOqJ@3UsoMD z#@sz(Pv_P4qk`^|4u{hrpHsQVhKY^D_GXhK3x`3-0Bye+7qD-J&A%|Ffaz#H8Z)xV?7cNgr+J z-+dFHas;JV_s+~uoBrE1oG5JZ6MtcN{qXFG55hRJ1x4ruBkCw1<_wAbiEYa6le$>6 zj~r+1yngT@M)i=GdH9dd=s+P}$MP;kwWbbQbu6sx9kVZaecwZ{KLzIuj^tTM1O*!N|GQ-Vz4V7abOYBpKyTfoO^p3gfd_rk90Tz5M`GeU}chBmVVhjnC7Zc z4)g5hxP-ujj6UT=CS9K8KNY5hPdM?&P zeN12OE}eeRuBpY|XG-lAwDjggrVQ92dHx#RCfe6|anSktLFjW3<-#lV7mLbeBeM$2ao2MQtNG)F>6bxOwEkHLD#zy7 z_+x5r{pPr{?jMMBVWuj&@EB-tY4!B|%>yQzVauO~=C+^fTu3E)&Cy3Y;-y&xZ#nEI zYnZ*vPvrku`)!BOKIao`ZYx5VwPh`@PoB7(j~9-pWAxFSc!Hs@zP7ktJ1T_petlR+ zwBEu$m>>B#x50K{>-8DISNOLrH#Kiy%`Cf19Tsf<%)-2!S(C{KC}QStFEF8YNw(y_ z6h~LvdiYO4u*oxUNdLmK$R&E~E3}i~l_%hYcoSirWXr@^5W6B>RVOWeL(q_^(Bx?5 z;M6jIX}Q54@3dU5kG9DA`i^}Yg?8F_Ic4{=D(yDaP7TYc%VA>M{C-yT6PKUWxVB8y z5WlK+sYaMBAo?jmlhP59=F$v?6%){U_p5h-vovc}8Dz9tZ?X2w#M7KGYdCE79iLjV zWpI(;?2ARUB^!+Wd0Uw}tUt_2%`Ybqm+5B)Xvi~m%Lo+1Z7K8gIXL3YFU!|87SP3Y z?6?hz>9p~E%Hpp1u!R+d`pj&&hDNPOEVo#kbz@?C9Z`mv0L}$<&S!NKu?hd$Ks;bk zvPozw@y9w*vYbL*mb)#?8VrYDQj4$Q)u?!6f;`v;@|mjG2-C@;8ncq&MST}nDtOp} zNt{ISblJhC3tWQJa?%?9Ah*Di0^EY*p8hg(&ida?3XFPsF@kA7@rGbzAxCg_Lvxv0 z@??S&lzsW*@=v0bw|_YTpU%!_)N0f=)N{^pYOXI+Ol}w!0D9jaJAPvXZ9e!2X$nne zFdNSjmoU__b8p$|o14B}m8uQH_f%35{FAGhS}3y;H$cq(tb+xm-Ql;Xrg zs!<^tMD5#cy|r(;IEK?(ABl~jGO%38dh8}p-ZzUn;f{RTZ@+SMtFk@N51uO!K^$m^ zC+TjFrI?Or1zt5xv7EWILfV+)zJE+N?7 zaGTZ*&|Af|2{f=RseOs&%xh-80Y?#X+$$4T%^KTO)$V<1OPX|R9d(q{2X+FKQr3(nS_u7mUT>#^qeY+~h@@xE$L{vhFMf`YSSi0|yhg_a_BJ7?T31ygY( zyx~pFf;PelyQs_0ItgyHuFRnkvE}1=i~OW{nG|bC}{T2fPUc1$s_t& z(z*35sjOQl@AYQ4LI@nEGkal0eM>If)2tn)8GyStU;{C-68L>O#PtM6xw#uf9(p|M zvT}@ju-}rW8=$+zbKWSRp@XkLv@Zd2Wt9Y3KX947a@&kiwnFVIvoENYlo3nTt3%vl zT5GETB7z$Zn-+gm#Ylf~BW+V{CqW7P7My!t^O(h@O?C<^HB8oGKj0pkpQ!rRZQgQJ z?d9ZGND4mLW7PZc_I<-YpCO2ud{ z8rI={D#dLw!6lU}N1}p_>%4-Z+Fw*RajWL|a#;mPp8XutuD-4VV+4oio=tw(P=-%p z1>lZWl1|yZ0?*ay20OPMaeH|G6|zM?J`8!dHS8A?SkR&}jEOlt&ERP@Meq2spxgz1 zy_aK2Da&fhD8{BPaZ*Md)efa9B;c@T>Nok=mcj=70o>I(q@50bVHx!^e#U#!_6rlV zux`lbA6fp`l3n4}#Q|;9x1bQc=-aI+_4sd@3+jQ18DA=Y&i;0sSu|yhvG*n@7r>aG zJ@Nluh7ZHHA3FlP{wlmDaJXFLrw6qd(ca+vG#k=#PLm=uO1$dY>(k|gb{ud6c}~6+ zylGCekWR?p2FV>V4GJzv&(OXEp4Jq|3ui-bk)m-%#tyxQg@#iXx^_&ekoCGo)wAx}Z zqY!TAgOVwUty3^rG28#lT|32FnCsUcS$?)5x1MFca0%iAqULTN9P!E`#>VUlym}sz zVn1_jW<$l#qJOS`_I_nSOX()g({U4?wf!fpdgQ{je(PTKnn?!h0B2|ncFu7l&jMIj z$8tU6hHF=DF`S$}H@bH_+``bZmlwpatOY^6}{$vqGZb5@Lw|8KpU`y-Ma^>rr!;@nPQD zta-frfD&82o;?m(_@i1fb8OhSZ?*JFjzaZj_+<>%N*}JmIo1jT!R|g1jD2Qk&swt* z%{LiWnFVXwC9$v7QfxTR`8}F{-8c#je^LI{C6%;(rd$otJqhcCkgI+M-{fZ(5gy-g zTeN28Sm=uom=}KI8=!q#o?p=Ix#QYta7OTjya9qvy;_3gI<_6B-zvC>!A|wf*z)S{ zB+mL73}1jx_7OI!Kbddx-}Ei7ZiZi~jBT(RX4=ePdd6T{vu;c0)JpQ&v$y|hvYhjJ z2ya7O^G0px=`-GNx0h#R7B_GWy~~uH^^i;~>6hBOx*7DC+3~G@{4Gvh!}*PZzD0I$ zBqSl@t=-^iQCpkuScAP9_$_jff0%#G6;t^Y0w*4>qNUzm!D7^cBjB#`?BMFJ?+`}5 z!IRkAkk37q`|+FzEm#W*@)$wDi*ssnPD%kJ!`~5ggB!Y1};P?{wtTyqf9P>+2_!?QdDgoBZ&^ zb*QV^Z@B=2KkZkM0$@OWOvQp4R+`No`mo;qgb_aafqg!oor1eb_%x_=s%B!@N@1rj zN{8u?yvNz)a!OuAo9-FiI!_PS{IZ(Ci#l7$yJ;wpvdOTcLEV?X5ongo>BXUEcfVN^eFI%n@!pp$GU1=oa>D1y|eBGZ}TvEVskWxXO zDlx8d0due)$Knsaj8J1_e90;WjOdkrmlBczLxV3H$I6PUtTo9t=SrNv_{5GS0}YT( 
z0Z_{m=wd##S}Dz$0AC8(3JH*Z=LA0e>N0lunU%;^RlD_R{lBq7+g)B`T%V<*DAS>! zw1~Roky#C+7`JCEY>VaBwX4ap9*CcRj2iA;LgpvST3oF>rlbYsHA~yzo;(UQ{`uir zVEd@&7h5cP-r!YKEADS0q)=%z-{*@j_x0jDe!L*9Lry}eBG`Gg{CvQK~jn`Y5RI~ z-P~KsI^-RGuwD`ft-DobHqK1&rpiUl@h?9omZ3(zI`5i)9p-J3r<2kXZaYfX+W;Be zh?EI*5)6?Se?GIjpZyu-FlWi2p)um$q`macgp8CxdblF-a+-OUx>EnEu$!+P1Gnf9 zThfsXjI27+FTP&&4`%VDs_Z(`;$O zuV&>k-P?8nm(*z0-8tR4XY)Ejzjq=SPfDhSTyi?gm#TNv`#PU%kqU6~z#VXFdGC+e)>^hI2&Bwtb~<*RNM^LJk-A~&(8NT7lc}*1UX3g-eFLHR*N6sP%H7OEQM!5 z@2*(nt_3Y|m!%_#DYQ%un(a8XAEi-Q4KbpWo=yHQnAK3_m`o2C&qR8)gbpwtV-8g|-SSH$qe&jELH2Fp0>3W{W3JhE?9a7*jO^4s z!MmU9d*5L52iACa-^TUZ=F*;kIPTr_5@hz!kO1sZZ*%AD4bz{C&jjm_b5Op0x^_86 z;BuAz$cihq#v~ozPOE)o^JYKii)*|NW#m=#@PCE`d;_GdX|wlS%aaF@_s<}&4P}Ko zam6=01ZYDq)0ME7=^}!Zh|A7(nw|ybZtv%_o!Dj${n~B?pX7{_@ul2Y7tT=#iM|4+?#c zJ^sS0*DcrUe$Pf)Z$4lfSnyxVrm7^nrQJ73Rbd!Y+G;uDM9z94q6AqC&A`H*UDkC3 zB=7TGY)+|n4g6@=DnB=LhU@uacp+v_B1fqGxxPi0N2t>=A}5W$+C6(8rg>bq6E5eW z`@$=%4F%O*^_`2Ou|-JZBUT zQ$_>4-jn5O60hCQ+G!FV{uT(3KQK4X=nNWT{C<0%dA@QmHB;65uKxqKiXA$=?thvx z(X0E9^p6r8^m-!a7{T+*^3hxd-B_|TaAU&CKfPJUx#-J%?}pN{r5~)p|MHn z%2APCA(3&fYbTy=;k!`#jmpnES68*TKE0n?BeM25x^%*-+8vr*qiS-Eq0sLg84FRht zu(3t#^|;%O)%xShhh{m94QbGmg_EM54=ack+Fee6gD%CFg_kC=O@e@lb7g~7g{xhK zXH|?JT^|8Pgh8q|3_6v!>UiS7X^#XbA0oq9R^EtRj0>f@cubt5_8WdG{OEPvxN32s zf#=iu=5?J1My0ONi9Yd5v|6%8dBbrIQd4RVo*kC2b)5Ii^t--bzErN=`d%>>4Suuk zQM=aqwg-hYrA@oF#+2j@ve|6RYaN`Ce!+Xe&s%fcxh+*0xr4ExZl`|Wr|n+n<{bCZ z$Awpv9QK>&WVW>_+E1-L9qU*2R+7DZKfJlu$MrS6Yl^cZSWQjJO_)}cp_uvVEo}0@ z&8qOu4}ms7d~XCv(&P6WAA#5cp$w{sW3A;Votg>(KYm3vF|^8Q=NuE{Xnnir^xfoP)^f_8CbE;%;G@O zkLlQSsM>m&LD?34D@PV{MjlP&4~?##@jzf}OhV18rasJkEJZZ_K6-g5Wk^?2q44DK z8e65Z`R=yYT1DE@Hm^0UnjvB#qeA-?*CW*gaCcuac^Bi{{&&98*DzE<#CLM6xdGzb>MESEHE9Rcf zrV*7vru1Ru<7<%>#)p`R22c75M1<`|p?Zij2ELujqpfvb9+=gEmCpfP(|+kMpc4DRdzT5+j__w+3i7==`vM{1;}71y${oHD zI9K)Gy~X47;fsYO+Jd4W-U7YE0_q&@>G{F*<_7X>^%%$Rdfn)3YVr~{wCy|tMkz+% zDkdIGE~n=o*M3|6%n1iF1iGm6P`z$$HRCR!V+kH{B0eXe`Znv$%%o#Nzuk?|tJh+j zB@B_J|4k)K=5r>RO`#oZDa^RCA5xS!7P?d2_ex3Ku-4>hlG%*C%_3&<{t?BJCb- z5wt$_8eBCONuI^UokX^IPa%(5^<`q+3UUiLwnW~I7vXl^0_k5@a%5F+odO)y;^)-t z34v|vkyi#F<+g*BF5g1-F^pZMU_-7Xu+EIZaUB131~s;>j8 zq=~e17ffkd#2mFWj~`q%vt~f!e*~Wd&2-oPw*kPk4gRYDo%gmMK1W+Rue@HZGXPl% z-#s>P4EfXv;p4t2AIwG{t|%oZ)$BM;T*SREGAeU-vhwu&({~DMx0)Dl`dCn1=JVEL zpVPG#f8WTKH6QO5{{>$LOtz_c+87ac&nARE`go?jz3AZC!1i_tQ*YNcIeO9|(@Q^s zCEM7iuB~huJ!<|E@Yq(#>>O;S;N9+)CD1LegOrhJ&9?A1$2-X>3HrH?o?#QLeGf2j j`71r*C`IKP@7%}*-w&DGSm{Fws(|rEr}>9fcbdZc?~Kp`OAo z6n>>}o5CFmcPTVbXr$0Y;U0zGDBP#;fWkux%@kTFv{HCPp^ZX2g~t>+{=<+M_hBzV zX1eb6;`@C__X<+h?y3XnI`sa|^I|Zc<(Y~a%b;{7OtVx^asJ-#YJcq}q;KrtOgZ-k zk^?wB$uXT6-61z2~zdYE?*F(Q`4u$_9*TKWF82Unpy|tDUdEgWSM0{D&{+ zBYnndzwk{bKr;J{OTV%N^Wxr_>(m)gO;Zz&Gue>)ceZ$JpMkW_A&WfbW+d89@AP0U zz?SdIJ@fAJpc2vy3>bruzkMp$v?~W`9~6MUhiC}PgYy4Quv}$yYBP$g$(R<>8M#9I2B}2?3rWyaIi1uIR547 z3w2Z8pPP>?gCgGARwu4>CGzXM2VZSWC}8%5F`ChPLCxk(EVIEIjp|cl0MnQ<#UR2Az-&6x}`gkD(J5D3;6E7K0TNuXg&)jsKd z3q|vZ6&$xjNa>e41?#&&cF(?4Jarz}-oe&JTb4l6RO;Sp6$C}!AGwp9zk$>@$#vFr zK1g#1Wr#%=*y7fU5zULBX}cDEt)dZ%Ld)@2m$*P0ULeY>dI!nP*u%-gVPH228hpgv z&>W1+WY8W$@nY*xcfd(Vw}yFV=gt7Rt7m4VXgN6aR61D_0L_4iX*-w%h1-6Y2_5E; z9^V%-aG?bf$;QivcSeGfKDO+(wiOz;8IyWD8z2{&IY;g8fi&Qd18rg!BvEYV*60Fo z#$5J_qZL8L>Ubvu1@tDJ!d1CbUDo-z4dtjU`fTPMfwxmxC14*F9f+1ze3FG}+G^ zs&5ZW@NGH`xw2|{#>0t78L+SJKYv!Y@BLl+3|@g3QBL1%{S2ydTA=joa>$+Hi`F{z zBBigTF{+^vA_IhhTGlWoDW&q@?Blg*+|**I;6u$xBm)kt()6j z!K*MXWZ!auGST+Pft_)XN%Q7z8dU|?WoU;#pARv?psH8jLc*8<2824bKjGn8iH1JaBR zA+FW@tw9A03@l6_K2RGcNK;;7ZfY@*A%G;vSOO&e!FAe(t=rgn>ls8R<9{^8Oetv) 
z)eKIBedtSWsi}7c~31lJoP@^m0-iQj1dal2bv(fQ&0f4gm+h+|LJT90$#U+U)nZ+fU$uNO}#3EoMm!uX!1=tJp$`W%*VFAJcj)Kg*5*Qz7cV=#4 z2~3_VuQVqIMF(4PNlI!F%sB4E9H7~GKugQu3V_ySoBAu@4R2P}-~6xa$+Xj@+Q70cBJI&6JZp?KJ>lcdC% zlwH%fRJ@FW#3_%e#x18bzbF$JsHrKSxK1smY68_z5jn&CZ#02Yom?)%k3e25|}0dU^bWRuacl3 zfeNFl3CS->%%NtY40Tq1&#iWNG*MEa)U?htu$%pajkrW<|4jQ(|MHncCQ5bZ`MMGH z*Wn2)ukyQ+&XFvK!u5%BKPO5Pl_;UjO#)RqjwU62>vCoD?Pc$6r;nyeDioETy|PYi zNB$C*D%}?UzxR5Q0CB0(JTL6-mA3m+;6-KQqX{2Z1%(`FXit65>5xEFs)W|z1Uf?m z>uGxRhg5s^d|=?R9!;54NSTq9XNsrz3lf(yZ9~~FE1Mo6GG+enl8)WPmIhCmJNDc& zpEITP!1lu@_U-(AkEHSve13S)mWu18AOjP=W zrb_~)CEkQbr!rHq+U@sO^rSbMCdo;Y(Vy-=C|6}7GEJ7=x)3D(h{ziJbN;i>f0O#) zX>!-2%}1x*TXJCMYa^a(DxxHnnfTKrk!2>$)1}qA@4L6&9Zi>{r%M1-c^j=i8UO%A Cs3WTY literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/plants_arry.rds b/pandas/tests/io/data/rdata/plants_arry.rds new file mode 100644 index 0000000000000000000000000000000000000000..e1d7032acebeb5dcea077192b55d847154bf3256 GIT binary patch literal 584 zcmV-O0=NAiiwFP!000001D%q+YZO5khu^#1%O-bRJUldl8c!5I3JZfs1e=6Jv4~y~ zHHZ*4*-5hC?STP_nqIo zA2Tyo77<6B0Vpw*(&^b_2gXH;RgCBGOr+4lt}(w0`{5jXg=_Ex4niF!K^{penO8&}8I z_f=ZoZ?~qgoL%UzgRdgyTrXPNvG!3hXI{u@N8zW9o8_+AaV)2EEty7-`n48wng*<~ ze1dVr_Am(#V2}KJ1A70A?OE6A*;t3{p6t$bO~2r_)sEQlo0xkUHNCl}CRc{E9fUl+ z?vrw|V>G4r$Oe%`eoN^mQ@7ghf7>&f4S`xxajb&*B#pypxo_EB@DpWy)!fPb&iP31 zy@J>92HwIucn=?d*XDhK&$$?4W7N8(n_R;e6rz%o`_12s1BncNTbk;AGa~_k=wJVj0XgO>rt150KnRqhpv{zR;smeT+8dU|?WoU}ghiRv?psH8jLc*8(Ue43=PE5CCEk1{MZR zAkFQVm!6qlT##CnTTBBXE~lc({DO?il2mGoaJyIL=Aqe7MIqjRlGLKi6jVE@D~K8q zd@hM4i3%>MX_?8Hsd*((QFhmglFYnh7?;m4zeFJ$OmN40IK~B#7sajMyQRfd5O93M8OP8m<2hBc_s1b zMfs%#P$9PBlEjiySdwB*&MyUe8qKKWoWx?7Q7}QS6ks^%r4<2fK~v6IlwYm~vmNAB XrUoGK|Ns9tKxLsFi4{uHu~U-7ta3^cJ12EiE~^}^5<-n_?AwVFVj}6nR*5BP zAt7Tf<&xTn+-)vn?wc9AeYfw={Jzh}zcOWHa&2+bEv+Qn0IPH8Izwfs+ldCBwu2Qfk zcJ1X!7*lMCBWePqG|6VeeQ59V0?JVmP5`}lzPt4^l~wlgrn|d)OUA41dw~S3%9<7t z8-O@ie$Jhx&BYqW|En^7=&_r2I%l`ueB(r4E}G;UKKrV33O^Qkw#&)EoaQ|uI9$Il zO!34e_YQxf!<>XnLF+kSqNIZd^R4~4=Z5)h-yW?q|L60?sJ9oSPYt`Fk6S%B<)b-= zqpgQu8Q}F!<=z3>JLd+8SnM|y?4OLvKRm47`d*cr=ibr32FDD)-Rj}^*Q@TgVSBfW zFn_8N8UStC$_B(C(o&K0R$#?KK-uS+L1U_joGpKZ9Ll0R4IQYl^kTW{F}%YX9>4Ol z9(N1b<{$wLTDd|G6LM| zq1G8bO$vE{kcEnRSZN>yR?T5wK_|u3I2H9JgdK$p&ng#JkQR|f0m(Xr8v`mKC z3a#U@kEyP!BfSCIAj6mdnj-#Ib(jEF>M$1Zx2eMva8$=qk^a_&VAepG0f;Vyj(}yq zL5MF@%n(GMfjXps_MH9zbhHigK$Xw}?2#i5L)%B0E0oN`P;fJ3D8|IATw4GOIl@V` zm$wil4G`|o04~)c-CN~NGxb>66u<=PU158-N7_ZMT(mt>O>dD+q?*Q}-y_wG79k_m zmM=1mRMS~x9jUf%k!7Ts;UfD;HP|Ba$OWLiG+Zd?=hiY~0WI#zMH<`%&{EZM5GFNV zL>9saP&JbOtCpzMssVSP70?=$HHek6h07OV`k7V0f6!V;Ph|<(N`r;mer7EY3u&TO zt3k_EFi@-BnRmd^q<%)*PH5WiTt&XE|KFm!`fAaOV)YlK-&TXKL2V$JG5$}Np|(#A z+$sHAnAPw9e*^vthWpi=YsvTE8VD^l)GE??aFet^=r>SbV_CJR#&Q8U8V*Bgk{rMY z>5TBiBAtHoD)epYYLqs~1T>Sn2vhsbYti?p>rpF7YeC%z|3!KcYG>4>xk8TtS`9jy zx(cO1ItbpBjtQ;%u~qEbp3!R5H7Fg@4#vSns~BoK7Hwpx?Nk(Gl! 
zWgWWOL#PXSOUr~|1N>@sG!=$|lN`ZF=}O_ce*G#$3}!X+Hi~!)Otbu>d?TEfxk081<`lRMVp|Fs3mH2@<3lLc@OBs^A#zYNj?x2izfb6FwiX ztqr~x9;3!x##~8S2mT|?7kUk3_N$dt2S;;Pv7`Cbv;8Ht!LeLT!TLqZ1Pj=){My+8 zpGDzSg4^7sOdXOr=pJFKCiM}P_lH*t?r}Akh8&|s%QI0 zK?7ZjEQLw>9eZOH!X?I4%Oh564Qz9`<1AclT;Sn6{vT=?e7zc= ztJ(?vE%g)D_mis^TCpau#c_q2Y6ZAPsv~^4$b0}@LybYL1lFo_BR8pmSEa2&+5o$X z`X6dJuu=7HXpFphqsk>a{ghQ+nV{9&KIdn`;JZ^_zUxpRmzYO7!w4_L?TD-j<3)^wP&l=qING8aqM5-N|E z7ud11toM#2c|SYBM(jR;6@O*rHz*qC_-U{Df(`bZ*^<=dJGAliB=J0v~X{aRm)U zm03Hasn#;zzoh1ODiT`RmczV=;Y@G(>5DB^elQiZN{M|XSTXq|+F`6bS_K@UbA5;c zY5ZNy6Zl`=MEVAmB28{tfIU`!Fv3aPKpIVuKhKyUC&s$_T*yW)X> zBt95lnm0ZvzBT}@vNC4uZ!3RRTJ}(4v`I1>0c>V}ZJ}=>Y;jh8ZgZ{97Pxf+o;k8g z0}DjIEJ$3I#^1 z+(%|6E__>w7F#mNe<^e>=y7IF^e5tj?tx7{Tks_<=y0$%T{Zm`1Kpf>F4-Fz`%u z^5?ami(&}xh#XB`{$cXnQJYnOL!2+!L!bv*8^|!H1Oein;c>;$38m;?>KYMKatRvOi~9979oQvs9VGt{OAO{&H5f|+f#9Jjl6=J1 zhL<^2*AhGm@@$W&){H9hW_nhEWn`RxMifa;AEtV0m81;A1KUYuoU!2|mDCOI*f=IA znl>Z+b8}|OGmIcj+}Tre4giB#04bQmb-}MxcHAH+oLe#Y?vBtQK}j|0XOZcAc=!;H z%RZo0jZu!3SQ4mY?pZE4-VhK^ML26t(78s+NPSNgcc#a(Q@Kp3^-7iLfO%jX_^?CU z?&>7rzQm~|;J`d1w%6h#QA_FjA_^sDJ`5?pJ2SJXgSIIh=&$d%3auGVf0Xc$O&DM< zl)KVN-?@?$tv89s;o`TW~a`}#9< zykjam{`Tb!TJ(GW_9z`52Vusbvm9&qozJ6vo9z$2Kq)RK| z*=3kU-wVpPvkuRt@ZCSUddx?|J_gs9QinkJqwpd9E#%5y1|=KYn5&rHFu^fkDPjGe z+)4)w8puy>va>m`T=A@|bj|ym@Ac3-zv|@c0}to+5mn(VA#R zyYlOWr{v6~il8jjtv^9RiIvPNDhH}>HUqmSUN^6nMM>CPyAsaK}d#^dz;)ZlXG!sHI8E-bqb+M6>p$>ST)OmYv z*vKWgi>fjXPdDXPz>LLDC% zXK}I36Bozxe7$%bwUs-97+IEAxNC@}VuPv(&6gldHRP=y!i_J59hE((FK!4%EA5+s zC8`mMK}C#XaKD7zk{y4*&jjuyqyBr=lhTAn+LwZa*;15<*rM7X z&mRD{c)mc+7>B)ofcVP%5`MDMJn4;O{oL9&w%@3TL)_yUv+A63xv;39%aI2?$cn$f-J-hBAxX3vl)uB{i?9hFhB2N8AM|`< z8$)nv4%byZJ0bgh%J!u#KDN^Npi=SA&H4H5>`lR0DsS<#7)t7WTan4kF}b3vLg01J zp?`wsT*l6B?0H=#$mR|qaXDr5*XDmgQKEdE2vi*7jYzKJ9ceyio+8N{5xb4MnbN;0 zcPR~&$q1Jst=A}Xv>V0bhwP6=h1H;a)u|IKyQ?Dit0J2*{G-{H;H5aW)BEw?chFTf z@>DGs1g-Ddx8hOeX662e9llwhtxWN&;hdWz&YE-?I8kYS{V*iREitN$<^SGUu7Ny5 zRqn1>Nt{z&upO#F+ElP_q!F93wxpiBy~s|PHEz;_V@M>u!ZbpO(n0%s>9weh_t)?RW&3XfJAPV^roM}GFa=Wy=t@pKauVIRK8uYqd>qaUi! 
zX>_IhSvhUSv!nyNWreA^z*?Ez{8cjXL-=0V`-Ojzhp=}?eE;IwunxiJ!|1DIh27#N zJBLPo$Cv46yx@RsCHv+3&9pjo%iJCTb?@ss`&O4_)K_@R1Mk8SRqxkDRBlps??NQO z-@h{A2MJDy9;as^_#4Q?xAk_N<=Uvela3#Z=1L_yxUd621YQc^CdXo+;Nhpl?0RHP z`CU2Bh|%*QJ*uI69mGilBgITZ;3jS1%#5?0m%N%h0T|=v2g=Nq77$&aA=yR^E0e62 zTf&i>n3`X;zL^)n3G5av?w$(IW@=c4?X!pojVR22P&7%kiccDNh!8h~ZQqVZ2Ae6v zSdwD{=`CMsuYk37--(75oH_yhqTov1kPnail@-F^GX!oqegA@c45GfGzL2SxI&{T@ zoJqIC@ZlM3Gqn=?rY(HwydGH*YpY*=*@G;2WvnQ#CrlSmhyi2dTil))j@LkBy7<#T zdS7Rz!@_`Em&VV0<(*#cOe_1Nr=wC!Y~JTl zLk!gf(REMGGX-ZG=qVU6qLyRX%-_s=STfu5kb4;^3Lbn4X0YWvZVbcl1I-lAGn`o` zv0<7La5N%qq`nUpLskTUNiaJJCT z`~qp0Kho!WZx9mi-Bx&2$&g!78y}egxP3C05&ys9881Zh8Q4b%EotA;6gEn;>^t}N ztZS8>=~kw8h6}QiyOQ>TD*DEoyJOuP8+P=@Mna}j*Af<}F@7mHN|cn};75t2@Gqea zDR$v2ehftN(FbM^9)HPR*--ujd|zpK!+1^iDZ%DRW(;FCoA)>3PkPf8Hh#wySKp_K zphJ#k{abXrj~SO5$@=D{cfsQ~kaEt8$b-tA0LfAr{*uaSFojYgVaHF11&ze^gsN7_ zQt{_>Y$alqG8fu9w0E`g-V1$hQwliu2gnj^yaSlY{5%} zksH(R`8S5;`L5&@xSO%l8tNZsQ&St9jGYzVM#`6PvuifVbPrtS zA=qA-_27sj{zdX^=mAa0?Mo7Q1};lY4!eHbci+F5^)l8L(tnYa&+kqW2R4AY4oH0c z4Wp?>znHv&bit`@vhT^)XZw>cQ^ z5MU>IHwkDn&EpQxcg!W3up0&5Ci4~diQ%Wy9IO>*PdmwoqPpV7auKocjV58A2gew> z`DapCz`{vFW_-g@{(*UP(pH=M@Re%=4YTmIXnz|!XhoIsk7a&{RhkScuKvkz{u17n z=@7D+*&FRZX6)lWH5obFrcX2;#4njM&c=vnEq~i63%^g=w_R~qylzVCEzf?P>}}~ze4lCKavjY+Q8=WPvpYz!|A~BjV z_-LPya_eBw5yj<5SbzEH?}A;a3{WO!N9`Pu{1-yx^?fIjoVB_47+wwZB37MGi}QBb zLo)$OX4aTRYiXsSNVeKx?6^3lbSY;-M&gB%<75?=9M(rszOwj@0c11s!Yyi0C9`hm z3R@t$G-#!|&5-Gi&z3VH4$hT1f1wUKx3@%Ss?OI$Jj5i$6+KZneHZ*UsKMYKNAZ?k zs>IasZuyDw_uy|o;ZA+EqU0RR2rG5)Hn!4Ak8Ogc_nvuPXyl(Obh3G7XJyD%agU6xklLs=PLtGUX`qwU|*j_Plf#;+Ot)yw`#_> zm9=YH(h~8(%Io~Hsrsa7Bkukjinwqh?^o+5c02P|ZIYk02V34c{`_~}!bH#g5Y`Ze zwB0{9Deal_734d%qf&zBIutqQA`E9a^8u2mg*PaQe>Jhp)6OFT8UK1YUq^Kfp3k0Q zvpmbEJ5_#(10HNQ{oV(od`(e|-qwjks&u1g$vsoTE?2z4VWIOCo_gv|4_sZuPF z&fk$k@qwS-|4nqg<99=-9kZWPl%Mq}>J$GuU$>ldjuMEGqI>*%sFRG6p@8A;14Sa} z8tU4m?k>`#-A{_`cJt>-oUs?+UoQD~*6EY7hE;!EqHV90uWBcEO{aI&?eRhy(m!R+ z<^3}6aqRo<(4#8K41-!{8!%(R$_s3I*Sw)U2T_dbp%mkLt0EzuvQLOO!Tf`V|=8qCX``BS`M~vmz z!%-c>pQ%~lKYS8<)26x0%|o@6k+cz2FCrJsj3dn^Pmnm>d&2V-BuaT&Xr_W&;?nI3 zNZ7$W!$CRJxmZW964tFjSH9Jd@+D;Oj33Ll3SrixyYp`;{0CvbQXTV7URAy%y{7XT z5tcKw?mW1ph)HrOOT+yXSR#k`w?>CmPjA%rCN}iNc+cg9`~?}Frp}zTf@=L~;0~z# zTA&Vbj`1MmQEy*E8pN#8a*6Z3W{%V~7d7cWh?VPjBD)ZFqM;*$N9K=1!=0u?J;|UOW(+6GVhS|TzV|B2bJ!Q@zw1`9Qdr^-eJjZOxhW+Wc*m&nI zI-Z8#s?@_4V&n;~Nt1tB5VHlV#1DOcVy#tYm;?lI7@bHT?CPXGNFBV0R3-$cRCZ?1 z9VT)A+tOvhYygT1oTmQe{{!w&E>z9a$#i6)N9eG#?5L`_^o`GklF@C*U(AtoBY1^A z_w}u(^SHj|(y1@(pO~2}!cWvWbBkil2Ie?=g4!Lx*hl7}5`n@>3(c#vTfXuJwAU2` z%EI?GWR>srG2Q!X>fo`d{K!P|;E5rsG!_>RTuC(%TQqe7s4g!3OfP^v&3>W#VM6ZhE{^v8Th;ErAXJTRuG@ zP2oLe537=aZ2CxayWv+Q!7shfHXAyTRgcSreQ+hYj=X%@AND8haywY#R3;1_px!$iP@8!tKcqFfcMQxZBIs%puELdZq7eC7#v;DL;>_l z<}-3bJQ#x4r%^blh4d!m6Dg&SSm39nRK8H247oBNFa@0#=%MB*^i#7t?Xa!xxZ}ZN z7Ujic*Y|^04MS5S?+@0cd4$^m|KH5i-Go_wl4aA{)v=?H995%w8CFy}j=oH7sW`;9 zM@~@oCr05nP6g&hxhh{ezoN?j_C3ybn93vkgKTsWU1(?_S=jr>o)}XO9}FjXSSa)R z2&f*hWqBKUypg=n)t-0fKFTDpF7tK4E3^F~I&b0}sWs}zyh~D@muM;?mooX#h(Hf} zIPLti=W(IykSWTAbAXeFGDQ7E&o?4={c;bTLN?32hJJ_6RVX$pp>EcIsdt)4DsPc<2KEAul(R!`?Tv4S3TBq6B$y zMlOz>-?b)yU+JjAuYZ5SUr<_iwWa&wG7Bb(#f6GyaQ8$7@B1G^#m!=x%v2_d?W0bX zc88?E_W`fdMe^VqjB61 z3R%RE7SZ@rR1fRDX}i{Z#&x#qX6p9K6+amrMQNtM2`wV4XvVtZUB|a_mW}EZ$z$kx z$KC4_$v)7OW3XWPCyb@1H~;Xk9@A#x4d^HM8|Op4Qg>$VItgj#`HCirYZvTybQd;d{uJWz zdU~F6L1JW&no^#WzuT&{8~F%U!gres=m;aCKJp{S{N_p&k&hJw6HJJIhrxt-W|J;8 zERm`kJ0JP)M84zA3YkQ39|t_U+#X!6bzYfD%$;t%j9j=`CVUGjue4OzkZ=_H;uCzH zu4T&D$o-57Zqre!I?4rUEB!QW1d 
zKN?HnmU)pG)0v-nc-e<6qIi6)uM0W-NS`NZEH|z85!q+Y&i8`|uDX>h!EhYEk18z|m?Ln_QQZ!3=Rlltd#v4VyoIVOknSs0J>mih(%3+J| z=kSZQHPA^t2iqO9{2y)ivG%&E7Uo~JZt>yMlyrB$gbQ)TcwcJJU5LF62Ei>}=L`$vJ)LM#^CE`YrN@OzxvLJ{^St6nZ zL=K;Iu=jL(@3zg87*SC=K9s@y#DXw4wFrUV{xmb zm&$@BjCUfntOFRGE{xRmH1#~_D1WJZ-h>HAq!Kz9msdb9l=+dm-&n>k6A8o9f|p8K zr0y8Y_(dXoWSVewh4Dfuh}0D}8LqG>4?3uCr{VD{h!;vyq%MVJ1Si7ZOat*NZZDMI zv5a1c@X=}Ll@)?rGu+`9yXu0mesO*Ne*5cF@BRRvjG}yTeer&~sQlU7tK%zYCG!bXm<}^7d5?I76t}>p(zX}9Q{AJ1Np9j)SK$j$gT_!P>7^_Y7x{9rp zUBKkr<=fA7bn=^}AN?X@mSXdgUUW^I{<*v>qOPh5oL=_fsW@D?ukqB}<@D!S8_D>m zwtsz`3SZi1_x)VhZ>jjFmVb%;GW;ClmV#yf*QI^e|9{XYzLaqLdB)9$U5~g?)PWvl zNm^ObyVG4K{XXrRds6j3o74mZr}}^0mL-9K(eLN^Q<~~e+(MJGl!xz0oHk9`#rwSv zHuPovLZm44S$`QK#NDlHcEs; zE+eLK#+|+G`)31-OtVvfpuylaTuU-$O}kyjOS->axkW4w8UDfCRiIjiRu4!F%Tuin z6F>o(q!jer{uFH>bLI-VJUK=!s)?J^zDW=?=?1AVGSiMnJ=pLvOQ-i7N*D8G!pCJZ zH8&gM%}u`)NxB~mIQFscp+dP^>Jks7?yD_t$b|>#bWcBJ3qlusyGjR~bK=h*F%=rQ z11Q`sw9v_wV7YubAe*p0&6>$$qNx2x6yECjonzd*$Kv(U_I%EMhW*zom_q^*6jvfV zjWHvFFm)=!V=8mMb|ucDjzJPg$@GvOWrvlvUh+nO|2f3~q8=`pHDlb-`(QH4h{Vfi!b6(k&|;W5Wcv!~T@@nGcEUWhDY(dTn)rAH81X+&Qb8 zj8WUG9cvv7VIX~rB>08_GNg;f?Se8toT!1YVwzGHIK1h2m$g;99#V+g%`axN>w#<| zSQG{w4@sJb))EV6$-VDe0-00k)wTGsQ~p@Pj}F+t$xsWmw!=oYR4W0?5V}AG@<|l7 zmRgSs#d?9VuIEyVcg_UUk}d`*XYQ2vssnmgc1e6?*Sv@YJ*3hsE8{P{iS9ICs~ zNt7A6aR6tQ>D-&hB`gUWODIeIVPNZpA8AR~gwIRdo`pvA-l}mjHf@SItQ>bjv)gE2 z$r}3+)|!d)K=^zub#vO!<5||hU-gF|t|-oi+Sm(QcHuWutr|8!?1fFc8h)47Hr}3v z-3xOJ{7)QWFL!4>Ibz!w$zL^;x;9^|J9!5534kx z$Q)D6gdT^6F;O`j{Jj}39_Sw8WEWHJrHnF%*&Yd_YqHnUKEvfJRDD^!U2~hnVC}4# z1bnk?;Fa}4mjwqRoLw*W<&o;*Ey%#JlBgoj9Jt~0KID@iq1#7^&7`2D7S^-#7V%5S z$V~p&o)(=jH0T{>WNn>h=PovcD;`j-*Drf>0OVp>GFeJ4artc>(pl?UNm>v0tBtq9 zTn~}l;c-*#0%vYh(xtP<5LZ}NUI*X&TvWUky(zX*-Jc%S#t1mzLm8A@Q9s{h^47&V zs`s~~w(MoQ7+Z^CV~~3lXD{k4m3B&)={)Afn#NrUDm4YCEnf?pEEIkGfxHoL*Dtr( zAa{_8(T~(s0dgm149)FuX_@Y*OQw@}zt~KRS(rL1u~v^0tpH44vR31ypBP72cP-J+ zLZkDkP6{EZ9s~&=3eRIm!u4=_Xae;Z`lpSEQJ{eIb_q@uQ@E0$&}D>+)RB1HEl|=M zQH63@F3b3m6H1n{Z&7U0I!bZ*mzsr}~>7eRpz%EH4!JDA#&kLWAUxLJn~Y^XUb{q3kA*x)jsGMCM8S!m4- zR>er@a2(pszuC*QFYU81^e{bfy>a^v(#Wg5HH?=n+nEgSo9!Fi3VS2Xfo*9XdmIF# zGMzVi_Grym)rq9S*!g(dBaP~*6GEzGjWd08RbIl&Jf+Ph-;NdB<5@2t?E-jj14`wzn*!PAv^N5M%+^7N9q@%R2(ulju538UxKo^r*Fgpv6Xlt|1I`_(2OP=X4f$o&1sZCvgi@b05rl{kw7#B&r3be&51oin^7%T!r_?_HaL0+|fVjBIjCzf0rh9-=s?{hRzK|4)sV6qSl*LrJ)l$`?K(hq^1jr+&?VcOV)u$=Jv+GE@o9qQdh~6#O3|zfFQ&3Z(Cw`EAK0X?MZy5L)D6V${~;wDqQg|W%sm_Ug#m}?>$9K z!M8{FC+1IeeFNjCIM>n6ig$2sht?o38`(ie(oQjiR-Lg!BW{&pzJxbg=SQ#(hTcn2 zoR_LIvh17{@YZa_aJfndBhEg7z^Nv=}cl6otWV z5!BIWJ9Vjzrs{%&-6EZWan5?oC2XK-<7UkvoNwSHplYC}7+TrtS`GbqtUoU1`{3#s z$$9^aurkRYxC17f;5;zx!1)_bt9$V!bvQk|C^c9KIk3l16EhfGhtPj5@LWFLZQV3%Po2Ppv7~WBQ&M}*=tUF)l8FUriuCnv>VE$W!|^Z&mYJAYS@(J9D-E4 zeRLEdR|AdX0R?LfwwXI~pi3uM4ZRLnr@+>hr?{I~b)=!%=<0vl?E`Sqb9F)(l|oqq z_OQ;WlUXlsnAvEZ;)pE2=9w*n_~2&50XO;Q_B@cB%RK9qREt2r`#P4cDN!*ui73~w zIcrgufTnreNT4%*VhSiuEy69)sY$Mb8N=$aWNvdE{wvWCIzK(1pO?{i%kA2Qc-d51 zP4XWoZ)G8Ups!q$A`3c8^8>ezjX!Z)`&^iC3{fUHyXht%lfRz(oge#OS8SC8w ztpgqgvj0hbp-;KagYTB?jb5-{3z?5BKj*nLrx|K?SgnA{J8%Alu|>8Qh@RA6pDl{L z9S`wQhb6roJ~E?@?s*hZB;Pq1Zb2^4*WbWEOR+4}#K~e+Tlq7)L@3h@^{wJptoKq= ziyx7qdPR|z#g$L@C|dD@UCf$!##%k(S4k-WZOrM>#qJ343KK)>Hhjn8~$khM%yv<~7#6 zE^PuW#?rco1MpFU75TZeJI7FgysHIcn?_PaW@qDXQ1kGcWhGsCj8WQoRlg~`%}fyK z(AXO`L9X4Sj9KneKixvutBAUYsRXvxX!eROvpU`~BkzoPE6jJYn%CO5iT=L5^G|R> zY_WmARco_a#!5ww53HxUNoTds(ihEyLG9Szuv>#zY-O$B+W@90_@+Brez){rd$~h} ztYfrjP7a+#R*3t`(?7WATQwsdqW48JuQ|w(mjN&}*Uq?I-ko#|2vCwX!dX7P(n4op z;=z1}&5N;^(Jsb8M6>gCxo#hu{yvoP zkF>WT_Vr6M<7sZQ50Kd(kU4vllgM9jp#{lT61g4XmpQYDl$f@$T>aI`_?QIyU}7#q 
zJMy~**TP?QXg3v#wxBIoGmJ8$=2^->Az7={S_TZb-6+# z6MAuv6LUbSj%~`w-Qq7VF-l&>dZHPqSFi`Y5;Vgr0Y0=bT}tymq%KkC06e{;tXUxY z3^#q;cv=bENq#6HsyFh=?<70vNP>l;vL)f04DiQ5o|A&FDd<9LI52(0KueK)iFE>v zFNg9xcqe$?(w~}vTEPx{H>4rirSlWK*(xfG1qi9ljj;`Qz@~Ev-bCaaC2Cjw^?$C> z!{hc+8yJe|YEn&n#X!r|?4eK?J*}zj&4rdO*?NpefFM=dU#447UOdB58~o!T$y@AC z3d0?dCHnJHp4W{c{C7;AX4+1g#p#vzPyiHoQa-%@<0^!KiOIYLPcvO5+?BuPdh8D4 zwI6{WsO~mpw!APm-j_^K%0n9zI{^FA8F$%{Z-_PVO*wut9diZMHwk~^kRf)LtqfTW z#_@qhW`qxsi~UZ&h%#4U7(SWU@lMG4Os;4aau3CS_o8t4aQzPPR z^jG(sXNGhOk>OYZ`_g0Jto4rg9`SA*7B+FIdPo)C^xjprd?_CbW965qlR#ZMmOm4R zrWQnJzO~Pi&lc(xj%->|rtpN(P|}Wf)iI$*O{*@?^}7WX`Gi$6`BvJEwZL}Q%(cJ( zQL~v^r7w`-C4EwcZ028+dlIv-Cwld^u~HHs4TLg0k*n+_nuPOF=R*u)pD`H!v*bDm z#o`^6dt>!KJ|5lItadCsD&%Acc6ic3Aral_}TM3?$V?h(5|xiGrd zSlJ637iup2aLd|G-K~h&T~?_+Q&6`0D9%_(mXKAq`ZcD@^{u@FouD~#7C~33f(*~e z&LIipzzlJygi6cJ)2XoB{2I&r`e2jwRV*aZ1<7Bj^ il|EA$>2siOAAlmp#-SK!6=5yb1aDye?S!YUTlZgkDF9di literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/species_mtx.rds b/pandas/tests/io/data/rdata/species_mtx.rds new file mode 100644 index 0000000000000000000000000000000000000000..aa9ebe379e50a81a2dc3ad5a7b3bef3d04154e0b GIT binary patch literal 1907 zcmV-(2aNb1iwFP!000001MOEyY#c=tt?n7yGoHmx2ni%1nUEkQI0UgNVG*R=)0t`9 z(>-+e*q#6)PHe&wJH|0W9FQUrApzoo5C|c$h)saRc0%ET1RTN^Aub3B5JwQoB^*)U z{jRE>>Yg#d1a6e{yQ+Tu|Lgx(zm}dHWQ=84rXSRYChKd3doJ6B#wm~v(pZ+|Q5PRr zn3})i?zzR8gEI@MvF!e-BQt_miuk-f!+AQ|nIF*hK${sac~QRrs))A6l0}c4ON?m+ zW&9@40H`Ho)4EhPEMa@OC7bG}?Uh7*XAd8gk~xOYV2`oBW%^T2`J(ln@2`A0=aR;R z%7CahD7;?SQ!cbYmx#X1hgaD0^5H^dd^acunqW)k!&0`FTe7Kk+TJQrA5(Z4)>Sg! z;PW8pNnzuBF|ZmmB-+xpj6bW)FW3DMr9TE*30nUb{%=k7v+Uo=k+cn$^R#Cb|8m`G zS;C;7o%_0^=Ra3_|D^QV7e!HDbQay6pl?ARFz&nDN5?^zi?*)Rln;d4fWjlZUlM(? z@i=H5=m=YSU(^+O8N)xx2Q~H_>(2Q(ZdbofnXkaQO73CEz+e3t#^b9B*&i9tsT`Ac z5TC;la`Ju=`$E61)7NHEKLPrQb?{PTBN>NvFpzt#sL08p`rPa6y+CPSBYcwu_Vf=~ zbcUCY3th^K-QDwS{2gw54>z`;vlcWdM83esWc=QdYejd+yx>)mwse^zgvocPkso^c zZd78>ofMzi*Y=3>LPbu+M`E%?;Tg&p(Hgi+=NzzHcLU>i?_+#@Q~vBk?AedOI*HC_ zpc^?qLorY;t!8oVN48&QJV#b0{mGDiUbOLk(6Bc8cJynC|NSh!kMy-HxhJT7NQp_# zDc?OZAC;Vn$|gnUX4cskWD|EqmiFf`aClGXH9>(ghp=>3+OvO)^`H+ z=b#Z0cOQ>IJHYtfZwXr>r*l{E`ABX6cZUwSKGJm++zEq%u>Zc|b6#L1eU-5iU7_&$ zgpRJz?FtSb30qPYXJ6r0)Y}-dX#FT(zhr#BG9e>yc=9fm%r~|4x{v=(QO~L#cT`mV zi+5D7BHu$#oki|lxU`6HUq$;J7QZ_(KjTC1c$tsVZ+g4ad3GsFY_XL%z zL2t4+AGDVR|0~eF+_vgl|2*p7ShVH^$Y<|?T7z}YSt`o?k^LkUGv%-P){$qT>Ti$t zJX@j2QoUX9#XnPq-pN?g!rKBP+NVvHEYP)ggN?IeJ@hlvU_HwDBTu7GcM`p&u)g{! zZae!fYWhb#A?6l$!20LWrFYXXw#gzp-UC(_BV&R%_s@qav8>#G>Q<4ATti_Y>2#kmh+vwUtHe0`5H2VY0Mg!?(G z{;OxE@1E_r%SO4236UH`=B8$6DD3lu+!+o5WVwuPO~$PW(Hai!-@kZxaq3WIt2H@d zO>7hVe$(IDv}A7}>cxYVl+s9rOOFUKDW{u$JfSOF<1Nuz+p)5E*TT%y;>_I4bmhqW z{0^z#=lGW&m=>La8&(c4RKS{Blq|;-tV0K+`D#@vhh$s85J7h|PBwxr^G9z;vwtWM z4o73$VA(K3+x42E<+p3LErB;0&AL5m8*$$a>)u{Z_Vb2sn042#O2(iOz=CI2Ey>9m z<5nQmN9}RL8IR2sM}5OJ>$cbQVhbxxt64W3BWT)Hui^QaUjo8MRN*|76O6K1_uQ&) zoAt2Xj^`aT9Xqrf#|UG^s@+T|4q7nfs*d+tfWzsGyP;Qejj$ebYKYZWZItSsxLDZ$@x|@v^$fR0>d}cc~J;k{wM+J$8zOX)$&}} zN=#qGido>aP0O(2RI5SH^crA>@fr;oK2poI)9fk`1)M2%*#gR8T6nl-c{Rrig0vCG zG7Z=BQ|yvsb=xSqURW`Lz^=KL+-;?%wb$O8HX3+-6@jGKD_Q~IGipet)NBS4(;_s2 z&VCt=Hk66Q3={nPsNYU7N?0VHfHV{&!E`)71#iJ@n?^OEJhTU51Y%OM2q(vCUP5#b zVJBT#adXS9S$@NCu{a4)2WgVvl#xr6rAcn7VbokNB|6kVTGbJKlAUkTA*7n51nn|8 tMBI?Jd4gRGkZ!f4fnsQk8DmNHLdzLXVG$pw|KBO>%%7I+L+6DT007bjz$gF! 
literal 0
HcmV?d00001

diff --git a/pandas/tests/io/rdata/test_pyreadr.py b/pandas/tests/io/rdata/test_pyreadr.py
new file mode 100644
index 0000000000000..6aa6840fc0499
--- /dev/null
+++ b/pandas/tests/io/rdata/test_pyreadr.py
@@ -0,0 +1,596 @@
+from io import BytesIO
+import os
+from urllib.error import HTTPError
+
+import pytest
+
+from pandas import DataFrame
+import pandas._testing as tm
+
+from pandas.io.rdata import read_rdata
+
+pyreadr = pytest.importorskip("pyreadr")
+
+
+ghg_df = DataFrame(
+    {
+        "gas": {
+            141: "Carbon dioxide",
+            142: "Methane",
+            143: "Nitrous oxide",
+            144: "Fluorinated gases",
+            145: "Total",
+        },
+        "year": {141: 2018, 142: 2018, 143: 2018, 144: 2018, 145: 2018},
+        "emissions": {
+            141: 5424.881502132882,
+            142: 634.4571270782675,
+            143: 434.52855537666636,
+            144: 182.78243246177678,
+            145: 6676.649617049592,
+        },
+    }
+).rename_axis("rownames")
+
+plants_df = DataFrame(
+    {
+        "plant_group": {
+            16: "Pteridophytes",
+            17: "Pteridophytes",
+            18: "Pteridophytes",
+            19: "Pteridophytes",
+            20: "Pteridophytes",
+        },
+        "status": {
+            16: "Data Deficient",
+            17: "Extinct",
+            18: "Not Threatened",
+            19: "Possibly Threatened",
+            20: "Threatened",
+        },
+        "count": {16: 398, 17: 65, 18: 1294, 19: 408, 20: 1275},
+    }
+).rename_axis("rownames")
+
+sea_ice_df = DataFrame(
+    {
+        "year": {1012: 2016, 1013: 2017, 1014: 2018, 1015: 2019, 1016: 2020},
+        "mo": {1012: 12, 1013: 12, 1014: 12, 1015: 12, 1016: 12},
+        "data.type": {
+            1012: "Goddard",
+            1013: "Goddard",
+            1014: "Goddard",
+            1015: "Goddard",
+            1016: "NRTSI-G",
+        },
+        "region": {1012: "S", 1013: "S", 1014: "S", 1015: "S", 1016: "S"},
+        "extent": {1012: 8.28, 1013: 9.48, 1014: 9.19, 1015: 9.41, 1016: 10.44},
+        "area": {1012: 5.51, 1013: 6.23, 1014: 5.59, 1015: 6.59, 1016: 6.5},
+    }
+).rename_axis("rownames")
+
+
+@pytest.fixture(params=["rda", "rds"])
+def rtype(request):
+    return request.param
+
+
+@pytest.fixture(params=[None, False, "gzip"])
+def ok_comp(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, "bzip2", "xz"])
+def bad_comp(request):
+    return request.param
+
+
+def adj_int(df):
+    """
+    Convert int32 columns to int64.
+
+    Since the pyreadr engine reads ints into int32 and writes ints
+    to floats, this function converts such columns for testing.
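+
+    A minimal sketch of the intended adjustment (the column name is
+    illustrative):
+
+    >>> adj_int(DataFrame({"year": [2020]}, dtype="int32")).dtypes
+    year    int64
+    dtype: object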
+ """ + int_cols = df.select_dtypes("int32").columns + df[int_cols] = df[int_cols].astype("int64") + + if "index" in df.columns: + df["index"] = df["index"].astype("int64") + + if "year" in df.columns: + df["year"] = df["year"].astype("int64") + if "mo" in df.columns: + df["mo"] = df["mo"].astype("int64") + + return df + + +# RDA READER + +# PATH_OR_BUFFER + + +def test_read_rds_file(datapath): + filename = datapath("io", "data", "rdata", "ghg_df.rds") + r_df = read_rdata(filename, engine="pyreadr") + r_df = adj_int(r_df) + + tm.assert_frame_equal(ghg_df, r_df.tail()) + + +def test_read_rda_file(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="pyreadr") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_bytes_read_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + r_df = read_rdata(f.read(), file_format="rds", engine="pyreadr") + + r_df = adj_int(r_df) + + tm.assert_frame_equal(sea_ice_df, r_df.tail()) + + +def test_bytes_read_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + r_dfs = read_rdata(f.read(), file_format="rda", engine="pyreadr") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_bytesio_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_df = read_rdata(b_io, file_format="rds", engine="pyreadr") + + r_df = adj_int(r_df) + + tm.assert_frame_equal(sea_ice_df, r_df.tail()) + + +def test_bytesio_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_dfs = read_rdata(b_io, file_format="rda", engine="pyreadr") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +# FILE FORMAT + + +def test_read_wrong_format(datapath): + with pytest.raises(ValueError, match="not a valid value for file_format"): + filename = datapath("io", "data", "rdata", "plants_df.rds") + read_rdata(filename, engine="pyreadr", file_format="r") + + +def test_read_wrong_file(): + with pytest.raises(FileNotFoundError, match="file cannot be found"): + filename = os.path.join("data", "rdata", "plants_df.rda") + read_rdata(filename, engine="pyreadr") + + +def test_read_rds_non_df(datapath): + from pyreadr import custom_errors + + with pytest.raises( + custom_errors.LibrdataError, + match="Invalid file, or file has unsupported features", + ): + filename = datapath("io", "data", "rdata", "ppm_ts.rds") + read_rdata(filename, engine="pyreadr") + + +def test_read_rda_non_dfs(datapath): + from pyreadr import custom_errors + + with pytest.raises( + 
custom_errors.LibrdataError, + match="Invalid file, or file has unsupported features", + ): + filename = datapath("io", "data", "rdata", "env_data_non_dfs.rda") + read_rdata(filename, engine="pyreadr") + + +def test_read_not_rda_file(datapath): + from pyreadr import custom_errors + + with pytest.raises( + custom_errors.LibrdataError, match="The file contains an unrecognized object" + ): + filename = datapath("io", "data", "rdata", "ppm_df.csv") + read_rdata(filename, file_format="rda", engine="pyreadr") + + +def test_bytes_read_infer_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f.read(), engine="pyreadr") + + +def test_bytes_read_infer_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f.read(), engine="pyreadr") + + +# URL + + +@tm.network +def test_read_rda_url(): + url_df = DataFrame( + { + "carrier": {1: "9E", 2: "AA", 3: "AS", 4: "B6", 5: "DL"}, + "name": { + 1: "Endeavor Air Inc.", + 2: "American Airlines Inc.", + 3: "Alaska Airlines Inc.", + 4: "JetBlue Airways", + 5: "Delta Air Lines Inc.", + }, + } + ).rename_axis("rownames") + + url = ( + "https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" + ) + r_dfs = read_rdata(url, file_format="rda", engine="pyreadr") + + tm.assert_frame_equal(url_df, r_dfs["airlines"].head()) + + +@tm.network +def test_read_unable_infer_format(): + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + url = ( + "https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true" + ) + read_rdata(url, engine="pyreadr") + + +@tm.network +def test_read_wrong_url(): + with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): + url = "https://example.com/data.rdata" + read_rdata(url, engine="pyreadr") + + +# S3 + + +@tm.network +@pytest.mark.slow +def test_read_rda_s3(): + s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" + s3_df = DataFrame( + { + "Alcohol": {1: 13.2, 2: 13.16, 3: 14.37, 4: 13.24, 5: 14.2}, + "Malic.acid": {1: 1.78, 2: 2.36, 3: 1.95, 4: 2.59, 5: 1.76}, + "Ash": {1: 2.14, 2: 2.67, 3: 2.5, 4: 2.87, 5: 2.45}, + "Alcalinity.of.ash": {1: 11.2, 2: 18.6, 3: 16.8, 4: 21.0, 5: 15.2}, + "Magnesium": {1: 100, 2: 101, 3: 113, 4: 118, 5: 112}, + "Total.phenols": {1: 2.65, 2: 2.8, 3: 3.85, 4: 2.8, 5: 3.27}, + "Flavanoids": {1: 2.76, 2: 3.24, 3: 3.49, 4: 2.69, 5: 3.39}, + "Nonflavanoid.phenols": {1: 0.26, 2: 0.3, 3: 0.24, 4: 0.39, 5: 0.34}, + "Proanthocyanins": {1: 1.28, 2: 2.81, 3: 2.18, 4: 1.82, 5: 1.97}, + "Color.intensity": {1: 4.38, 2: 5.68, 3: 7.8, 4: 4.32, 5: 6.75}, + "Hue": {1: 3.4, 2: 3.17, 3: 3.45, 4: 2.93, 5: 2.85}, + "Proline": {1: 1050, 2: 1185, 3: 1480, 4: 735, 5: 1450}, + } + ).rename_axis("rownames") + r_dfs = read_rdata(s3, engine="pyreadr") + r_dfs["wine"] = adj_int(r_dfs["wine"]) + + # pyreadr remove dots in colnames + r_dfs["wine"].columns = r_dfs["wine"].columns.str.replace(" ", ".") + + tm.assert_frame_equal(s3_df, r_dfs["wine"].head()) + + +# ENGINE + + +def test_read_rds_df_output(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="pyreadr") + + assert isinstance(r_df, DataFrame) + + +def test_read_rda_dict_output(datapath): + filename = 
datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="pyreadr") + + assert isinstance(r_dfs, dict) + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + +def test_read_wrong_engine(datapath): + with pytest.raises(ValueError, match="not a supported engine"): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + read_rdata(filename, engine="rpy2") + + +# MODE + +# IGNORED OPTION FOR pyreadr ENGINE + + +# USE_OBJECTS + + +def test_read_select_frames_rda_dfs(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata( + filename, engine="pyreadr", select_frames=["ghg_df", "sea_ice_df"] + ) + + assert "plants_df" not in list(r_dfs.keys()) + assert "ghg_df" in list(r_dfs.keys()) + assert "sea_ice_df" in list(r_dfs.keys()) + + +def test_read_wrong_select_frames(datapath): + with pytest.raises(TypeError, match="not a valid type for select_frames"): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + read_rdata(filename, engine="pyreadr", select_frames="plants_df") + + +# ROWNAMES + + +def test_read_rownames_true_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="pyreadr", rownames=True) + + assert r_df.index.name == "rownames" + + +def test_read_rownames_false_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="pyreadr", rownames=False) + + assert r_df.index.name != "rownames" + + +def test_read_rownames_true_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="pyreadr", rownames=True) + + assert r_dfs["ghg_df"].index.name == "rownames" + assert r_dfs["plants_df"].index.name == "rownames" + assert r_dfs["sea_ice_df"].index.name == "rownames" + + +def test_read_rownames_false_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="pyreadr", rownames=False) + + assert r_dfs["ghg_df"].index.name != "rownames" + assert r_dfs["plants_df"].index.name != "rownames" + assert r_dfs["sea_ice_df"].index.name != "rownames" + + +# ENCODING + + +def test_non_utf8_data(datapath, rtype): + filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") + with pytest.raises(UnicodeDecodeError, match=("'utf-8' codec can't decode byte")): + read_rdata(filename, engine="pyreadr") + + +# RDA WRITER + +# PATH_OR_BUFFER + + +def test_write_read_file(rtype): + with tm.ensure_clean("test.out") as path: + ghg_df.to_rdata(path, file_format=rtype, engine="pyreadr", index=False) + r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr", rownames=False) + + expected = ghg_df.reset_index(drop=True) + output = ( + adj_int(r_dfs["pandas_dataframe"]) if rtype == "rda" else adj_int(r_dfs) + ) + + tm.assert_frame_equal(output, expected) + + +def test_write_read_pathlib(rtype): + from pathlib import Path + + with tm.ensure_clean_dir() as tmp_dir: + tmp_file = Path(tmp_dir).joinpath("test.out") + sea_ice_df.to_rdata(tmp_file, file_format=rtype, engine="pyreadr", index=False) + r_dfs = read_rdata( + tmp_file, file_format=rtype, engine="pyreadr", rownames=False + ) + + expected = sea_ice_df.reset_index(drop=True) + output = ( + adj_int(r_dfs["pandas_dataframe"]) if rtype == "rda" else adj_int(r_dfs) + ) + + tm.assert_frame_equal(output, expected) + + +def test_write_read_filelike(rtype): + with BytesIO() as b_io: + sea_ice_df.to_rdata(b_io, 
file_format=rtype, engine="pyreadr", index=False) + r_dfs = read_rdata( + b_io.getvalue(), file_format=rtype, engine="pyreadr", rownames=False + ) + + expected = sea_ice_df.reset_index(drop=True) + output = ( + adj_int(r_dfs["pandas_dataframe"]) if rtype == "rda" else adj_int(r_dfs) + ) + + tm.assert_frame_equal(output, expected) + + +# FILE FORMAT + + +def test_write_wrong_format(): + with tm.ensure_clean("test.rda") as path: + with pytest.raises(ValueError, match=("not a valid value for file_format")): + ghg_df.to_rdata(path, engine="pyreadr", file_format="csv") + + +def test_write_unable_to_infer(): + with tm.ensure_clean("test") as path: + with pytest.raises( + ValueError, match=("Unable to infer file format from file name") + ): + ghg_df.to_rdata(path, engine="pyreadr") + + +# ENGINE + + +def test_write_wrong_engine(): + with tm.ensure_clean("test.rda") as path: + with pytest.raises(ValueError, match=("not a supported engine")): + ghg_df.to_rdata(path, engine="rpy2") + + +# MODE + +# IGNORED OPTION FOR pyreadr ENGINE + + +# INDEX + + +def test_index_true(rtype): + with tm.ensure_clean("test.out") as path: + plants_df.rename_axis(None).to_rdata( + path, file_format=rtype, engine="pyreadr", index=True + ) + r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr") + + r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + + assert "index" in r_df.columns + + +def test_index_false(rtype): + with tm.ensure_clean("test.out") as path: + plants_df.rename_axis(None).to_rdata( + path, file_format=rtype, engine="pyreadr", index=False + ) + r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr") + + r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + + assert "index" not in r_df.columns + + +# ASCII + +# IGNORED OPTION FOR pyreadr ENGINE + + +# COMPRESS + + +def test_compress_ok_comp(rtype, ok_comp): + with tm.ensure_clean("test.out") as path: + ghg_df.to_rdata( + path, file_format=rtype, engine="pyreadr", compress=ok_comp, index=False + ) + r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr", rownames=False) + + expected = ghg_df.reset_index(drop=True) + output = ( + adj_int(r_dfs["pandas_dataframe"]) if rtype == "rda" else adj_int(r_dfs) + ) + + tm.assert_frame_equal(output, expected) + + +def test_compress_bad_comp(rtype, bad_comp): + from pyreadr import custom_errors + + with tm.ensure_clean("test.out") as path: + with pytest.raises( + custom_errors.PyreadrError, + match=(f"compression {bad_comp} not implemented!"), + ): + ghg_df.to_rdata( + path, + file_format=rtype, + engine="pyreadr", + index=False, + compress=bad_comp, + ) + + +def test_compress_zip(rtype): + with tm.ensure_clean("test.out") as path: + with pytest.raises(ValueError, match=("not a supported value for compress")): + ghg_df.to_rdata( + path, file_format=rtype, engine="pyreadr", index=False, compress="zip" + ) + + +# OTHER_FRAMES + +# IGNORED OPTION FOR pyreadr ENGINE + + +# RDA_NAMES + + +def test_new_rda_name(): + with tm.ensure_clean("test.rda") as path: + ghg_df.to_rdata(path, engine="pyreadr", rda_names=["py_df"]) + r_dfs = read_rdata(path, engine="pyreadr") + + assert "py_df" in list(r_dfs.keys()) + + +def test_type_rda_name(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises(TypeError, match=("not a valid type for rda_names")): + ghg_df.to_rdata(path, engine="rscript", rda_names="py)df") diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py new file mode 100644 index 0000000000000..660187d3c2a1b --- /dev/null +++ 
b/pandas/tests/io/rdata/test_rscript.py @@ -0,0 +1,972 @@ +from io import BytesIO +import os +import subprocess +from urllib.error import HTTPError + +import pytest + +from pandas.compat._optional import import_optional_dependency +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.rdata import ( + RSCRIPT_EXISTS, + RScriptError, + read_rdata, +) + +pytestmark = pytest.mark.skipif(not RSCRIPT_EXISTS, reason="R is not installed.") + +ghg_df = DataFrame( + { + "gas": { + "141": "Carbon dioxide", + "142": "Methane", + "143": "Nitrous oxide", + "144": "Fluorinated gases", + "145": "Total", + }, + "year": {"141": 2018, "142": 2018, "143": 2018, "144": 2018, "145": 2018}, + "emissions": { + "141": 5424.88150213288, + "142": 634.457127078267, + "143": 434.528555376666, + "144": 182.782432461777, + "145": 6676.64961704959, + }, + } +).rename_axis("rownames") + +plants_df = DataFrame( + { + "plant_group": { + "16": "Pteridophytes", + "17": "Pteridophytes", + "18": "Pteridophytes", + "19": "Pteridophytes", + "20": "Pteridophytes", + }, + "status": { + "16": "Data Deficient", + "17": "Extinct", + "18": "Not Threatened", + "19": "Possibly Threatened", + "20": "Threatened", + }, + "count": {"16": 398, "17": 65, "18": 1294, "19": 408, "20": 1275}, + } +).rename_axis("rownames") + +sea_ice_df = DataFrame( + { + "year": {"1012": 2016, "1013": 2017, "1014": 2018, "1015": 2019, "1016": 2020}, + "mo": {"1012": 12, "1013": 12, "1014": 12, "1015": 12, "1016": 12}, + "data.type": { + "1012": "Goddard", + "1013": "Goddard", + "1014": "Goddard", + "1015": "Goddard", + "1016": "NRTSI-G", + }, + "region": {"1012": "S", "1013": "S", "1014": "S", "1015": "S", "1016": "S"}, + "extent": { + "1012": 8.28, + "1013": 9.48, + "1014": 9.19, + "1015": 9.41, + "1016": 10.44, + }, + "area": {"1012": 5.51, "1013": 6.23, "1014": 5.59, "1015": 6.59, "1016": 6.5}, + } +).rename_axis("rownames") + + +def r_package_installed(name): + """ + Check if R package is installed. + + Method runs a quick command line call to Rscript to + check if library call succeeds on named package. + """ + + p = subprocess.Popen( + ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, err = p.communicate() + + return len(err) == 0 + + +def run_rscript(cmds) -> str: + """ + Run R script at command line. + + This method will read write_rdata output and check + console output. + """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + + switch(args[2], + "rda" = load(args[1]), + "rds" = { + pandas_dataframe <- readRDS(args[1]) + } + ) + + rm(args) + mget(ls()) + """ + with open(cmds[1], "w") as f: + f.write(r_batch) + + p = subprocess.Popen( + cmds, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + output, error = p.communicate() + if len(error) != 0: + raise ValueError(error.decode("UTF-8")) + + return output.decode("UTF-8") + + +def adj_int(df): + """ + Convert int32 columns to int64. + + Since parquet and feather modes parses ints int int32, + this method converts for testing. 
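+
+    For example, with a hypothetical frame (not one of the fixtures
+    defined above):
+
+    >>> from pandas import DataFrame
+    >>> df = DataFrame({"mo": [12]}).astype({"mo": "int32"})
+    >>> adj_int(df)["mo"].dtype
+    dtype('int64')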
+ """ + for col in df.select_dtypes("int32").columns: + df[col] = df[col].astype("int64") + + return df + + +def handle_index_rownames(df): + df = df.drop(["rownames"], axis=1).set_index("index").rename_axis(None) + + return df + + +R_ARROW = r_package_installed("arrow") +R_RSQLITE = r_package_installed("RSQLite") +PYARROW = import_optional_dependency("pyarrow") + + +@pytest.fixture(params=["rda", "rds"]) +def rtype(request): + return request.param + + +@pytest.fixture( + params=[ + "csv", + pytest.param( + "parquet", + marks=pytest.mark.skipif( + R_ARROW is None or PYARROW is None, + reason="R arrow or pyarrow not installed", + ), + ), + pytest.param( + "feather", + marks=pytest.mark.skipif( + R_ARROW is None or PYARROW is None, + reason="R arrow or pyarrow not installed", + ), + ), + pytest.param( + "sqlite", + marks=pytest.mark.skipif( + R_RSQLITE is None, reason="R RSQLite not installed" + ), + ), + ] +) +def mode(request): + return request.param + + +@pytest.fixture(params=[True, False, None]) +def ascii(request): + return request.param + + +@pytest.fixture(params=[False, "gzip", "bzip2", "xz"]) +def comp(request): + return request.param + + +# RDA READER + +# PATH_OR_BUFFER + + +def test_read_rds_file(datapath): + filename = datapath("io", "data", "rdata", "ghg_df.rds") + r_df = read_rdata(filename, engine="rscript") + + tm.assert_frame_equal(ghg_df, r_df.tail()) + + +def test_read_rda_file(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript") + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_buffer_read_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + r_df = read_rdata(f, file_format="rds", engine="rscript") + + r_df = adj_int(r_df) + + tm.assert_frame_equal(sea_ice_df, r_df.tail()) + + +def test_bytes_read_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + r_dfs = read_rdata(f.read(), file_format="rda", engine="rscript") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_bytesio_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_df = read_rdata(b_io, file_format="rds", engine="rscript") + + r_df = adj_int(r_df) + + tm.assert_frame_equal(sea_ice_df, r_df.tail()) + + +def test_bytesio_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_dfs = read_rdata(b_io, file_format="rda", engine="rscript") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +# FILE FORMAT + + +def test_read_wrong_format(datapath): + with pytest.raises(ValueError, match="not 
a valid value for file_format"): + filename = datapath("io", "data", "rdata", "plants_df.rds") + read_rdata(filename, engine="rscript", file_format="r") + + +def test_read_wrong_file(): + with pytest.raises(FileNotFoundError, match="file cannot be found"): + filename = os.path.join("data", "rdata", "plants_df.rda") + read_rdata(filename, engine="rscript") + + +@pytest.mark.slow +def test_read_rds_non_dfs(datapath, mode): + with pytest.raises( + ValueError, match="No actual data frame or coercible data frames" + ): + filename = datapath("io", "data", "rdata", "ghg_t_tests.rds") + read_rdata(filename, engine="rscript", mode=mode) + + +@pytest.mark.slow +def test_read_rda_non_dfs(datapath, mode): + with pytest.raises( + ValueError, match="No actual data frame or coercible data frames" + ): + filename = datapath("io", "data", "rdata", "env_data_non_dfs.rda") + read_rdata(filename, engine="rscript", mode=mode) + + +def test_read_not_rda_file(datapath, mode): + with pytest.raises(RScriptError, match="bad restore file magic number"): + read_rdata( + datapath("io", "data", "rdata", "ppm_df.csv"), + file_format="rda", + engine="rscript", + mode=mode, + ) + + +def test_read_not_rds_file(datapath, mode): + with pytest.raises(RScriptError, match="unknown input format"): + read_rdata( + datapath("io", "data", "rdata", "ppm_df.csv"), + file_format="rds", + engine="rscript", + mode=mode, + ) + + +def test_bytes_read_infer_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f.read(), engine="rscript") + + +def test_bytes_read_infer_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f.read(), engine="rscript") + + +# URL + + +@tm.network +def test_read_rda_url(): + url_df = DataFrame( + { + "carrier": {"1": "9E", "2": "AA", "3": "AS", "4": "B6", "5": "DL"}, + "name": { + "1": "Endeavor Air Inc.", + "2": "American Airlines Inc.", + "3": "Alaska Airlines Inc.", + "4": "JetBlue Airways", + "5": "Delta Air Lines Inc.", + }, + } + ).rename_axis("rownames") + + url = ( + "https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" + ) + r_df = read_rdata(url, file_format="rda", engine="rscript")["airlines"] + + tm.assert_frame_equal(url_df, r_df.head()) + + +@tm.network +def test_read_unable_infer_format(): + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + url = ( + "https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true" + ) + read_rdata(url, engine="rscript") + + +@tm.network +def test_read_wrong_url(): + with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): + url = "https://example.com/data.rdata" + read_rdata(url, engine="rscript") + + +# S3 + + +@tm.network +@pytest.mark.slow +def test_read_rda_s3(): + s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" + s3_df = DataFrame( + { + "Alcohol": {"1": 13.2, "2": 13.16, "3": 14.37, "4": 13.24, "5": 14.2}, + "Malic acid": {"1": 1.78, "2": 2.36, "3": 1.95, "4": 2.59, "5": 1.76}, + "Ash": {"1": 2.14, "2": 2.67, "3": 2.5, "4": 2.87, "5": 2.45}, + "Alcalinity of ash": { + "1": 11.2, + "2": 18.6, + "3": 16.8, + "4": 21.0, + "5": 15.2, + }, + "Magnesium": {"1": 100, "2": 101, "3": 113, "4": 118, "5": 112}, + "Total phenols": {"1": 
2.65, "2": 2.8, "3": 3.85, "4": 2.8, "5": 3.27}, + "Flavanoids": {"1": 2.76, "2": 3.24, "3": 3.49, "4": 2.69, "5": 3.39}, + "Nonflavanoid phenols": { + "1": 0.26, + "2": 0.3, + "3": 0.24, + "4": 0.39, + "5": 0.34, + }, + "Proanthocyanins": {"1": 1.28, "2": 2.81, "3": 2.18, "4": 1.82, "5": 1.97}, + "Color intensity": {"1": 4.38, "2": 5.68, "3": 7.8, "4": 4.32, "5": 6.75}, + "Hue": {"1": 3.4, "2": 3.17, "3": 3.45, "4": 2.93, "5": 2.85}, + "Proline": {"1": 1050, "2": 1185, "3": 1480, "4": 735, "5": 1450}, + } + ).rename_axis("rownames") + r_dfs = read_rdata(s3, engine="rscript") + + tm.assert_frame_equal(s3_df, r_dfs["wine"].head()) + + +# ENGINE + + +def test_read_rds_df_output(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_dfs = read_rdata(filename, engine="rscript") + + assert isinstance(r_dfs, DataFrame) + + +def test_read_rda_dict_output(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript") + + assert isinstance(r_dfs, dict) + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + +def test_read_wrong_engine(datapath): + with pytest.raises(ValueError, match="not a supported engine"): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + read_rdata(filename, engine="rpy2") + + +# MODE + + +@pytest.mark.slow +def test_read_rds_mode_file(datapath, mode): + filename = datapath("io", "data", "rdata", "ghg_df.rds") + r_df = read_rdata(filename, engine="rscript", mode=mode) + + r_df = adj_int(r_df) + + tm.assert_frame_equal(ghg_df, r_df.tail()) + + +@pytest.mark.slow +def test_read_rda_mode_file(datapath, mode): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript", mode=mode) + + if mode in ["parquet", "feather"]: + (r_dfs["ghg_df"], r_dfs["plants_df"], r_dfs["sea_ice_df"]) = ( + adj_int(r_dfs["ghg_df"]), + adj_int(r_dfs["plants_df"]), + adj_int(r_dfs["sea_ice_df"]), + ) + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_read_wrong_mode(datapath): + with pytest.raises(ValueError, match="not supported value for mode"): + filename = datapath("io", "data", "rdata", "plants_df.rds") + read_rdata(filename, engine="rscript", mode="pickle") + + +# USE_OBJECTS + + +def test_read_select_frames_rda_dfs(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata( + filename, engine="rscript", select_frames=["ghg_df", "sea_ice_df"] + ) + + assert "plants_df" not in list(r_dfs.keys()) + assert "ghg_df" in list(r_dfs.keys()) + assert "sea_ice_df" in list(r_dfs.keys()) + + +def test_read_select_frames_rda_objs(datapath): + filename = datapath("io", "data", "rdata", "env_data_objs.rda") + r_dfs = read_rdata( + filename, + engine="rscript", + select_frames=["ppm_ts", "species_mtx", "plants_arry"], + ) + + assert "species_vec" not in list(r_dfs.keys()) + assert "ghg_df" not in list(r_dfs.keys()) + + assert "ppm_ts" in list(r_dfs.keys()) + assert "species_mtx" in list(r_dfs.keys()) + assert "plants_arry" in list(r_dfs.keys()) + + +def test_read_wrong_select_frames(datapath): + with pytest.raises(TypeError, match="not a valid type for select_frames"): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + read_rdata(filename, engine="rscript", 
select_frames="plants_df") + + +# ROWNAMES + + +def test_read_rownames_true_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="rscript", rownames=True) + + assert r_df.index.name == "rownames" + + +def test_read_rownames_false_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="rscript", rownames=False) + + assert r_df.index.name != "rownames" + + +def test_read_rownames_true_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript", rownames=True) + + assert r_dfs["ghg_df"].index.name == "rownames" + assert r_dfs["plants_df"].index.name == "rownames" + assert r_dfs["sea_ice_df"].index.name == "rownames" + + +def test_read_rownames_false_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript", rownames=False) + + assert r_dfs["ghg_df"].index.name != "rownames" + assert r_dfs["plants_df"].index.name != "rownames" + assert r_dfs["sea_ice_df"].index.name != "rownames" + + +# ENCODING + + +def test_non_utf8_data(datapath, rtype): + filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") + + expected = DataFrame( + { + "número": { + "1": 1, + "2": 2, + "3": 3, + "4": 4, + "5": 5, + "6": 6, + "7": 7, + "8": 8, + "9": 9, + "10": 10, + }, + "punto central del climatismo": { + "1": "Parada de la circulación de vuelco meridional del Atlántico", + "2": "Desintegración de la capa de hielo de la Antártida occidental", + "3": "Muerte de la selva amazónica", + "4": "Cambio de monzón en África occidental", + "5": "Permafrost e hidratos de metano", + "6": "Muerte de los arrecifes de coral", + "7": "Cambio de monzón de la India", + "8": "Desintegración de la capa de hielo de Groenlandia", + "9": "Desplazamiento del bosque boreal", + "10": "Reducción del hielo marino del Ártico ", + }, + }, + index=[str(i) for i in range(1, 11)], + ).rename_axis("rownames") + + rdfs = read_rdata(filename, engine="rscript", encoding="iso-8859-1", mode="csv") + + output = rdfs["climate_df"] if rtype == "rda" else rdfs + + tm.assert_frame_equal(output, expected) + + +# RDA WRITER + +# PATH_OR_BUFFER + + +@pytest.mark.slow +def test_write_read_file(datapath, rtype, mode): + with tm.ensure_clean("test.out") as path: + ghg_df.to_rdata( + path, file_format=rtype, engine="rscript", mode=mode, index=False + ) + r_dfs = read_rdata( + path, file_format=rtype, engine="rscript", mode=mode, rownames=False + ) + + expected = ghg_df.reset_index(drop=True) + output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + output["year"] = output["year"].astype("int64") + + tm.assert_frame_equal(output, expected) + + +@pytest.mark.slow +def test_write_read_bytes_io(datapath, rtype, mode): + with BytesIO() as b_io: + sea_ice_df.to_rdata( + b_io, file_format=rtype, engine="rscript", mode=mode, index=False + ) + r_dfs = read_rdata( + b_io.getvalue(), + file_format=rtype, + engine="rscript", + mode=mode, + rownames=False, + ) + + expected = sea_ice_df.reset_index(drop=True) + output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + output["year"] = output["year"].astype("int64") + output["mo"] = output["mo"].astype("int64") + + tm.assert_frame_equal(output, expected) + + +# FILE_FORMAT + + +def test_write_rda_file(rtype): + expected = """\ +$pandas_dataframe + rownames year mo data.type region extent area +1 1012 2016 12 Goddard S 8.28 5.51 +2 1013 
2017 12 Goddard S 9.48 6.23 +3 1014 2018 12 Goddard S 9.19 5.59 +4 1015 2019 12 Goddard S 9.41 6.59 +5 1016 2020 12 NRTSI-G S 10.44 6.50 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + sea_ice_df.to_rdata(out_file, file_format=rtype, engine="rscript") + + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_wrong_format(): + with tm.ensure_clean("test.rda") as path: + with pytest.raises(ValueError, match=("not a valid value for file_format")): + ghg_df.to_rdata(path, engine="rscript", file_format="csv") + + +def test_write_unable_to_infer(): + with tm.ensure_clean("test") as path: + with pytest.raises( + ValueError, match=("Unable to infer file format from file name") + ): + ghg_df.to_rdata(path, engine="rscript") + + +# ENGINE + + +@td.skip_if_no("pyreadr") +def test_write_engine_consistency(rtype): + expected = """\ +$pandas_dataframe + rownames plant_group status count +1 16 Pteridophytes Data Deficient 398 +2 17 Pteridophytes Extinct 65 +3 18 Pteridophytes Not Threatened 1294 +4 19 Pteridophytes Possibly Threatened 408 +5 20 Pteridophytes Threatened 1275 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + plants_df.to_rdata(out_file, file_format=rtype, engine="pyreadr") + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + pyr_output = run_rscript(cmds) + + plants_df.to_rdata(out_file, file_format=rtype, engine="rscript") + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + rcomp_output = run_rscript(cmds) + + assert pyr_output == expected + assert pyr_output == rcomp_output + + +def test_write_wrong_engine(): + with tm.ensure_clean("test.rda") as path: + with pytest.raises(ValueError, match=("not a supported engine")): + ghg_df.to_rdata(path, engine="rpy2") + + +# MODE + + +@pytest.mark.slow +def test_write_mode(rtype, mode): + expected = """\ +$pandas_dataframe + rownames gas year emissions +1 141 Carbon dioxide 2018 5424.8815 +2 142 Methane 2018 634.4571 +3 143 Nitrous oxide 2018 434.5286 +4 144 Fluorinated gases 2018 182.7824 +5 145 Total 2018 6676.6496 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + ghg_df.to_rdata(out_file, file_format=rtype, engine="rscript", mode=mode) + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_wrong_mode(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises(ValueError, match=("not supported value for mode")): + ghg_df.to_rdata(path, engine="rscript", mode="pickle") + + +# INDEX + + +@pytest.mark.slow +def test_write_index_false(rtype, mode): + expected = """\ +$pandas_dataframe + gas year emissions +1 Carbon dioxide 2018 5424.8815 +2 Methane 2018 634.4571 +3 Nitrous oxide 2018 434.5286 +4 Fluorinated gases 2018 182.7824 +5 Total 2018 6676.6496 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + ghg_df.to_rdata( + out_file, file_format=rtype, index=False, engine="rscript", mode=mode + ) + + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +# ASCII + + +@pytest.mark.slow +def 
test_write_ascii_output(rtype, mode, ascii): + expected = """\ +$pandas_dataframe + rownames gas year emissions +1 141 Carbon dioxide 2018 5424.8815 +2 142 Methane 2018 634.4571 +3 143 Nitrous oxide 2018 434.5286 +4 144 Fluorinated gases 2018 182.7824 +5 145 Total 2018 6676.6496 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + ghg_df.to_rdata( + out_file, file_format=rtype, engine="rscript", mode=mode, ascii=ascii + ) + + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_read_ascii(rtype): + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + + ghg_df.to_rdata( + out_file, + file_format=rtype, + engine="rscript", + index=False, + ascii=True, + compress=False, + ) + + with open(out_file) as f: + r_dfs = read_rdata(f, file_format=rtype, engine="rscript", rownames=False) + + expected = ghg_df.reset_index(drop=True) + output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + output["year"] = output["year"].astype("int64") + + tm.assert_frame_equal(output, expected) + + +# COMPRESS + + +@pytest.mark.slow +def test_write_compress_types(rtype, mode, comp): + expected = """\ +$pandas_dataframe + rownames year mo data.type region extent area +1 1012 2016 12 Goddard S 8.28 5.51 +2 1013 2017 12 Goddard S 9.48 6.23 +3 1014 2018 12 Goddard S 9.19 5.59 +4 1015 2019 12 Goddard S 9.41 6.59 +5 1016 2020 12 NRTSI-G S 10.44 6.50 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + sea_ice_df.to_rdata( + out_file, file_format=rtype, engine="rscript", mode=mode, compress=comp + ) + + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_wrong_comp(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises(ValueError, match=("not a supported value for compress")): + ghg_df.to_rdata(path, engine="rscript", compress="zip") + + +def test_write_none_comp(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises(RScriptError, match=("invalid 'compress' argument")): + ghg_df.to_rdata(path, engine="rscript", compress=None) + + +# OTHER_FRAMES + + +@pytest.mark.slow +def test_write_other_frames(mode): + expected = """\ +$ghg_df + rownames gas year emissions +1 141 Carbon dioxide 2018 5424.8815 +2 142 Methane 2018 634.4571 +3 143 Nitrous oxide 2018 434.5286 +4 144 Fluorinated gases 2018 182.7824 +5 145 Total 2018 6676.6496 + +$plants_df + rownames plant_group status count +1 16 Pteridophytes Data Deficient 398 +2 17 Pteridophytes Extinct 65 +3 18 Pteridophytes Not Threatened 1294 +4 19 Pteridophytes Possibly Threatened 408 +5 20 Pteridophytes Threatened 1275 + +$sea_ice_df + rownames year mo data.type region extent area +1 1012 2016 12 Goddard S 8.28 5.51 +2 1013 2017 12 Goddard S 9.48 6.23 +3 1014 2018 12 Goddard S 9.19 5.59 +4 1015 2019 12 Goddard S 9.41 6.59 +5 1016 2020 12 NRTSI-G S 10.44 6.50 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.rda") + r_code = os.path.join(tmp_dir, "r_test.R") + + ghg_df.to_rdata( + out_file, + engine="rscript", + other_frames=[plants_df, sea_ice_df], + rda_names=["ghg_df", "plants_df", "sea_ice_df"], + mode=mode, + ) + + cmds = ["Rscript", r_code, out_file, "rda", ""] + output = run_rscript(cmds) + + assert output == 
expected + + +def test_write_other_frames_wrong_type(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises( + TypeError, match=("objects in other_frames is not a DataFrame") + ): + ghg_df.to_rdata( + path, engine="rscript", other_frames=plants_df, rda_names=["plants_df"] + ) + + +def test_write_read_other_frames(datapath): + with tm.ensure_clean("test.rda") as path: + ghg_df.to_rdata( + path, + engine="rscript", + other_frames=[plants_df, sea_ice_df], + rda_names=["ghg_df", "plants_df", "sea_ice_df"], + ) + r_dfs = read_rdata(path, engine="rscript") + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + +# RDA NAMES + + +def test_write_mismatched_names_frames(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises( + ValueError, + match=("does not match number of current DataFrame and other_frames"), + ): + ghg_df.to_rdata( + path, + engine="rscript", + other_frames=[plants_df, sea_ice_df], + rda_names=["plants_df", "sea_ice_df"], + ) diff --git a/requirements-dev.txt b/requirements-dev.txt index 1817d79f96139..5325ffb3c52b3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -74,6 +74,7 @@ sqlalchemy xarray cftime pyreadstat +pyreadr tabulate>=0.8.3 natsort git+https://github.com/pydata/pydata-sphinx-theme.git@master From 3379fa1e562c0bd17de9d645abb58047513ec29f Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 11 Apr 2021 16:17:33 -0500 Subject: [PATCH 2/8] Fix skipif logic for test params, move package checks, add to test_api --- pandas/io/rdata.py | 22 +++++++++++++++++ pandas/tests/api/test_api.py | 1 + pandas/tests/io/rdata/test_rscript.py | 35 ++++++--------------------- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index ffd726e8cfbff..72c207b6adb88 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -72,6 +72,28 @@ def _executable_exists(name) -> bool: RSCRIPT_EXISTS = _executable_exists("Rscript") +def r_package_installed(name): + """ + Check if R package is installed. + + Method runs a quick command line call to Rscript to + check if library call succeeds on named package. 
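+
+    Usage sketch (assumes an Rscript executable is on PATH; the
+    package name is illustrative):
+
+    >>> r_package_installed("arrow")  # doctest: +SKIP
+    True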
+ """ + + p = subprocess.Popen( + ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, err = p.communicate() + + return len(err) == 0 + + +R_ARROW = r_package_installed("arrow") if RSCRIPT_EXISTS else None +R_RSQLITE = r_package_installed("RSQLite") if RSCRIPT_EXISTS else None + + @doc(storage_options=_shared_docs["storage_options"]) def read_rdata( path_or_buffer: FilePathOrBuffer, diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 11bb554a0dc5a..d9934c89e9cf5 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -162,6 +162,7 @@ class TestPDApi(Base): "read_xml", "read_json", "read_pickle", + "read_rdata", "read_sas", "read_sql", "read_sql_query", diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py index 660187d3c2a1b..8050ab2baaca2 100644 --- a/pandas/tests/io/rdata/test_rscript.py +++ b/pandas/tests/io/rdata/test_rscript.py @@ -12,6 +12,8 @@ import pandas._testing as tm from pandas.io.rdata import ( + R_ARROW, + R_RSQLITE, RSCRIPT_EXISTS, RScriptError, read_rdata, @@ -19,6 +21,8 @@ pytestmark = pytest.mark.skipif(not RSCRIPT_EXISTS, reason="R is not installed.") +PYARROW = import_optional_dependency("pyarrow") + ghg_df = DataFrame( { "gas": { @@ -83,24 +87,6 @@ ).rename_axis("rownames") -def r_package_installed(name): - """ - Check if R package is installed. - - Method runs a quick command line call to Rscript to - check if library call succeeds on named package. - """ - - p = subprocess.Popen( - ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - out, err = p.communicate() - - return len(err) == 0 - - def run_rscript(cmds) -> str: """ Run R script at command line. @@ -154,11 +140,6 @@ def handle_index_rownames(df): return df -R_ARROW = r_package_installed("arrow") -R_RSQLITE = r_package_installed("RSQLite") -PYARROW = import_optional_dependency("pyarrow") - - @pytest.fixture(params=["rda", "rds"]) def rtype(request): return request.param @@ -170,22 +151,20 @@ def rtype(request): pytest.param( "parquet", marks=pytest.mark.skipif( - R_ARROW is None or PYARROW is None, + not R_ARROW or not PYARROW, reason="R arrow or pyarrow not installed", ), ), pytest.param( "feather", marks=pytest.mark.skipif( - R_ARROW is None or PYARROW is None, + not R_ARROW or not PYARROW, reason="R arrow or pyarrow not installed", ), ), pytest.param( "sqlite", - marks=pytest.mark.skipif( - R_RSQLITE is None, reason="R RSQLite not installed" - ), + marks=pytest.mark.skipif(not R_RSQLITE, reason="R RSQLite not installed"), ), ] ) From 966cb789ec5abae2ce5cb7555b9347c6a792fb24 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 11 Apr 2021 19:11:56 -0500 Subject: [PATCH 3/8] Refactor from built-in filter, add encoding to subprocess and locale skip --- pandas/io/rdata.py | 43 ++++++++------------------- pandas/tests/io/rdata/test_rscript.py | 38 ++++++++++++++++++----- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index 72c207b6adb88..f9f800128ef5d 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -72,28 +72,6 @@ def _executable_exists(name) -> bool: RSCRIPT_EXISTS = _executable_exists("Rscript") -def r_package_installed(name): - """ - Check if R package is installed. - - Method runs a quick command line call to Rscript to - check if library call succeeds on named package. 
- """ - - p = subprocess.Popen( - ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - out, err = p.communicate() - - return len(err) == 0 - - -R_ARROW = r_package_installed("arrow") if RSCRIPT_EXISTS else None -R_RSQLITE = r_package_installed("RSQLite") if RSCRIPT_EXISTS else None - - @doc(storage_options=_shared_docs["storage_options"]) def read_rdata( path_or_buffer: FilePathOrBuffer, @@ -640,13 +618,14 @@ def run_rscript(self, tmp_dir, r_batch, cmds) -> str: stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + encoding=self.encoding, cwd=tmp_dir, ) output, error = p.communicate() if len(error) != 0: - raise RScriptError(error.decode(self.encoding)) + raise RScriptError(error) - return output.decode(self.encoding) + return output def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: self.r_to_py_types = { @@ -750,10 +729,11 @@ def read_rdata_csv(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + output = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = {} - for oline in filter(None, output.strip().split("\n")): + for oline in output: with open( os.path.join(tmp_dir, f"meta_{oline}.txt"), encoding=self.encoding, @@ -821,11 +801,12 @@ def read_rdata_feather(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + output = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = { oline: read_feather(os.path.join(tmp_dir, f"data_{oline}.feather")) - for oline in filter(None, output.strip().split("\n")) + for oline in output } return dfs @@ -870,11 +851,12 @@ def read_rdata_parquet(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + output = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = { oline: read_parquet(os.path.join(tmp_dir, f"data_{oline}.parquet")) - for oline in filter(None, output.strip().split("\n")) + for oline in output } return dfs @@ -923,12 +905,12 @@ def read_rdata_sqlite(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + output = [i for i in output.strip().split("\n") if i != ""] oline: str conn = sqlite3.connect(r_db) dfs: Dict[str, DataFrame] = { - oline: read_sql(f"SELECT * FROM data_{oline}", conn) - for oline in filter(None, output.strip().split("\n")) + oline: read_sql(f"SELECT * FROM data_{oline}", conn) for oline in output } conn.close() @@ -1375,11 +1357,12 @@ def run_rscript(self, tmp_dir, r_batch, cmds) -> None: stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + encoding=self.encoding, cwd=tmp_dir, ) output, error = a.communicate() if len(error) != 0: - raise RScriptError(error.decode(self.encoding)) + raise RScriptError(error) return None diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py index 8050ab2baaca2..b003c16f3abb3 100644 --- a/pandas/tests/io/rdata/test_rscript.py +++ b/pandas/tests/io/rdata/test_rscript.py @@ -12,8 +12,6 @@ import pandas._testing as tm from pandas.io.rdata import ( - R_ARROW, - R_RSQLITE, RSCRIPT_EXISTS, RScriptError, read_rdata, @@ -21,8 +19,6 @@ pytestmark = 
pytest.mark.skipif(not RSCRIPT_EXISTS, reason="R is not installed.") -PYARROW = import_optional_dependency("pyarrow") - ghg_df = DataFrame( { "gas": { @@ -112,13 +108,40 @@ def run_rscript(cmds) -> str: f.write(r_batch) p = subprocess.Popen( - cmds, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE + cmds, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + encoding="UTF-8", ) output, error = p.communicate() if len(error) != 0: - raise ValueError(error.decode("UTF-8")) + raise ValueError(error) + + return output + + +def r_package_installed(name): + """ + Check if R package is installed. + + Method runs a quick command line call to Rscript to + check if library call succeeds on named package. + """ + + p = subprocess.Popen( + ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, err = p.communicate() + + return len(err) == 0 + - return output.decode("UTF-8") +R_ARROW = r_package_installed("arrow") if RSCRIPT_EXISTS else None +R_RSQLITE = r_package_installed("RSQLite") if RSCRIPT_EXISTS else None +PYARROW = import_optional_dependency("pyarrow", errors="ignore") def adj_int(df): @@ -547,6 +570,7 @@ def test_read_rownames_false_rda(datapath): # ENCODING +@td.skip_if_not_us_locale def test_non_utf8_data(datapath, rtype): filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") From 22c7ade5fa905687d2d6367ff73cab199754deb5 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 11 Apr 2021 23:03:04 -0500 Subject: [PATCH 4/8] Fix tests for OS newline and mypy, mark xfail, use default mode in io.rst --- doc/source/user_guide/io.rst | 2 +- pandas/io/rdata.py | 17 +++++----- pandas/tests/io/rdata/test_pyreadr.py | 49 ++++++++++++++++----------- pandas/tests/io/rdata/test_rscript.py | 45 ++++++++++++++---------- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 75a3626ef80b5..f264ec0aba0f6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -6058,7 +6058,7 @@ to read data natively in R and transfer content with several options of ``mode`` .. ipython:: python rds_file = os.path.join(file_path, "plants_df.rds") - plants_df = pd.read_rdata(rds_file, engine="rscript", mode="sqlite").tail() + plants_df = pd.read_rdata(rds_file, engine="rscript", mode="csv").tail() plants_df .. 
note:: diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index f9f800128ef5d..2595149c03444 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -729,11 +729,11 @@ def read_rdata_csv(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output = [i for i in output.strip().split("\n") if i != ""] + output_list = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = {} - for oline in output: + for oline in output_list: with open( os.path.join(tmp_dir, f"meta_{oline}.txt"), encoding=self.encoding, @@ -801,12 +801,12 @@ def read_rdata_feather(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output = [i for i in output.strip().split("\n") if i != ""] + output_list = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = { oline: read_feather(os.path.join(tmp_dir, f"data_{oline}.feather")) - for oline in output + for oline in output_list } return dfs @@ -851,12 +851,12 @@ def read_rdata_parquet(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output = [i for i in output.strip().split("\n") if i != ""] + output_list = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = { oline: read_parquet(os.path.join(tmp_dir, f"data_{oline}.parquet")) - for oline in output + for oline in output_list } return dfs @@ -905,12 +905,13 @@ def read_rdata_sqlite(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output = [i for i in output.strip().split("\n") if i != ""] + output_list = [i for i in output.strip().split("\n") if i != ""] oline: str conn = sqlite3.connect(r_db) dfs: Dict[str, DataFrame] = { - oline: read_sql(f"SELECT * FROM data_{oline}", conn) for oline in output + oline: read_sql(f"SELECT * FROM data_{oline}", conn) + for oline in output_list } conn.close() diff --git a/pandas/tests/io/rdata/test_pyreadr.py b/pandas/tests/io/rdata/test_pyreadr.py index 6aa6840fc0499..a7565eb729a42 100644 --- a/pandas/tests/io/rdata/test_pyreadr.py +++ b/pandas/tests/io/rdata/test_pyreadr.py @@ -114,16 +114,16 @@ def adj_int(df): def test_read_rds_file(datapath): filename = datapath("io", "data", "rdata", "ghg_df.rds") r_df = read_rdata(filename, engine="pyreadr") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(ghg_df, r_df.tail()) + tm.assert_frame_equal(ghg_df, output) def test_read_rda_file(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") r_dfs = read_rdata(filename, engine="pyreadr") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] @@ -136,20 +136,20 @@ def test_bytes_read_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") with open(filename, "rb") as f: - r_df = read_rdata(f.read(), file_format="rds", engine="pyreadr") + r_df = read_rdata(f, file_format="rds", engine="pyreadr") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(sea_ice_df, r_df.tail()) + tm.assert_frame_equal(sea_ice_df, output) def test_bytes_read_rda(datapath): filename = datapath("io", "data", "rdata", 
"env_data_dfs.rda") with open(filename, "rb") as f: - r_dfs = read_rdata(f.read(), file_format="rda", engine="pyreadr") + r_dfs = read_rdata(f, file_format="rda", engine="pyreadr") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] @@ -165,9 +165,9 @@ def test_bytesio_rds(datapath): with BytesIO(f.read()) as b_io: r_df = read_rdata(b_io, file_format="rds", engine="pyreadr") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(sea_ice_df, r_df.tail()) + tm.assert_frame_equal(sea_ice_df, output) def test_bytesio_rda(datapath): @@ -177,7 +177,7 @@ def test_bytesio_rda(datapath): with BytesIO(f.read()) as b_io: r_dfs = read_rdata(b_io, file_format="rda", engine="pyreadr") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] @@ -238,7 +238,7 @@ def test_bytes_read_infer_rds(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f.read(), engine="pyreadr") + read_rdata(f, engine="pyreadr") def test_bytes_read_infer_rda(datapath): @@ -246,7 +246,7 @@ def test_bytes_read_infer_rda(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f.read(), engine="pyreadr") + read_rdata(f, engine="pyreadr") # URL @@ -370,7 +370,11 @@ def test_read_select_frames_rda_dfs(datapath): def test_read_wrong_select_frames(datapath): with pytest.raises(TypeError, match="not a valid type for select_frames"): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - read_rdata(filename, engine="pyreadr", select_frames="plants_df") + read_rdata( + filename, + engine="pyreadr", + select_frames="plants_df", # type: ignore[arg-type] + ) # ROWNAMES @@ -380,14 +384,16 @@ def test_read_rownames_true_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") r_df = read_rdata(filename, engine="pyreadr", rownames=True) - assert r_df.index.name == "rownames" + if isinstance(r_df, DataFrame): + assert r_df.index.name == "rownames" def test_read_rownames_false_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") r_df = read_rdata(filename, engine="pyreadr", rownames=False) - assert r_df.index.name != "rownames" + if isinstance(r_df, DataFrame): + assert r_df.index.name != "rownames" def test_read_rownames_true_rda(datapath): @@ -457,7 +463,10 @@ def test_write_read_filelike(rtype): with BytesIO() as b_io: sea_ice_df.to_rdata(b_io, file_format=rtype, engine="pyreadr", index=False) r_dfs = read_rdata( - b_io.getvalue(), file_format=rtype, engine="pyreadr", rownames=False + b_io.getvalue(), # type: ignore[arg-type] + file_format=rtype, + engine="pyreadr", + rownames=False, ) expected = sea_ice_df.reset_index(drop=True) @@ -511,7 +520,8 @@ def test_index_true(rtype): r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - assert "index" in r_df.columns + if isinstance(r_df, DataFrame): + assert "index" in r_df.columns def test_index_false(rtype): @@ -523,7 +533,8 @@ def test_index_false(rtype): r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - assert "index" not in r_df.columns + if isinstance(r_df, DataFrame): + assert "index" not in r_df.columns # ASCII diff --git a/pandas/tests/io/rdata/test_rscript.py 
b/pandas/tests/io/rdata/test_rscript.py index b003c16f3abb3..95c0a6714c645 100644 --- a/pandas/tests/io/rdata/test_rscript.py +++ b/pandas/tests/io/rdata/test_rscript.py @@ -214,7 +214,8 @@ def test_read_rds_file(datapath): filename = datapath("io", "data", "rdata", "ghg_df.rds") r_df = read_rdata(filename, engine="rscript") - tm.assert_frame_equal(ghg_df, r_df.tail()) + if isinstance(r_df, DataFrame): + tm.assert_frame_equal(ghg_df, r_df.tail()) def test_read_rda_file(datapath): @@ -234,18 +235,18 @@ def test_buffer_read_rds(datapath): with open(filename, "rb") as f: r_df = read_rdata(f, file_format="rds", engine="rscript") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(sea_ice_df, r_df.tail()) + tm.assert_frame_equal(sea_ice_df, output) def test_bytes_read_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") with open(filename, "rb") as f: - r_dfs = read_rdata(f.read(), file_format="rda", engine="rscript") + r_dfs = read_rdata(f, file_format="rda", engine="rscript") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] @@ -261,9 +262,9 @@ def test_bytesio_rds(datapath): with BytesIO(f.read()) as b_io: r_df = read_rdata(b_io, file_format="rds", engine="rscript") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(sea_ice_df, r_df.tail()) + tm.assert_frame_equal(sea_ice_df, output) def test_bytesio_rda(datapath): @@ -273,7 +274,7 @@ def test_bytesio_rda(datapath): with BytesIO(f.read()) as b_io: r_dfs = read_rdata(b_io, file_format="rda", engine="rscript") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] @@ -340,7 +341,7 @@ def test_bytes_read_infer_rds(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f.read(), engine="rscript") + read_rdata(f, engine="rscript") def test_bytes_read_infer_rda(datapath): @@ -348,7 +349,7 @@ def test_bytes_read_infer_rda(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f.read(), engine="rscript") + read_rdata(f, engine="rscript") # URL @@ -466,9 +467,9 @@ def test_read_rds_mode_file(datapath, mode): filename = datapath("io", "data", "rdata", "ghg_df.rds") r_df = read_rdata(filename, engine="rscript", mode=mode) - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(ghg_df, r_df.tail()) + tm.assert_frame_equal(ghg_df, output) @pytest.mark.slow @@ -529,7 +530,11 @@ def test_read_select_frames_rda_objs(datapath): def test_read_wrong_select_frames(datapath): with pytest.raises(TypeError, match="not a valid type for select_frames"): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - read_rdata(filename, engine="rscript", select_frames="plants_df") + read_rdata( + filename, + engine="rscript", + select_frames="plants_df", # type: ignore[arg-type] + ) # ROWNAMES @@ -539,14 +544,16 @@ def test_read_rownames_true_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") r_df = read_rdata(filename, engine="rscript", rownames=True) - assert r_df.index.name == "rownames" + if isinstance(r_df, DataFrame): + assert r_df.index.name == "rownames" def test_read_rownames_false_rds(datapath): filename = datapath("io", 
"data", "rdata", "sea_ice_df.rds") r_df = read_rdata(filename, engine="rscript", rownames=False) - assert r_df.index.name != "rownames" + if isinstance(r_df, DataFrame): + assert r_df.index.name != "rownames" def test_read_rownames_true_rda(datapath): @@ -570,7 +577,9 @@ def test_read_rownames_false_rda(datapath): # ENCODING -@td.skip_if_not_us_locale +@pytest.mark.xfail( + reason="R encoding is locale specific. Need to think about workaround." +) def test_non_utf8_data(datapath, rtype): filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") @@ -640,7 +649,7 @@ def test_write_read_bytes_io(datapath, rtype, mode): b_io, file_format=rtype, engine="rscript", mode=mode, index=False ) r_dfs = read_rdata( - b_io.getvalue(), + b_io.getvalue(), # type: ignore[arg-type] file_format=rtype, engine="rscript", mode=mode, @@ -835,7 +844,7 @@ def test_write_read_ascii(rtype): compress=False, ) - with open(out_file) as f: + with open(out_file, newline="") as f: r_dfs = read_rdata(f, file_format=rtype, engine="rscript", rownames=False) expected = ghg_df.reset_index(drop=True) From 8b1aa9c4513ec1f5cf3bcd5cac8e949e61e5abfa Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 12 Apr 2021 08:03:35 -0500 Subject: [PATCH 5/8] Added needed test skips and fixed io docs ref in whatsnew --- doc/source/whatsnew/v1.3.0.rst | 4 ++-- pandas/tests/io/rdata/test_pyreadr.py | 4 +++- pandas/tests/io/rdata/test_rscript.py | 5 ++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index a039c2d959f02..6b5e7d3eccd15 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -115,7 +115,7 @@ For more, see :ref:`io.xml` in the user guide on IO tools. Read and write R data files ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We added I/O support to read and write R data files (.rda, .Rdata, .rds) using +We added I/O support to read and write R data files (.RData, .rda, .rds) using :func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Equipped with two engines, `pyreadr`_ and command line caller, `rscript`_, these methods will maintain fast and durable support for open source data migration between R and Python. (:issue:`40287`) @@ -211,7 +211,7 @@ Even exported back out to R data files: ...: rda_names=["ppm_df", "species_mtx"] ...: ) -For more, see :ref:`io.read_rdata` in the user guide on IO tools. +For more, see :ref:`io.rdata` in the user guide on IO tools. 
Styler Upgrades ^^^^^^^^^^^^^^^ diff --git a/pandas/tests/io/rdata/test_pyreadr.py b/pandas/tests/io/rdata/test_pyreadr.py index a7565eb729a42..fbcc9b06523fc 100644 --- a/pandas/tests/io/rdata/test_pyreadr.py +++ b/pandas/tests/io/rdata/test_pyreadr.py @@ -4,6 +4,8 @@ import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame import pandas._testing as tm @@ -296,7 +298,7 @@ def test_read_wrong_url(): @tm.network -@pytest.mark.slow +@td.skip_if_no("s3fs") def test_read_rda_s3(): s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" s3_df = DataFrame( diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py index 95c0a6714c645..5df2b499a66a3 100644 --- a/pandas/tests/io/rdata/test_rscript.py +++ b/pandas/tests/io/rdata/test_rscript.py @@ -316,6 +316,7 @@ def test_read_rda_non_dfs(datapath, mode): read_rdata(filename, engine="rscript", mode=mode) +@td.skip_if_not_us_locale def test_read_not_rda_file(datapath, mode): with pytest.raises(RScriptError, match="bad restore file magic number"): read_rdata( @@ -326,6 +327,7 @@ def test_read_not_rda_file(datapath, mode): ) +@td.skip_if_not_us_locale def test_read_not_rds_file(datapath, mode): with pytest.raises(RScriptError, match="unknown input format"): read_rdata( @@ -399,7 +401,7 @@ def test_read_wrong_url(): @tm.network -@pytest.mark.slow +@td.skip_if_no("s3fs") def test_read_rda_s3(): s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" s3_df = DataFrame( @@ -831,6 +833,7 @@ def test_write_ascii_output(rtype, mode, ascii): assert output == expected +@td.skip_if_windows def test_write_read_ascii(rtype): with tm.ensure_clean_dir() as tmp_dir: out_file = os.path.join(tmp_dir, "rdata.out") From 2341dffb0a20a4307c9ca9d6bf23d468c3b62680 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 13 Apr 2021 22:57:09 -0500 Subject: [PATCH 6/8] Remove rscript implementation from code, tests, and docs --- ci/deps/actions-37.yaml | 1 + ci/deps/azure-macos-37.yaml | 1 + ci/deps/azure-windows-37.yaml | 1 + doc/source/getting_started/install.rst | 1 - doc/source/user_guide/io.rst | 165 +-- doc/source/whatsnew/v1.3.0.rst | 28 +- pandas/core/frame.py | 141 +- pandas/io/rdata.py | 1308 +---------------- pandas/tests/io/rdata/test_rscript.py | 987 ------------- .../{rdata/test_pyreadr.py => test_rdata.py} | 176 +-- 10 files changed, 195 insertions(+), 2614 deletions(-) delete mode 100644 pandas/tests/io/rdata/test_rscript.py rename pandas/tests/io/{rdata/test_pyreadr.py => test_rdata.py} (73%) diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index 61f431256dd4a..9292e2aa7db39 100644 --- a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -25,4 +25,5 @@ dependencies: - flask - tabulate - pyreadstat + - pyreadr - pip diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index d667adddda859..f39f63c66d102 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -33,4 +33,5 @@ dependencies: - pip: - cython>=0.29.21 - pyreadstat + - pyreadr - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index e7ac4c783b855..e9707030a4def 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -37,6 +37,7 @@ dependencies: - xlsxwriter - xlwt - pyreadstat + - pyreadr - pip - pip: - pyxlsb diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 8bdd01007516b..14379fc0dd0da 100644 ---
a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -361,7 +361,6 @@ fastparquet 0.4.0 Parquet reading / writing pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pyreadstat SPSS files (.sav) reading pyreadr R files (.RData, .rda, .rds) reading / writing -Rscript R files (.RData, .rda, .rds) reading / writing ========================= ================== ============================================================= Access data in the cloud diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f264ec0aba0f6..0023f70e699bd 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5925,13 +5925,12 @@ For .rds types that only contains a single R object, method will return a single .. note:: - Since *any* R object can be saved in these types, this method will only return + Since any R object can be saved in these types, this method will only return data.frame objects or objects coercible to data.frames including matrices, - tibbles, and data.tables even 3D arrays. Depending on engine used, either - an error raises for non-data.frame objects or such objects are ignored. + tibbles, and data.tables, and to some extent arrays. -For example, consider the following generated data.frames in R using samples from -US EPA, UK BGCI, and NOAA pubilc data: +For example, consider the following generated data.frames in R using environment +data samples from US EPA, UK BGCI, and NOAA public data: .. code-block:: r @@ -5974,7 +5973,7 @@ US EPA, UK BGCI, and NOAA pubilc data: save(ghg_df, plants_df, sea_ice_df, file="env_data_dfs.rda") -Then in pandas you can read the .rds or .rda files: +With ``read_rdata``, you can read the .rds or .rda files above: .. ipython:: python :suppress: @@ -5990,7 +5989,7 @@ Then in pandas you can read the .rds or .rda files: rda_file = os.path.join(file_path, "env_data_dfs.rda") env_dfs = pd.read_rdata(rda_file) - env_dfs + {k: df.tail() for k, df in env_dfs.items()} To ignore the rownames of data.frame, use option ``rownames=False``: .. ipython:: python @@ -6009,16 +6008,6 @@ To select specific objects in .rda, pass a list of names into ``select_frames``: env_dfs = pd.read_rdata(rda_file, select_frames=["sea_ice_df"]) env_dfs -To read from URL, pass link directly into method: - -.. ipython:: python - - url = ("https://github.com/hadley/nycflights13/" - "blob/master/data/airlines.rda?raw=true") - - airlines = pd.read_rdata(url, file_format="rda") - airlines - To read from a file-like object, read object in argument, ``path_or_buffer``: .. ipython:: python @@ -6029,42 +6018,34 @@ To read from a file-like object, read object in argument, ``path_or_buffer``: sea_ice_df -With ``rscript`` as ``engine``, a direct command line call to Rscript is run -to read data natively in R and transfer content with several options of ``mode``. - -.. note:: +To read from a URL, pass the link directly into the method: - If you do not have R installed and attempt to use the ``rscript`` ``engine``, - then an ``ImportError`` will raise. Do note: Rscript must be recognized as a - top-level command on machine. Hence, R's bin folder must be in Path environment - variable for the OS. If Rscript is not recognized even if you have R installed, - you will receive same ``ImportError``. +.. ipython:: python -- For the ``csv`` mode (default), no other package in R is required. - Data types are adhered in this data exchange following a text approach.
+ url = ("https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true") -- For the ``feather`` mode, the ``arrow`` package in R must be installed. - Additionally, the counterpart ``pyarrow`` package in Python must be - installed. This binary approach allows faster data exchange than text approach. + airlines = pd.read_rdata(url, file_format="rda") + airlines -- For the ``parquet`` mode, again the ``arrow`` package in R must be installed. - and again ``pyarrow`` package in Python must be installed. Similarly, this - binary approach allows faster data exchange than text approach. +To read from an Amazon S3 bucket, point to the storage path. This also raises +another issue. Any R data encoded in non utf-8 is currently not supported: -- For the ``sqlite`` mode, the ``RSQLite`` package in R (part of DBI family of - database APIs) must be installed with no additional package needed for Python. - This database approach ensures data type integrity. +.. code-block:: ipython -.. ipython:: python + In [608]: ghcran = pd.read_rdata("s3://public-r-data/ghcran.Rdata") + ... + UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 45: +invalid continuation byte - rds_file = os.path.join(file_path, "plants_df.rds") - plants_df = pd.read_rdata(rds_file, engine="rscript", mode="csv").tail() - plants_df +Also, remember if R data files do not contain any data frame object, a parsing error +will occur: -.. note:: +.. code-block:: ipython - The above selected options for ``mode`` will not generate such formats but - uses them under the hood in disk transfer of data between R and Python. + In [608]: rds_file = os.path.join(file_path, "env_data_non_dfs.rda") + ... + LibrdataError: Invalid file, or file has unsupported features .. _io.rdata_writer: @@ -6075,80 +6056,90 @@ Writing R data .. versionadded:: 1.3.0 The method :func:`~pandas.core.frame.DataFrame.to_rdata` will write a DataFrame -or multiple DataFrames into R data files (.Rdata, .rda, and .rds). +or multiple DataFrames into R data files (.RData, .rda, and .rds). -For single object in rds type: +For a single DataFrame in rds type, pass in a file or buffer in method: .. ipython:: python plants_df.to_rdata("plants_df.rds") -For multiple objects in RData or rda types using the ``rscript`` engine, -use the ``other_frames`` argument and be sure to provide ``rda_names`` for all -DataFrames: +For a single DataFrame in RData or rda types, pass in a file or buffer in method +and optionally give it a name: .. ipython:: python - plants_df.to_rdata( - "env_dfs.rda", - engine="rscript", - other_frames=[ghg_df, sea_ice_df], - rda_names=["plants_df", "ghg_df", "sea_ice_df"] - ) + plants_df.to_rdata("plants_df.rda", rda_name="plants_df") -With either engine, pandas index will not map into R rownames. Using the default -``index=True`` will output an index column or multiple columns for MultiIndex. +While RData and rda types can hold multiple R objects, this method currently +only supports writing out a single DataFrame. + +Even write to a buffer and read its content: .. 
ipython:: python - (ghg_df.rename_axis(None) - .to_rdata("ghg_df.rds", engine="rscript") - ) - pd.read_rdata("ghg_df.rds").tail() + with BytesIO() as b_io: + sea_ice_df.to_rdata(b_io, file_format="rda", index=False) + print( + pd.read_rdata( + b_io.getvalue(), + file_format="rda", + rownames=False, + )["pandas_dataframe"].tail() + ) -Otherwise, use ``index=False``: +While the DataFrame index will not map into R rownames, the default ``index=True`` +will output the index as a named column, or multiple columns for a MultiIndex. .. ipython:: python - (ghg_df.rename_axis(None) - .to_rdata("ghg_df.rds", engine="rscript", index=False) - ) + ghg_df.rename_axis(None).to_rdata("ghg_df.rds") + pd.read_rdata("ghg_df.rds").tail() -With both engines, the default compression of R data files will be ``gzip``. -Notice the different sizes of compressed and uncompressed files: +To ignore the index, use ``index=False``: .. ipython:: python - plants_df.to_rdata("plants_df_uncomp.rds", compress=False) - - os.stat("plants_df.rds").st_size - os.stat("plants_df_uncomp.rds").st_size + ghg_df.rename_axis(None).to_rdata("ghg_df.rds", index=False) + pd.read_rdata("ghg_df.rds").tail() -The ``rscript`` engine supports all listed compression types including: -``gzip``, ``bzip2``, and ``xz``. +By default, these R serialized types are compressed files using either the gzip, +bzip2, or xz algorithm. Similarly to R, the default in this method is "gzip" or +"gz". Notice the difference in sizes between compressed and uncompressed files: -Additionally, with ``rscript`` engine, data files can be written in ascii (text) -rather than default binary with ``ascii`` argument: .. ipython:: python - sea_ice_df.to_rdata("sea_ice_df_ascii.rda", engine="rscript", - ascii=True, compress=False) + plants_df.to_rdata("plants_df_gz.rds") + plants_df.to_rdata("plants_df_bz2.rds", compression="bz2") + plants_df.to_rdata("plants_df_xz.rds", compression="xz") + plants_df.to_rdata("plants_df_non_comp.rds", compression=None) + + os.stat("plants_df_gz.rds").st_size + os.stat("plants_df_bz2.rds").st_size + os.stat("plants_df_xz.rds").st_size + os.stat("plants_df_non_comp.rds").st_size - with open("sea_ice_df_ascii.rda", "r") as f: - for i in range(10): - line = next(f).strip() - print(line) +Like other IO methods, ``storage_options`` can be used to write to remote storage +platforms: + +.. code-block:: ipython + + ghg_df.to_rdata( + "s3://path/to/my/storage/pandas_df.rda", + storage_options={"user": "xxx", "password": "???"} + ) .. ipython:: python :suppress: os.remove("ghg_df.rds") os.remove("plants_df.rds") - os.remove("env_dfs.rda") - os.remove("plants_df_uncomp.rds") - os.remove("sea_ice_df_ascii.rda") + os.remove("plants_df.rda") + os.remove("plants_df_gz.rds") + os.remove("plants_df_bz2.rds") + os.remove("plants_df_xz.rds") + os.remove("plants_df_non_comp.rds") Once exported, the single DataFrame can be read back in R or multiple DataFrames loaded in R: @@ -6191,9 +6182,9 @@ loaded in R: 144 Fluorinated gases 2018 182.7824 145 Total 2018 6676.6496 -For more information of ``pyreadr`` engine, see main page of `pyreadr`_ package for -further notes on support and limitations. For more information of R serialization -data types, see docs on `rds`_ and `rda`_ data files. +For more information on the underlying ``pyreadr`` package, see the main page of +`pyreadr`_ for further notes on support and limitations. For more information on R +serialization data types, see the docs on `rds`_ and `rda`_ data files. ..
_pyreadr: https://github.com/ofajardo/pyreadr diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 22f26f19c229e..b02c456a6ac0b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -116,14 +116,13 @@ Read and write R data files ^^^^^^^^^^^^^^^^^^^^^^^^^^^ We added I/O support to read and write R data files (.RData, .rda, .rds) using -:func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Equipped with two engines, -`pyreadr`_ and command line caller, `rscript`_, these methods will maintain fast and -durable support for open source data migration between R and Python. (:issue:`40287`) +:func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Both methods rely on +the `pyreadr`_ package to support open source data migration between R and +Python pandas. (:issue:`40287`) .. _pyreadr: https://github.com/ofajardo/pyreadr -.. _rscript: https://www.rdocumentation.org/packages/utils/versions/3.6.2/topics/Rscript -In R, the below generated data frame and matrix: +For example, consider the below generated data frame and matrix in R: .. code-block:: r @@ -160,11 +159,12 @@ In R, the below generated data frame and matrix: In [3]: saveRDS(ppm_df, "ppm_df_r.rds") In [4]: save(ppm_df, iucn_species_mtx, "env_objs_r.rda") -Can then be read in pandas with either engine: +Now, both R data files can be read in pandas to return either DataFrame +for .rds types or ``dict`` of DataFrames for .RData and .rda types: .. code-block:: ipython - In [1]: ppm_df = pd.read_rdata("ppm_df_r.rds", engine="pyreadr") + In [1]: ppm_df = pd.read_rdata("ppm_df_r.rds") In [2]: ppm_df Out[3]: year month monthly_average num_days st_dev_of_days unc_mon_mean @@ -174,7 +174,7 @@ Can then be read in pandas with either engine: 3 2021 1 415.52 29 0.44 0.16 4 2021 2 416.75 28 1.01 0.36 - In [4]: env_objs = pd.read_rdata("env_objs_r.rda", engine="rscript") + In [4]: env_objs = pd.read_rdata("env_objs_r.rda") Out[5]: {'carbon_ppm_df': year month monthly_average num_days st_dev_of_days unc_mon_mean @@ -185,7 +185,7 @@ Can then be read in pandas with either engine: 4 2021 2 416.75 28 1.01 0.36 [5 rows x 6 columns], - 'species_matrix': + 'iucn_species_mtx': EX EW CR(PE) CR(PEW) CR EN VU DD Total rownames MAGNOLIOPSIDA 102 30 409 29 3770 6972 7089 2990 43885 @@ -199,16 +199,14 @@ Can then be read in pandas with either engine: [8 rows x 9 columns]} -Even exported back out to R data files: +Additionally, pandas data can be written back out into the same R data files: .. code-block:: ipython In [5]: ppm_df.to_rdata("ppm_df_py.rds") - In [6]: ppm_df.to_rdata( - ...: "env_objs_py.rda", - ...: engine="rscript", - ...: other_frames=env_objs["species_matrix"], - ...: rda_names=["ppm_df", "species_mtx"] + In [6]: env_objs['iucn_species_mtx'].to_rdata( + ...: "iucn_species_py.rda", + ...: rda_name="iucn_species_df" ...: ) For more, see :ref:`io.rdata` in the user guide on IO tools. 
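To make the round trip concrete, here is a short sketch that uses only the calls shown above (object and file names are illustrative, and the frame is truncated):

.. code-block:: python

    import pandas as pd

    ppm_df = pd.DataFrame(
        {"year": [2021, 2021], "month": [1, 2], "monthly_average": [415.52, 416.75]}
    )

    # Write a single DataFrame to an rda file under a chosen R object name.
    ppm_df.to_rdata("ppm_df_py.rda", rda_name="ppm_df", index=False)

    # rda/RData files read back as a dict keyed by R object name;
    # an rds file would return the DataFrame directly.
    r_dfs = pd.read_rdata("ppm_df_py.rda", rownames=False)
    print(r_dfs["ppm_df"])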
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b121720744f28..6ea26ee04c307 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2295,18 +2295,13 @@ def to_rdata( self, path_or_buffer: FilePathOrBuffer, file_format: str = "infer", - engine: str = "pyreadr", - mode: str = "csv", - other_frames: list[DataFrame] | None = None, - rda_names: list[str] = ["pandas_dataframe"], + rda_name: str = "pandas_dataframe", index: bool = True, - ascii: bool = False, - compress: bool | str = "gzip", - encoding: str = "utf-8", + compression: CompressionOptions = "gzip", storage_options: StorageOptions = None, ) -> None: """ - Render one or more DataFrames to R data (.rda, .Rdata, .rds). + Render a DataFrame to R data (.RData, .rda, .rds). .. versionadded:: 1.3.0 @@ -2321,58 +2316,24 @@ def to_rdata( single object to disk). Default 'infer' will use extension in file name to determine the format type. - engine : {{'pyreadr', 'rscript'}}, default 'pyreadr' - Engine used to write to R data files. Currently, two types are - supported: ``pyreadr`` which requires the pyreadr package to be - installed and ``rscript`` which requires R to be installed on machine. - For ``rscript``, be sure the R bin installation folder is included in - the system Path environment variable. The ``pyreadr`` is the faster - parser to handle most needs but ``rscript`` engine provides fuller - support of rda and rds formats since it calls native R commands. - - mode : {{'csv', 'parquet', 'feather'}}, default 'csv' - Python and R I/O transfer mode that only applies to ``rscript`` - engine (ignored for ``pyreadr``). Using ``csv`` (text approach), no - additional packages are required. Using ``parquet`` or ``feather`` - (binary approach) requires pyarrow installed in Python and arrow - package installed in R. Using ``sqlite`` (database approach) requires - RSQLite package installed in R. Binary will usually be faster to process - than text data. Database usually ensures data type integrity. - - other_frames : list, optional - Other DataFrames to be included in rda (not rds) files that can - contain multiple objects. Ignored ``pyreadr`` engine that currently - supports only a single DataFrame written to rda files. - - rda_names : list, default ["pandas_dataframe"] - Names for current and other DataFrames in rda file. The number of names - should equal the number of current DataFrame and ``other_frames``. - For ``pyreadr`` engine that can only write one DataFrame to rda file, - only the first name in list will be used. + rda_name : str, default "pandas_dataframe" + Name for R data.frame in RData/rda file. index : bool, default True Include index or MultiIndex in output as separate columns. Since - DataFrame indexes can include multiple columns and R rownames can only - include one column, neither ``pyreadr`` nor ``rscript`` engines will - map DataFrame index to R data.frame rownames. - - ascii : bool, default False - Write data into ASCII (text) representation. Only supported with - ``rscript`` engine. - - compress : bool or {{'gzip', 'bzip2', 'xz'}}, default 'gzip' - Compression types for R data files. Use False for uncompressed - files. For ``pyreadr`` engine, False and 'gzip' is supported. + DataFrame indexes can include multiple columns and R rownames can + only include one column, the DataFrame index will not map to R + data.frame rownames. - encoding : str, optional, default 'utf-8' - Encoding of R data.
+ compression : {{'gzip', 'bz2', 'xz', None}}, default 'gzip' + Compression type for on-the-fly compression of the output data. {storage_options} Returns ------- None - Either None or ValueError is raised. + None for successful output; otherwise an error is raised. See Also -------- @@ -2406,8 +2367,7 @@ def to_rdata( ... 5 4 Total 2018 6676.65 ... ''' - To save an .rda or .RData file which can contains one or more - DataFrames: + To save an .rda or .RData file: >>> plants_df = pd.DataFrame( ... {{'plant_group': ['Pteridophytes', ... 'Pteridophytes', ... 'Pteridophytes', ... 'Pteridophytes', ... 'Pteridophytes'], ... 'status': ['Data Deficient', ... 'Extinct', ... 'Not Threatened', ... 'Possibly Threatened', ... 'Threatened'], ... 'count': [398, 65, 1294, 408, 1275] ... }}) - >>> sea_ice_df = pd.DataFrame( - ... {{'year': [2016, 2017, 2018, 2019, 2020], - ... 'mo': [12, 12, 12, 12, 12], - ... 'data.type': ['Goddard', - ... 'Goddard', - ... 'Goddard', - ... 'Goddard', - ... 'NRTSI-G'], - ... 'region': ['S', 'S', 'S', 'S', 'S'], - ... 'extent': [8.28, 9.48, 9.19, 9.41, 10.44], - ... 'area': [5.51, 6.23, 5.59, 6.59, 6.5] - ... }}) - >>> ghg_df.to_rdata( - ... "env_data_df.rda", - ... engine="rscript", - ... other_frames=[plants_df, sea_ice_df], - ... rda_names=["ghg_df", "plants_df", "sea_ice_df"] + >>> plants_df.to_rdata( + ... "plants_df.rda", + ... rda_name="plants_df", ... ) # doctest: +SKIP >>> R_code = ''' - ... load("env_data_df.rds") + ... load("plants_df.rda") ... ... mget(ls()) - ... $ghg_df - ... index gas year emissions - ... 1 0 Carbon dioxide 2018 5424.88 - ... 2 1 Methane 2018 634.46 - ... 3 2 Nitrous oxide 2018 434.53 - ... 4 3 Fluorinated gases 2018 182.78 - ... 5 4 Total 2018 6676.65 - ... ... $plants_df ... index plant_group status count ... 1 0 Pteridophytes Data Deficient 398 @@ -2460,56 +2398,19 @@ def to_rdata( ... 3 2 Pteridophytes Not Threatened 1294 ... 4 3 Pteridophytes Possibly Threatened 408 ... 5 4 Pteridophytes Threatened 1275 - ... - ... $sea_ice_df - ... index year mo data.type region extent area - ... 1 0 2016 12 Goddard S 8.28 5.51 - ... 2 1 2017 12 Goddard S 9.48 6.23 - ... 3 2 2018 12 Goddard S 9.19 5.59 - ... 4 3 2019 12 Goddard S 9.41 6.59 - ... 5 4 2020 12 NRTSI-G S 10.44 6.50 ... ''' """ - from pandas.io.rdata import ( - RSCRIPT_EXISTS, - PyReadrWriter, - RscriptWriter, - ) - - pyreadr = import_optional_dependency("pyreadr", errors="ignore") - pyarrow = import_optional_dependency("pyarrow", errors="ignore") - - RDataWriter: type[PyReadrWriter] | type[RscriptWriter] + from pandas.io.rdata import PyReadrWriter - if engine == "pyreadr": - if pyreadr is None: - raise ImportError("pyreadr not found, please install for this engine.") - RDataWriter = PyReadrWriter + import_optional_dependency("pyreadr") - elif engine == "rscript": - if RSCRIPT_EXISTS is None: - raise FileNotFoundError( - "R is either not installed on this system or its " - "bin folder is not in Path environment variable."
- ) - if pyarrow is None and mode in ["parquet", "feather"]: - raise ImportError("pyarrow not found, please install for this mode.") - RDataWriter = RscriptWriter - else: - raise ValueError(f"{engine} is not a supported engine.") - - rdata_writer = RDataWriter( + rdata_writer = PyReadrWriter( self, path_or_buffer=path_or_buffer, file_format=file_format, - engine=engine, - mode=mode, - other_frames=other_frames, - rda_names=rda_names, + rda_name=rda_name, index=index, - ascii=ascii, - compress=compress, - encoding=encoding, + compression=compression, storage_options=storage_options, ) diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index 2595149c03444..91852f5bd281a 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -1,27 +1,22 @@ from datetime import datetime import io import os -import platform -import subprocess from tempfile import TemporaryDirectory from typing import ( Dict, List, Optional, - Type, Union, ) from pandas._typing import ( Buffer, + CompressionOptions, FilePathOrBuffer, StorageOptions, ) from pandas.compat._optional import import_optional_dependency -from pandas.errors import ( - AbstractMethodError, - ParserError, -) +from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.core.dtypes.common import is_list_like @@ -36,51 +31,14 @@ is_url, stringify_path, ) -from pandas.io.feather_format import read_feather -from pandas.io.parquet import read_parquet -from pandas.io.parsers import read_csv -from pandas.io.sql import read_sql - - -class RScriptError(Exception): - """ - Exception raises when command line call to RScript throws a non-empty - error message. Message will capture verbatim R output in console. - """ - - pass - - -def _executable_exists(name) -> bool: - """ - Internal method to check if R exists on system. - - This method will return True if R is installed for Rscript command - line call and if machine recognizes Rscript in Path env variable. - """ - - WHICH_CMD = "where" if platform.system() == "Windows" else "which" - - return ( - subprocess.call( - [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - == 0 - ) - - -RSCRIPT_EXISTS = _executable_exists("Rscript") @doc(storage_options=_shared_docs["storage_options"]) def read_rdata( path_or_buffer: FilePathOrBuffer, file_format: str = "infer", - engine: str = "pyreadr", - mode: str = "csv", select_frames: Optional[List[str]] = None, rownames: bool = True, - encoding: str = "utf-8", storage_options: StorageOptions = None, ) -> Union[DataFrame, Dict[str, DataFrame]]: r""" @@ -99,24 +57,6 @@ def read_rdata( commands. Default 'infer' will use extension in file name to to determine the format type. - engine : {{'pyreadr'. 'rscript'}}, default 'pyreadr' - Engine used to parse or read R data. Currently, two types are - supported: ``pyreadr`` which requires the pyreadr package to be - installed and ``rscript`` which requires R to be installed on machine. - For ``rscript``, be sure the R bin installation folder is included in - the system Path environment variable. The ``pyreadr`` is the faster - parser to handle most needs but ``rscript`` engine provides fuller - support of rda and rds formats since it calls native R commands. - - mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv' - Python and R I/O transfer mode that only applies to ``rscript`` - engine (ignored for ``pyreadr``). Using ``csv`` (text approach), no - additional packages are required. 
Using ``parquet`` or ``feather`` - (binary approach) requires pyarrow installed in Python and arrow - package installed in R. Using ``sqlite`` (database approach) requires - RSQLite package installed in R. Binary will usually be faster to process - than text data. Database usually ensures data type integrity. - select_frames : list, default None Selected names of DataFrames to return from R rda and RData types that can contain multiple objects. @@ -124,10 +64,6 @@ def read_rdata( rownames : bool, default True Include original rownames in R data frames to map into a DataFrame index. - encoding : str, optional, default 'utf-8' - Encoding of R data. Currently, ``pyreadr`` engine only supports utf-8 - encoded data. - {storage_options} Returns @@ -144,16 +80,9 @@ def read_rdata( Notes ----- - For ``pyreadr`` engine, any R data file that contains a non-data.frame object - may raise parsing errors. For ``rscript`` engine, such objects will be - ignored. Both methods will or attempt to return data.frame objects or any - object that is coercible to R's data.frame such as matrix, tibble, - and data.table. For arrays, method will attempt to convert to 2D - structure and may not reproduce original R object representation. - - If object in rds types or all objects in rda or RData types are not data - frames, this method will raise an error and will not return None or an empty - dictionary. + Any R data file that contains a non-data.frame object may raise parsing errors. + Method will return data.frame, matrix, and data.frame like object such as + tibbles and data.tables. For ``pyreadr`` engine, ``select_frames`` above is synonymous to ``use_objects`` in package's `read_r` method. Also, ``timezone`` argument defaults to current @@ -161,7 +90,7 @@ def read_rdata( Examples -------- - To read an .rds file which only contains a single object, below returns a + For an .rds file which only contains a single R object, method returns a DataFrame: >>> R_code = ''' @@ -195,7 +124,7 @@ def read_rdata( 4 Fluorinated gases 2018 182.78 5 Total 2018 6676.65 - To read an .rda or .RData file which can contain multiple objects, blue + For an .RData or .rda file which can contain multiple R objects, method returns a ``dict`` of DataFrames: >>> R_code = ''' @@ -255,79 +184,13 @@ def read_rdata( 5 2020 12 NRTSI-G S 10.44 6.50}} """ - return _parse( - path_or_buffer=path_or_buffer, - file_format=file_format, - engine=engine, - mode=mode, - select_frames=select_frames, - rownames=rownames, - encoding=encoding, - storage_options=storage_options, - ) - + import_optional_dependency("pyreadr") -def _parse( - path_or_buffer, - file_format, - engine, - mode, - select_frames, - rownames, - encoding, - storage_options, - **kwargs, -) -> Union[DataFrame, Dict[str, DataFrame]]: - """ - Call internal parser classes. - - This method will conditionally call internal parsers: - _PyReadrParser or _RscriptParser. - - Raises - ------ - FileNotFoundError - * If Rscript bin executable is not installed or found on machine. - - ImportError - * If pyreadr for engine and pyarrow for mode is not installed. - - ValueError - * If engine is neither pyreadr or rscript. 
- """ - pyreadr = import_optional_dependency("pyreadr", errors="ignore") - pyarrow = import_optional_dependency("pyarrow", errors="ignore") - - RDataReader: Union[Type[_PyReadrParser], Type[_RscriptParser]] - - if engine == "pyreadr": - if pyreadr is None: - raise ImportError("pyreadr not found, please install for this engine.") - - RDataReader = _PyReadrParser - - elif engine == "rscript": - if RSCRIPT_EXISTS is None: - raise FileNotFoundError( - "R is either not installed on this system or its " - "bin folder is not in Path environment variable." - ) - - if pyarrow is None and mode in ["parquet", "feather"]: - raise ImportError("pyarrow not found, please install for this mode.") - - RDataReader = _RscriptParser - else: - raise ValueError(f"{engine} is not a supported engine.") - - rdr = RDataReader( + rdr = _PyReadrParser( path_or_buffer, file_format, - engine, - mode, select_frames, rownames, - encoding, storage_options, ) @@ -409,21 +272,12 @@ class _RDataReader: file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' R serialization type. - engine : {{'pyreadr', 'rscript'}}, default 'pyreadr' - Engine used to parse or read R data. - - mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv' - Python and R i/o transfer mode. - select_frames : list, default None Selected names of DataFrames to return from R data. rownames : bool, default True Include original rownames in R data frames. - encoding : str, optional, default 'utf-8' - Encoding of R data. - storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., @@ -431,7 +285,6 @@ class _RDataReader: See also -------- pandas.io.rdata._PyReadrParser - pandas.io.rdata._RscriptParser Notes ----- @@ -447,20 +300,14 @@ def __init__( self, path_or_buffer, file_format, - engine, - mode, select_frames, rownames, - encoding, storage_options, ) -> None: self.path_or_buffer = path_or_buffer self.file_format = file_format.lower() - self.engine = engine - self.mode = mode self.select_frames = select_frames self.rownames = rownames - self.encoding = encoding self.storage_options = storage_options def verify_params(self) -> None: @@ -489,14 +336,6 @@ def verify_params(self) -> None: if self.file_format == "infer": self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:] - if self.mode is not None and self.mode not in [ - "csv", - "feather", - "parquet", - "sqlite", - ]: - raise ValueError(f"'{self.mode}' is not supported value for mode.") - if self.select_frames is not None and not is_list_like(self.select_frames): raise TypeError( f"{type(self.select_frames).__name__} is " @@ -508,14 +347,14 @@ def buffer_to_disk(self, tmp_dir: str) -> str: Convert path or buffer to disk file. This method will convert path_or_buffer to temp file - for pyreadr to parse and rscript to import. + for pyreadr to parse from disk. """ r_temp = os.path.join(tmp_dir, "rdata.rda") handle_data = _get_data_from_filepath( filepath_or_buffer=self.path_or_buffer, - encoding=self.encoding, + encoding="utf-8", compression=None, storage_options=self.storage_options, ) @@ -584,524 +423,6 @@ def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: return rdata -class _RscriptParser(_RDataReader): - """ - Internal class to parse R data types using temp script and data - files and command line call to installed Rscript executable. 
- """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.verify_params() - - def handle_rownames(self, df) -> DataFrame: - if self.rownames: - df = df.set_index("rownames") - else: - df = df.drop(["rownames"], axis=1) - - return df - - def run_rscript(self, tmp_dir, r_batch, cmds) -> str: - """ - Run R script at command line. - - This method will call subprocess.Popen to run R script that - saves temp data and meta files and returns R's console output. - """ - - with open(cmds[1], "w") as f: - f.write(r_batch) - - p = subprocess.Popen( - cmds, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding=self.encoding, - cwd=tmp_dir, - ) - output, error = p.communicate() - if len(error) != 0: - raise RScriptError(error) - - return output - - def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: - self.r_to_py_types = { - "logical": "bool", - "integer": "int64", - "numeric": "float64", - "factor": "category", - "character": "str", - "Date": "date", - "POSIXct": "date", - } - - switch_board = { - "rda": { - "csv": self.read_rdata_csv, - "feather": self.read_rdata_feather, - "parquet": self.read_rdata_parquet, - "sqlite": self.read_rdata_sqlite, - }, - "rdata": { - "csv": self.read_rdata_csv, - "feather": self.read_rdata_feather, - "parquet": self.read_rdata_parquet, - "sqlite": self.read_rdata_sqlite, - }, - "rds": { - "csv": self.read_rds_csv, - "feather": self.read_rds_feather, - "parquet": self.read_rds_parquet, - "sqlite": self.read_rds_sqlite, - }, - } - - rdata: Union[DataFrame, Dict[str, DataFrame], None] - rdata = switch_board[self.file_format][self.mode]() - - rdata = ( - {k: v for k, v in rdata.items() if k in self.select_frames} - if self.select_frames - else rdata - ) - rdata = {k: self.handle_rownames(df) for k, df in rdata.items()} - - rdata = rdata or None - rdata = ( - rdata["r_df"] - if (self.file_format == "rds" and rdata is not None) - else rdata - ) - - if rdata is None: - raise ValueError( - "No actual data frame or coercible data frames found in R data file." - ) - return rdata - - def read_rdata_csv(self) -> Dict[str, DataFrame]: - """ - Read R rda data via IO csv. - - This method will call `load` and `write.csv` in R to export all - data frames and metadata into temp csv files for pandas `read_csv`. . 
- """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - load(args[1], temp_env <- new.env()) - - env_list <- as.list.environment(temp_env) - rm(temp_env) - - output_data_meta <- function(obj, nm) { - df <- tryCatch(data.frame(obj, - check.names=FALSE, - stringsAsFactors=FALSE - ), error=function(e) NULL) - - if (!is.null(df)) { - cat(nm, "\n", sep="") - - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - writeLines( - c(paste0(colnames(df), collapse=","), - paste0(sapply(df, - function(x) class(x)[1]), collapse=",")), - con=paste0("meta_", nm, ".txt") - ) - - write.csv(df, paste0("data_", nm, ".csv"), - row.names=FALSE, na="") - } - } - - output <- mapply(output_data_meta, env_list, names(env_list)) - """ - - with TemporaryDirectory() as tmp_dir: - r_file = os.path.join(tmp_dir, "r_batch.R") - rda_file = self.buffer_to_disk(tmp_dir) - - output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output_list = [i for i in output.strip().split("\n") if i != ""] - - oline: str - dfs: Dict[str, DataFrame] = {} - for oline in output_list: - with open( - os.path.join(tmp_dir, f"meta_{oline}.txt"), - encoding=self.encoding, - ) as f: - flines = [fline.strip() for fline in f] - - r_hdrs: List[List[str]] = [h.split(",") for h in flines] - py_types = {n: self.r_to_py_types[d] for n, d in zip(*r_hdrs)} - - dt_cols = [col for col, d in py_types.items() if d == "date"] - py_types = {k: v for k, v in py_types.items() if v != "date"} - - try: - dfs[oline] = read_csv( - os.path.join(tmp_dir, f"data_{oline}.csv"), - dtype=py_types, # type: ignore[arg-type] - parse_dates=dt_cols, - encoding=self.encoding, - ) - except (ParserError, ValueError): - dfs[oline] = read_csv( - os.path.join(tmp_dir, f"data_{oline}.csv"), - encoding=self.encoding, - ) - - return dfs - - def read_rdata_feather(self) -> Dict[str, DataFrame]: - """ - Read R rda data via IO feather. - - This method will call `readRDS` and `write_feather` in R to export all - data frames into temp feather files for pandas `read_feather`. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - - args <- commandArgs(trailingOnly=TRUE) - - load(args[1], temp_env <- new.env()) - env_list <- as.list.environment(temp_env) - rm(temp_env) - - output_data_meta <- function(obj, nm) { - df <- tryCatch(data.frame(obj, - check.names=FALSE, - stringsAsFactors=FALSE - ), error=function(e) NULL) - - if (!is.null(df)) { - cat(nm, "\n", sep="") - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - arrow::write_feather(df, paste0("data_", nm, ".feather")) - } - } - - output <- mapply(output_data_meta, env_list, names(env_list)) - """ - - with TemporaryDirectory() as tmp_dir: - r_file = os.path.join(tmp_dir, "r_batch.R") - rda_file = self.buffer_to_disk(tmp_dir) - - output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output_list = [i for i in output.strip().split("\n") if i != ""] - - oline: str - dfs: Dict[str, DataFrame] = { - oline: read_feather(os.path.join(tmp_dir, f"data_{oline}.feather")) - for oline in output_list - } - - return dfs - - def read_rdata_parquet(self) -> Dict[str, DataFrame]: - """ - Read R rda data via IO parquet. - - This method will call `load` and `write_parquet` in R to export all - data frames into temp parquet files for pandas `read_parquet`. 
- """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - - args <- commandArgs(trailingOnly=TRUE) - - load(args[1], temp_env <- new.env()) - env_list <- as.list.environment(temp_env) - rm(temp_env) - - output_data_meta <- function(obj, nm) { - df <- tryCatch(data.frame(obj, - check.names=FALSE, - stringsAsFactors=FALSE - ), error=function(e) NULL) - - if (!is.null(df)) { - cat(nm, "\n", sep="") - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - arrow::write_parquet(df, paste0("data_", nm, ".parquet")) - } - } - - output <- mapply(output_data_meta, env_list, names(env_list)) - """ - - with TemporaryDirectory() as tmp_dir: - r_file = os.path.join(tmp_dir, "r_batch.R") - rda_file = self.buffer_to_disk(tmp_dir) - - output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output_list = [i for i in output.strip().split("\n") if i != ""] - - oline: str - dfs: Dict[str, DataFrame] = { - oline: read_parquet(os.path.join(tmp_dir, f"data_{oline}.parquet")) - for oline in output_list - } - - return dfs - - def read_rdata_sqlite(self) -> Dict[str, DataFrame]: - """ - Read R rda data via IO sql. - - This method will call `load` and `dbWriteTable` in R to export all - data frames into a temp SQLite database for pandas `read_sql`. - """ - import sqlite3 - - r_batch = """ - suppressPackageStartupMessages(library(RSQLite)) - - args <- commandArgs(trailingOnly=TRUE) - - load(args[1], temp_env <- new.env()) - env_list <- as.list.environment(temp_env) - rm(temp_env) - - conn <- dbConnect(RSQLite::SQLite(), "r_data.db") - output_data_meta <- function(obj, nm) { - df <- tryCatch(data.frame(obj, - check.names=FALSE, - stringsAsFactors=FALSE - ), error=function(e) NULL) - - if (!is.null(df)) { - cat(nm, "\n", sep="") - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - dbWriteTable(conn, paste0("data_", nm), df, row.names=FALSE) - } - } - - output <- mapply(output_data_meta, env_list, names(env_list)) - dbDisconnect(conn) - """ - - with TemporaryDirectory() as tmp_dir: - r_db = os.path.join(tmp_dir, "r_data.db") - r_file = os.path.join(tmp_dir, "r_batch.R") - rda_file = self.buffer_to_disk(tmp_dir) - - output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output_list = [i for i in output.strip().split("\n") if i != ""] - - oline: str - conn = sqlite3.connect(r_db) - dfs: Dict[str, DataFrame] = { - oline: read_sql(f"SELECT * FROM data_{oline}", conn) - for oline in output_list - } - conn.close() - - return dfs - - def read_rds_csv(self) -> Dict[str, DataFrame]: - """ - Read R rds data via IO csv. - - This method will call `readRDS` and `write.csv` in R to export single - data frame and metadata into temp csv files for pandas `read_csv`. 
- """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - - raw <- readRDS(args[1]) - df <- tryCatch(data.frame(raw, - check.names=FALSE, - stringsAsFactors=FALSE - ), error = function(e) NULL) - - if(!is.null(df)) { - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - write.csv(df, file=args[2], row.names=FALSE) - - cat(paste0(colnames(df), collapse=","),"|", - paste0(sapply(df, function(x) - class(x)[1]), collapse=","), - sep="") - } - """ - - dfs: Dict[str, DataFrame] = {} - with TemporaryDirectory() as tmp_dir: - r_data = os.path.join(tmp_dir, "r_data.csv") - r_file = os.path.join(tmp_dir, "r_batch.R") - - rds_file = self.buffer_to_disk(tmp_dir) - output = self.run_rscript( - tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data] - ) - - if os.path.isfile(r_data): - r_hdrs = [h.split(",") for h in output.split("|")] - n: str - py_types = {n: self.r_to_py_types[d] for n, d in zip(*r_hdrs)} - - dt_cols = [col for col, d in py_types.items() if d == "date"] - py_types = {k: v for k, v in py_types.items() if v != "date"} - - try: - dfs["r_df"] = read_csv( - r_data, - dtype=py_types, # type: ignore[arg-type] - parse_dates=dt_cols, - encoding=self.encoding, - ) - except (ParserError, ValueError): - dfs["r_df"] = read_csv(r_data) - - return dfs - - def read_rds_feather(self) -> Dict[str, DataFrame]: - """ - Read R rds data via IO feather. - - This method will call `readRDS` and `write_feather` in R to export single - data frame into a temp feather file for pandas `read_feather`. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - raw <- readRDS(args[1]) - df <- tryCatch(data.frame(raw, - check.names=FALSE, - stringsAsFactors=FALSE - ), error = function(e) NULL) - - if(!is.null(df)) { - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - arrow::write_feather(df, args[2]) - } - """ - - with TemporaryDirectory() as tmp_dir: - r_data = os.path.join(tmp_dir, "r_data.feather") - r_file = os.path.join(tmp_dir, "r_batch.R") - - rds_file = self.buffer_to_disk(tmp_dir) - self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) - - dfs: Dict[str, DataFrame] = ( - {"r_df": read_feather(r_data)} if os.path.isfile(r_data) else {} - ) - - return dfs - - def read_rds_parquet(self) -> Dict[str, DataFrame]: - """ - Read R rds data via IO parquet. - - This method will call `readRDS` and `write_parquet` in R to export - single data frame into a temp parquet file for pandas `read_parquet`. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - raw <- readRDS(args[1]) - df <- tryCatch(data.frame(raw, - check.names=FALSE, - stringsAsFactors=FALSE - ), error = function(e) NULL) - - if(!is.null(df)) { - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - arrow::write_parquet(df, args[2]) - } - """ - - with TemporaryDirectory() as tmp_dir: - r_data = os.path.join(tmp_dir, "r_data.parquet") - r_file = os.path.join(tmp_dir, "r_batch.R") - - rds_file = self.buffer_to_disk(tmp_dir) - self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) - - dfs: Dict[str, DataFrame] = ( - {"r_df": read_parquet(r_data, engine="pyarrow")} - if os.path.isfile(r_data) - else {} - ) - - return dfs - - def read_rds_sqlite(self) -> Dict[str, DataFrame]: - """ - Read R rds data via IO sql. 
- - This method will call `readRDS` and `dbWriteTable` in R to export - single data frame into a temp SQLite database for pandas `read_sql`. - """ - import sqlite3 - - r_batch = """ - suppressPackageStartupMessages(library(RSQLite)) - args <- commandArgs(trailingOnly=TRUE) - - raw <- readRDS(args[1]) - df <- tryCatch(data.frame(raw, - check.names=FALSE, - stringsAsFactors=FALSE - ), error = function(e) NULL) - - if(!is.null(df)) { - conn <- dbConnect(RSQLite::SQLite(), args[2]) - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - dbWriteTable(conn, "rdata", df, row.names=FALSE) - dbDisconnect(conn) - } - """ - - dfs: Dict[str, DataFrame] = {} - with TemporaryDirectory() as tmp_dir: - r_data = os.path.join(tmp_dir, "r_data.db") - r_file = os.path.join(tmp_dir, "r_batch.R") - - rds_file = self.buffer_to_disk(tmp_dir) - self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) - - if os.path.isfile(r_data): - conn = sqlite3.connect(r_data) - dfs["r_df"] = read_sql("SELECT * FROM rdata", conn) - conn.close() - - return dfs - - class RDataWriter: """ Subclass to write pandas DataFrames into R data files. @@ -1114,31 +435,14 @@ class RDataWriter: file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' R serialization type. - engine : {{'rscript','pyreadr'}}, default 'utf-8' - Engine used to write R data. - - mode : {{'csv', 'parquet', 'feather'}}, default 'csv' - Python and R i/o transfer mode. - - other_frames : list, optional - Other DataFrames to be included in rda (not rds) files - that can contain multiple objects. - - rda_names : list, default ["pandas_dataframe"] - Names for all exported objects in rda file. + rda_name : str, default "pandas_dataframe" + Name for exported DataFrame in rda file. index : bool, default True Include index or MultiIndex in output as separate columns. - ascii : bool, default False - Write data in ASCII representation. - - compress : bool or {{'gzip', 'bzip2', 'xz'}}, default 'gzip' - Compression types for R data. For pyreadr engine, only gzip - is supported. Use False for uncompressed files. - - encoding : str, optional, default 'utf-8' - Encoding of R data. + compression : {'gzip', 'bz2', 'xz', None}, default 'gzip' + Compression type for on-the-fly decompression of on-disk data. 
storage_options : dict, optional Extra options that make sense for a particular storage connection, @@ -1147,7 +451,6 @@ class RDataWriter: See also -------- pandas.io.rdata.PyReadrWriter - pandas.io.rdata.RscriptWriter Notes ----- @@ -1163,27 +466,17 @@ def __init__( frame: DataFrame, path_or_buffer: FilePathOrBuffer, file_format: str = "infer", - engine: str = "rscript", - mode: str = "csv", - other_frames: Optional[List[DataFrame]] = None, - rda_names: List[str] = ["pandas_dataframe"], + rda_name: str = "pandas_dataframe", index: bool = True, - ascii: bool = False, - compress: Union[bool, str] = "gzip", - encoding: str = "utf-8", + compression: CompressionOptions = "gzip", storage_options: StorageOptions = None, ) -> None: self.frame = frame self.path_or_buffer = path_or_buffer self.file_format = file_format.lower() - self.engine = engine - self.mode = mode - self.other_frames = other_frames - self.rda_names = rda_names + self.rda_name = rda_name self.index = index - self.ascii = ascii - self.compress = compress - self.encoding = encoding + self.compression = compression self.storage_options = storage_options def verify_params(self) -> None: @@ -1212,40 +505,14 @@ def verify_params(self) -> None: if self.file_format == "infer" and isinstance(self.path_or_buffer, str): self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:] - if self.mode is not None and self.mode not in [ - "csv", - "feather", - "parquet", - "sqlite", - ]: - raise ValueError(f"{self.mode} is not supported value for mode.") - - if self.other_frames is not None and not is_list_like(self.other_frames): - raise TypeError( - f"{type(self.other_frames).__name__} is not " - " a valid type for other_frames." - ) - elif self.other_frames is not None: - for df in self.other_frames: - if not isinstance(df, DataFrame): - raise TypeError( - "One or more of the objects in " - "other_frames is not a DataFrame." - ) - - if self.rda_names is not None and not is_list_like(self.rda_names): - raise TypeError( - f"{type(self.rda_names).__name__} is not a valid type for rda_names." - ) - - if self.compress is not None and self.compress not in [ - True, - False, + if self.compression is not None and self.compression not in [ "gzip", - "bzip2", + "bz2", "xz", ]: - raise ValueError(f"{self.compress} is not a supported value for compress.") + raise ValueError( + f"{self.compression} is not a supported value for compression." + ) def disk_to_buffer(self, r_file: str) -> None: """ @@ -1258,7 +525,7 @@ def disk_to_buffer(self, r_file: str) -> None: with get_handle( self.path_or_buffer, "wb", - compression=None, + compression=self.compression, storage_options=self.storage_options, is_text=False, ) as handles: @@ -1306,527 +573,16 @@ def write_data(self) -> None: write_rdata( path=r_temp, df=self.frame, - df_name=self.rda_names[0], - compress=self.compress, + df_name=self.rda_name, + compress=None, ) elif self.file_format == "rds": - write_rds(path=r_temp, df=self.frame, compress=self.compress) - - self.disk_to_buffer(r_temp) - - return None - - -class RscriptWriter(RDataWriter): - """ - Main class called in `pandas.core.frame` to write DataFrame(s) to R - data types using command line to Rscript. 
- """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.verify_params() - self.handle_objects() - - def handle_objects(self) -> None: - - self.all_frames = ( - [self.frame] + self.other_frames if self.other_frames else [self.frame] - ) - - if len(self.rda_names) != len(self.all_frames): - raise ValueError( - f"Length of {self.rda_names} does not match number " - "of current DataFrame and other_frames" - ) - - return None - - def run_rscript(self, tmp_dir, r_batch, cmds) -> None: - """ - Run R script at command line. - - This method will call subprocess.Popen to run R script - and return only non-empty error R output in console. - """ - - with open(cmds[1], "w") as f: - f.write(r_batch) - - a = subprocess.Popen( - cmds, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding=self.encoding, - cwd=tmp_dir, - ) - output, error = a.communicate() - if len(error) != 0: - raise RScriptError(error) - - return None - - def write_data(self) -> None: - self.py_to_r_types = { - "int32": "integer", - "int64": "integer", - "float64": "numeric", - "category": "factor", - "object": "character", - "bool": "logical", - "datetime64[ns]": "POSIXct", - } - - switch_board = { - "rda": { - "csv": self.write_rdata_csv, - "feather": self.write_rdata_feather, - "parquet": self.write_rdata_parquet, - "sqlite": self.write_rdata_sqlite, - }, - "rdata": { - "csv": self.write_rdata_csv, - "feather": self.write_rdata_feather, - "parquet": self.write_rdata_parquet, - "sqlite": self.write_rdata_sqlite, - }, - "rds": { - "csv": self.write_rds_csv, - "feather": self.write_rds_feather, - "parquet": self.write_rds_parquet, - "sqlite": self.write_rds_sqlite, - }, - } - - switch_board[self.file_format][self.mode]() - - return None - - def write_rdata_csv(self) -> None: - """ - Write R rda data via IO csv. - - This method will export one or more DataFrames into temp data - and metadata csv files and call `read.csv` and `save` in R. - """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - - py_names <- strsplit(args[1], ",")[[1]] - - for(obj in py_names) { - meta <- paste0("meta_", obj, ".txt") - r_types <- strsplit(readLines(meta, n=-1, - warn=FALSE), ",")[[1]] - - data <- paste0("data_", obj, ".csv") - df <- tryCatch( - read.csv(data, colClasses=r_types), - error = function(e) read.csv(data) - ) - assign(obj, df) - rm(df) - } - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - dfs <- names(Filter(is.data.frame, mget(ls()))) - save(list=dfs, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - for nm, df in zip(self.rda_names, self.all_frames): - - data_file = os.path.join(tmp_dir, f"data_{nm}.csv") - meta_file = os.path.join(tmp_dir, f"meta_{nm}.txt") - r_code = os.path.join(tmp_dir, "rbatch.R") - r_temp = os.path.join(tmp_dir, "rdata.rda") - - df = df.reset_index() if self.index else df - df.to_csv(data_file, index=False) - - with open(meta_file, "w") as f: - f.write( - ",".join( - self.py_to_r_types[p] - for p in df.dtypes.astype(str).tolist() - ) - ) - - cmds = [ - "Rscript", - r_code, - ",".join(self.rda_names), - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - return None - - def write_rdata_feather(self) -> None: - """ - Write R rda data via IO feather. 
- - This method will export one or more DataFrames into temp - feather files and call `read_feather` and `save` in R. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - py_names <- strsplit(args[1], ",")[[1]] - - for(obj in py_names) { - data <- paste0("data_", obj, ".feather") - df <- arrow::read_feather(data) - assign(obj, df) - rm(df) - } - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - dfs <- names(Filter(is.data.frame, mget(ls()))) - save(list=dfs, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - for nm, df in zip(self.rda_names, self.all_frames): - - data_file = os.path.join(tmp_dir, f"data_{nm}.feather") - r_code = os.path.join(tmp_dir, "rbatch.R") - r_temp = os.path.join(tmp_dir, "rdata.rda") - - df = df.reset_index() if self.index else df.reset_index(drop=True) - df.to_feather(data_file) - - cmds = [ - "Rscript", - r_code, - ",".join(self.rda_names), - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rdata_parquet(self) -> None: - """ - Write R rda data via IO parquet. - - This method will export one or more DataFrames into temp - parquet files and call `read_parquet` and `save` in R. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - py_names <- strsplit(args[1], ",")[[1]] - - for(obj in py_names) { - data <- paste0("data_", obj, ".parquet") - df <- arrow::read_parquet(data) - assign(obj, df) - rm(df) - } - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - dfs <- names(Filter(is.data.frame, mget(ls()))) - save(list=dfs, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - for nm, df in zip(self.rda_names, self.all_frames): - - data_file = os.path.join(tmp_dir, f"data_{nm}.parquet") - r_code = os.path.join(tmp_dir, "rbatch.R") - r_temp = os.path.join(tmp_dir, "rdata.rda") - - df = df.reset_index() if self.index else df - df.to_parquet(data_file, index=False) - - cmds = [ - "Rscript", - r_code, - ",".join(self.rda_names), - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rdata_sqlite(self) -> None: - """ - Write R rda data via IO sql. - - This method will export one or more DataFrames into a temp - SQLite database and call `dbReadTable` and `save` in R. 
- """ - import sqlite3 - - r_batch = """ - suppressPackageStartupMessages(library(RSQLite)) - args <- commandArgs(trailingOnly=TRUE) - - conn <- dbConnect(RSQLite::SQLite(), args[1]) - py_names <- strsplit(args[2], ",")[[1]] - - for(obj in py_names) { - data <- paste0("data_", obj) - df <- dbReadTable(conn, data) - assign(obj, df) - rm(df) - } - dbDisconnect(conn) - - r_ascii <- as.logical(args[4]) - r_compress <- ifelse(args[5] %in% c("True", "False"), - as.logical(args[5]), - args[5]) - - dfs <- names(Filter(is.data.frame, mget(ls()))) - save(list=dfs, file=args[3], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - r_db = os.path.join(tmp_dir, "rdata.db") - conn = sqlite3.connect(r_db) - - for nm, df in zip(self.rda_names, self.all_frames): - r_code = os.path.join(tmp_dir, "rbatch.R") - r_temp = os.path.join(tmp_dir, "rdata.rda") - - df = df.reset_index() if self.index else df - df.to_sql(f"data_{nm}", conn, index=False) - - conn.close() - cmds = [ - "Rscript", - r_code, - r_db, - ",".join(self.rda_names), - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rds_csv(self) -> None: - """ - Write R rds data via IO csv. - - This method will export a single DataFrame into temp csv - data and call `read.csv` and `saveRDS` in R. - """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - py_data <- args[1] - r_types <- strsplit(args[2], ",")[[1]] - - df <- tryCatch( - read.csv(py_data, colClasses=r_types), - error = function(e) read.csv(py_data) - ) - - r_ascii <- as.logical(args[4]) - r_compress <- ifelse(args[5] %in% c("True", "False"), - as.logical(args[5]), - args[5]) - - saveRDS(df, file=args[3], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - r_code = os.path.join(tmp_dir, "rbatch.R") - py_data = os.path.join(tmp_dir, "pydata.csv") - r_temp = os.path.join(tmp_dir, "rdata.rds") - - py_df = self.frame.reset_index() if self.index else self.frame - r_types = ",".join(py_df.dtypes.astype(str).replace(self.py_to_r_types)) - - py_df.to_csv(py_data, index=False) - - cmds = [ - "Rscript", - r_code, - py_data, - r_types, - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) + write_rds( + path=r_temp, + df=self.frame, + compress=None, + ) self.disk_to_buffer(r_temp) return None - - def write_rds_feather(self) -> None: - """ - Write R rds data via IO feather. - - This method will export a single DataFrame into a temp - feather file to call `read_feather` and `saveRDS` in R. 
- """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - df <- arrow::read_feather(args[1]) - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - saveRDS(df, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - r_code = os.path.join(tmp_dir, "rbatch.R") - py_data = os.path.join(tmp_dir, "pydata.feather") - r_temp = os.path.join(tmp_dir, "rdata.rds") - - py_df = ( - self.frame.reset_index() - if self.index - else self.frame.reset_index(drop=True) - ) - - py_df.to_feather(py_data) - - cmds = [ - "Rscript", - r_code, - py_data, - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rds_parquet(self) -> None: - """ - Write R rds data via IO parquet. - - This method will export a single DataFrame into a temp - parquet file for `read_parquet` and `saveRDS` in R. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - df <- arrow::read_parquet(args[1]) - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - saveRDS(df, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - r_code = os.path.join(tmp_dir, "rbatch.R") - py_data = os.path.join(tmp_dir, "pydata.parquet") - r_temp = os.path.join(tmp_dir, "rdata.rds") - - py_df = self.frame.reset_index() if self.index else self.frame - - py_df.to_parquet(py_data, index=False) - - cmds = [ - "Rscript", - r_code, - py_data, - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rds_sqlite(self) -> None: - """ - Write R rds data via IO sql. - - This method will export a single DataFrame into a temp - parquet file for `dbReadTable` and `saveRDS` in R. 
- """ - import sqlite3 - - r_batch = """ - suppressPackageStartupMessages(library(RSQLite)) - args <- commandArgs(trailingOnly=TRUE) - - conn <- dbConnect(RSQLite::SQLite(), args[1]) - df <- dbReadTable(conn, "pydata") - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - saveRDS(df, file=args[2], - ascii=r_ascii, compress=r_compress) - dbDisconnect(conn) - """ - - with TemporaryDirectory() as tmp_dir: - r_code = os.path.join(tmp_dir, "rbatch.R") - py_data = os.path.join(tmp_dir, "pydata.db") - r_temp = os.path.join(tmp_dir, "rdata.rds") - - py_df = self.frame.reset_index() if self.index else self.frame - - conn = sqlite3.connect(py_data) - py_df.to_sql("pydata", conn, index=False) - conn.close() - - cmds = [ - "Rscript", - r_code, - py_data, - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py deleted file mode 100644 index 5df2b499a66a3..0000000000000 --- a/pandas/tests/io/rdata/test_rscript.py +++ /dev/null @@ -1,987 +0,0 @@ -from io import BytesIO -import os -import subprocess -from urllib.error import HTTPError - -import pytest - -from pandas.compat._optional import import_optional_dependency -import pandas.util._test_decorators as td - -from pandas import DataFrame -import pandas._testing as tm - -from pandas.io.rdata import ( - RSCRIPT_EXISTS, - RScriptError, - read_rdata, -) - -pytestmark = pytest.mark.skipif(not RSCRIPT_EXISTS, reason="R is not installed.") - -ghg_df = DataFrame( - { - "gas": { - "141": "Carbon dioxide", - "142": "Methane", - "143": "Nitrous oxide", - "144": "Fluorinated gases", - "145": "Total", - }, - "year": {"141": 2018, "142": 2018, "143": 2018, "144": 2018, "145": 2018}, - "emissions": { - "141": 5424.88150213288, - "142": 634.457127078267, - "143": 434.528555376666, - "144": 182.782432461777, - "145": 6676.64961704959, - }, - } -).rename_axis("rownames") - -plants_df = DataFrame( - { - "plant_group": { - "16": "Pteridophytes", - "17": "Pteridophytes", - "18": "Pteridophytes", - "19": "Pteridophytes", - "20": "Pteridophytes", - }, - "status": { - "16": "Data Deficient", - "17": "Extinct", - "18": "Not Threatened", - "19": "Possibly Threatened", - "20": "Threatened", - }, - "count": {"16": 398, "17": 65, "18": 1294, "19": 408, "20": 1275}, - } -).rename_axis("rownames") - -sea_ice_df = DataFrame( - { - "year": {"1012": 2016, "1013": 2017, "1014": 2018, "1015": 2019, "1016": 2020}, - "mo": {"1012": 12, "1013": 12, "1014": 12, "1015": 12, "1016": 12}, - "data.type": { - "1012": "Goddard", - "1013": "Goddard", - "1014": "Goddard", - "1015": "Goddard", - "1016": "NRTSI-G", - }, - "region": {"1012": "S", "1013": "S", "1014": "S", "1015": "S", "1016": "S"}, - "extent": { - "1012": 8.28, - "1013": 9.48, - "1014": 9.19, - "1015": 9.41, - "1016": 10.44, - }, - "area": {"1012": 5.51, "1013": 6.23, "1014": 5.59, "1015": 6.59, "1016": 6.5}, - } -).rename_axis("rownames") - - -def run_rscript(cmds) -> str: - """ - Run R script at command line. - - This method will read write_rdata output and check - console output. 
- """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - - switch(args[2], - "rda" = load(args[1]), - "rds" = { - pandas_dataframe <- readRDS(args[1]) - } - ) - - rm(args) - mget(ls()) - """ - with open(cmds[1], "w") as f: - f.write(r_batch) - - p = subprocess.Popen( - cmds, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding="UTF-8", - ) - output, error = p.communicate() - if len(error) != 0: - raise ValueError(error) - - return output - - -def r_package_installed(name): - """ - Check if R package is installed. - - Method runs a quick command line call to Rscript to - check if library call succeeds on named package. - """ - - p = subprocess.Popen( - ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - out, err = p.communicate() - - return len(err) == 0 - - -R_ARROW = r_package_installed("arrow") if RSCRIPT_EXISTS else None -R_RSQLITE = r_package_installed("RSQLite") if RSCRIPT_EXISTS else None -PYARROW = import_optional_dependency("pyarrow", errors="ignore") - - -def adj_int(df): - """ - Convert int32 columns to int64. - - Since parquet and feather modes parses ints int int32, - this method converts for testing. - """ - for col in df.select_dtypes("int32").columns: - df[col] = df[col].astype("int64") - - return df - - -def handle_index_rownames(df): - df = df.drop(["rownames"], axis=1).set_index("index").rename_axis(None) - - return df - - -@pytest.fixture(params=["rda", "rds"]) -def rtype(request): - return request.param - - -@pytest.fixture( - params=[ - "csv", - pytest.param( - "parquet", - marks=pytest.mark.skipif( - not R_ARROW or not PYARROW, - reason="R arrow or pyarrow not installed", - ), - ), - pytest.param( - "feather", - marks=pytest.mark.skipif( - not R_ARROW or not PYARROW, - reason="R arrow or pyarrow not installed", - ), - ), - pytest.param( - "sqlite", - marks=pytest.mark.skipif(not R_RSQLITE, reason="R RSQLite not installed"), - ), - ] -) -def mode(request): - return request.param - - -@pytest.fixture(params=[True, False, None]) -def ascii(request): - return request.param - - -@pytest.fixture(params=[False, "gzip", "bzip2", "xz"]) -def comp(request): - return request.param - - -# RDA READER - -# PATH_OR_BUFFER - - -def test_read_rds_file(datapath): - filename = datapath("io", "data", "rdata", "ghg_df.rds") - r_df = read_rdata(filename, engine="rscript") - - if isinstance(r_df, DataFrame): - tm.assert_frame_equal(ghg_df, r_df.tail()) - - -def test_read_rda_file(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript") - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) - tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) - tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) - - -def test_buffer_read_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - - with open(filename, "rb") as f: - r_df = read_rdata(f, file_format="rds", engine="rscript") - - output = adj_int(r_df).tail() - - tm.assert_frame_equal(sea_ice_df, output) - - -def test_bytes_read_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - - with open(filename, "rb") as f: - r_dfs = read_rdata(f, file_format="rda", engine="rscript") - - r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - 
tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) - tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) - tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) - - -def test_bytesio_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - - with open(filename, "rb") as f: - with BytesIO(f.read()) as b_io: - r_df = read_rdata(b_io, file_format="rds", engine="rscript") - - output = adj_int(r_df).tail() - - tm.assert_frame_equal(sea_ice_df, output) - - -def test_bytesio_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - - with open(filename, "rb") as f: - with BytesIO(f.read()) as b_io: - r_dfs = read_rdata(b_io, file_format="rda", engine="rscript") - - r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) - tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) - tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) - - -# FILE FORMAT - - -def test_read_wrong_format(datapath): - with pytest.raises(ValueError, match="not a valid value for file_format"): - filename = datapath("io", "data", "rdata", "plants_df.rds") - read_rdata(filename, engine="rscript", file_format="r") - - -def test_read_wrong_file(): - with pytest.raises(FileNotFoundError, match="file cannot be found"): - filename = os.path.join("data", "rdata", "plants_df.rda") - read_rdata(filename, engine="rscript") - - -@pytest.mark.slow -def test_read_rds_non_dfs(datapath, mode): - with pytest.raises( - ValueError, match="No actual data frame or coercible data frames" - ): - filename = datapath("io", "data", "rdata", "ghg_t_tests.rds") - read_rdata(filename, engine="rscript", mode=mode) - - -@pytest.mark.slow -def test_read_rda_non_dfs(datapath, mode): - with pytest.raises( - ValueError, match="No actual data frame or coercible data frames" - ): - filename = datapath("io", "data", "rdata", "env_data_non_dfs.rda") - read_rdata(filename, engine="rscript", mode=mode) - - -@td.skip_if_not_us_locale -def test_read_not_rda_file(datapath, mode): - with pytest.raises(RScriptError, match="bad restore file magic number"): - read_rdata( - datapath("io", "data", "rdata", "ppm_df.csv"), - file_format="rda", - engine="rscript", - mode=mode, - ) - - -@td.skip_if_not_us_locale -def test_read_not_rds_file(datapath, mode): - with pytest.raises(RScriptError, match="unknown input format"): - read_rdata( - datapath("io", "data", "rdata", "ppm_df.csv"), - file_format="rds", - engine="rscript", - mode=mode, - ) - - -def test_bytes_read_infer_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - - with pytest.raises(ValueError, match="Unable to infer file format from file name"): - with open(filename, "rb") as f: - read_rdata(f, engine="rscript") - - -def test_bytes_read_infer_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - - with pytest.raises(ValueError, match="Unable to infer file format from file name"): - with open(filename, "rb") as f: - read_rdata(f, engine="rscript") - - -# URL - - -@tm.network -def test_read_rda_url(): - url_df = DataFrame( - { - "carrier": {"1": "9E", "2": "AA", "3": "AS", "4": "B6", "5": "DL"}, - "name": { - "1": "Endeavor Air Inc.", - "2": "American Airlines Inc.", - "3": "Alaska Airlines Inc.", - "4": "JetBlue Airways", - "5": "Delta Air Lines Inc.", - }, - } - ).rename_axis("rownames") - - url = ( - 
"https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" - ) - r_df = read_rdata(url, file_format="rda", engine="rscript")["airlines"] - - tm.assert_frame_equal(url_df, r_df.head()) - - -@tm.network -def test_read_unable_infer_format(): - with pytest.raises(ValueError, match="Unable to infer file format from file name"): - url = ( - "https://github.com/hadley/nycflights13/" - "blob/master/data/airlines.rda?raw=true" - ) - read_rdata(url, engine="rscript") - - -@tm.network -def test_read_wrong_url(): - with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): - url = "https://example.com/data.rdata" - read_rdata(url, engine="rscript") - - -# S3 - - -@tm.network -@td.skip_if_no("s3fs") -def test_read_rda_s3(): - s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" - s3_df = DataFrame( - { - "Alcohol": {"1": 13.2, "2": 13.16, "3": 14.37, "4": 13.24, "5": 14.2}, - "Malic acid": {"1": 1.78, "2": 2.36, "3": 1.95, "4": 2.59, "5": 1.76}, - "Ash": {"1": 2.14, "2": 2.67, "3": 2.5, "4": 2.87, "5": 2.45}, - "Alcalinity of ash": { - "1": 11.2, - "2": 18.6, - "3": 16.8, - "4": 21.0, - "5": 15.2, - }, - "Magnesium": {"1": 100, "2": 101, "3": 113, "4": 118, "5": 112}, - "Total phenols": {"1": 2.65, "2": 2.8, "3": 3.85, "4": 2.8, "5": 3.27}, - "Flavanoids": {"1": 2.76, "2": 3.24, "3": 3.49, "4": 2.69, "5": 3.39}, - "Nonflavanoid phenols": { - "1": 0.26, - "2": 0.3, - "3": 0.24, - "4": 0.39, - "5": 0.34, - }, - "Proanthocyanins": {"1": 1.28, "2": 2.81, "3": 2.18, "4": 1.82, "5": 1.97}, - "Color intensity": {"1": 4.38, "2": 5.68, "3": 7.8, "4": 4.32, "5": 6.75}, - "Hue": {"1": 3.4, "2": 3.17, "3": 3.45, "4": 2.93, "5": 2.85}, - "Proline": {"1": 1050, "2": 1185, "3": 1480, "4": 735, "5": 1450}, - } - ).rename_axis("rownames") - r_dfs = read_rdata(s3, engine="rscript") - - tm.assert_frame_equal(s3_df, r_dfs["wine"].head()) - - -# ENGINE - - -def test_read_rds_df_output(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_dfs = read_rdata(filename, engine="rscript") - - assert isinstance(r_dfs, DataFrame) - - -def test_read_rda_dict_output(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript") - - assert isinstance(r_dfs, dict) - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - -def test_read_wrong_engine(datapath): - with pytest.raises(ValueError, match="not a supported engine"): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - read_rdata(filename, engine="rpy2") - - -# MODE - - -@pytest.mark.slow -def test_read_rds_mode_file(datapath, mode): - filename = datapath("io", "data", "rdata", "ghg_df.rds") - r_df = read_rdata(filename, engine="rscript", mode=mode) - - output = adj_int(r_df).tail() - - tm.assert_frame_equal(ghg_df, output) - - -@pytest.mark.slow -def test_read_rda_mode_file(datapath, mode): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript", mode=mode) - - if mode in ["parquet", "feather"]: - (r_dfs["ghg_df"], r_dfs["plants_df"], r_dfs["sea_ice_df"]) = ( - adj_int(r_dfs["ghg_df"]), - adj_int(r_dfs["plants_df"]), - adj_int(r_dfs["sea_ice_df"]), - ) - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) - tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) - tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) - - -def test_read_wrong_mode(datapath): - with 
pytest.raises(ValueError, match="not supported value for mode"): - filename = datapath("io", "data", "rdata", "plants_df.rds") - read_rdata(filename, engine="rscript", mode="pickle") - - -# USE_OBJECTS - - -def test_read_select_frames_rda_dfs(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata( - filename, engine="rscript", select_frames=["ghg_df", "sea_ice_df"] - ) - - assert "plants_df" not in list(r_dfs.keys()) - assert "ghg_df" in list(r_dfs.keys()) - assert "sea_ice_df" in list(r_dfs.keys()) - - -def test_read_select_frames_rda_objs(datapath): - filename = datapath("io", "data", "rdata", "env_data_objs.rda") - r_dfs = read_rdata( - filename, - engine="rscript", - select_frames=["ppm_ts", "species_mtx", "plants_arry"], - ) - - assert "species_vec" not in list(r_dfs.keys()) - assert "ghg_df" not in list(r_dfs.keys()) - - assert "ppm_ts" in list(r_dfs.keys()) - assert "species_mtx" in list(r_dfs.keys()) - assert "plants_arry" in list(r_dfs.keys()) - - -def test_read_wrong_select_frames(datapath): - with pytest.raises(TypeError, match="not a valid type for select_frames"): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - read_rdata( - filename, - engine="rscript", - select_frames="plants_df", # type: ignore[arg-type] - ) - - -# ROWNAMES - - -def test_read_rownames_true_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="rscript", rownames=True) - - if isinstance(r_df, DataFrame): - assert r_df.index.name == "rownames" - - -def test_read_rownames_false_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="rscript", rownames=False) - - if isinstance(r_df, DataFrame): - assert r_df.index.name != "rownames" - - -def test_read_rownames_true_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript", rownames=True) - - assert r_dfs["ghg_df"].index.name == "rownames" - assert r_dfs["plants_df"].index.name == "rownames" - assert r_dfs["sea_ice_df"].index.name == "rownames" - - -def test_read_rownames_false_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript", rownames=False) - - assert r_dfs["ghg_df"].index.name != "rownames" - assert r_dfs["plants_df"].index.name != "rownames" - assert r_dfs["sea_ice_df"].index.name != "rownames" - - -# ENCODING - - -@pytest.mark.xfail( - reason="R encoding is locale specific. Need to think about workaround." 
-) -def test_non_utf8_data(datapath, rtype): - filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") - - expected = DataFrame( - { - "número": { - "1": 1, - "2": 2, - "3": 3, - "4": 4, - "5": 5, - "6": 6, - "7": 7, - "8": 8, - "9": 9, - "10": 10, - }, - "punto central del climatismo": { - "1": "Parada de la circulación de vuelco meridional del Atlántico", - "2": "Desintegración de la capa de hielo de la Antártida occidental", - "3": "Muerte de la selva amazónica", - "4": "Cambio de monzón en África occidental", - "5": "Permafrost e hidratos de metano", - "6": "Muerte de los arrecifes de coral", - "7": "Cambio de monzón de la India", - "8": "Desintegración de la capa de hielo de Groenlandia", - "9": "Desplazamiento del bosque boreal", - "10": "Reducción del hielo marino del Ártico ", - }, - }, - index=[str(i) for i in range(1, 11)], - ).rename_axis("rownames") - - rdfs = read_rdata(filename, engine="rscript", encoding="iso-8859-1", mode="csv") - - output = rdfs["climate_df"] if rtype == "rda" else rdfs - - tm.assert_frame_equal(output, expected) - - -# RDA WRITER - -# PATH_OR_BUFFER - - -@pytest.mark.slow -def test_write_read_file(datapath, rtype, mode): - with tm.ensure_clean("test.out") as path: - ghg_df.to_rdata( - path, file_format=rtype, engine="rscript", mode=mode, index=False - ) - r_dfs = read_rdata( - path, file_format=rtype, engine="rscript", mode=mode, rownames=False - ) - - expected = ghg_df.reset_index(drop=True) - output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - output["year"] = output["year"].astype("int64") - - tm.assert_frame_equal(output, expected) - - -@pytest.mark.slow -def test_write_read_bytes_io(datapath, rtype, mode): - with BytesIO() as b_io: - sea_ice_df.to_rdata( - b_io, file_format=rtype, engine="rscript", mode=mode, index=False - ) - r_dfs = read_rdata( - b_io.getvalue(), # type: ignore[arg-type] - file_format=rtype, - engine="rscript", - mode=mode, - rownames=False, - ) - - expected = sea_ice_df.reset_index(drop=True) - output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - output["year"] = output["year"].astype("int64") - output["mo"] = output["mo"].astype("int64") - - tm.assert_frame_equal(output, expected) - - -# FILE_FORMAT - - -def test_write_rda_file(rtype): - expected = """\ -$pandas_dataframe - rownames year mo data.type region extent area -1 1012 2016 12 Goddard S 8.28 5.51 -2 1013 2017 12 Goddard S 9.48 6.23 -3 1014 2018 12 Goddard S 9.19 5.59 -4 1015 2019 12 Goddard S 9.41 6.59 -5 1016 2020 12 NRTSI-G S 10.44 6.50 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - sea_ice_df.to_rdata(out_file, file_format=rtype, engine="rscript") - - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -def test_write_wrong_format(): - with tm.ensure_clean("test.rda") as path: - with pytest.raises(ValueError, match=("not a valid value for file_format")): - ghg_df.to_rdata(path, engine="rscript", file_format="csv") - - -def test_write_unable_to_infer(): - with tm.ensure_clean("test") as path: - with pytest.raises( - ValueError, match=("Unable to infer file format from file name") - ): - ghg_df.to_rdata(path, engine="rscript") - - -# ENGINE - - -@td.skip_if_no("pyreadr") -def test_write_engine_consistency(rtype): - expected = """\ -$pandas_dataframe - rownames plant_group status count -1 16 Pteridophytes Data Deficient 398 -2 17 Pteridophytes Extinct 65 -3 
18 Pteridophytes Not Threatened 1294 -4 19 Pteridophytes Possibly Threatened 408 -5 20 Pteridophytes Threatened 1275 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - plants_df.to_rdata(out_file, file_format=rtype, engine="pyreadr") - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - pyr_output = run_rscript(cmds) - - plants_df.to_rdata(out_file, file_format=rtype, engine="rscript") - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - rcomp_output = run_rscript(cmds) - - assert pyr_output == expected - assert pyr_output == rcomp_output - - -def test_write_wrong_engine(): - with tm.ensure_clean("test.rda") as path: - with pytest.raises(ValueError, match=("not a supported engine")): - ghg_df.to_rdata(path, engine="rpy2") - - -# MODE - - -@pytest.mark.slow -def test_write_mode(rtype, mode): - expected = """\ -$pandas_dataframe - rownames gas year emissions -1 141 Carbon dioxide 2018 5424.8815 -2 142 Methane 2018 634.4571 -3 143 Nitrous oxide 2018 434.5286 -4 144 Fluorinated gases 2018 182.7824 -5 145 Total 2018 6676.6496 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - ghg_df.to_rdata(out_file, file_format=rtype, engine="rscript", mode=mode) - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -def test_write_wrong_mode(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises(ValueError, match=("not supported value for mode")): - ghg_df.to_rdata(path, engine="rscript", mode="pickle") - - -# INDEX - - -@pytest.mark.slow -def test_write_index_false(rtype, mode): - expected = """\ -$pandas_dataframe - gas year emissions -1 Carbon dioxide 2018 5424.8815 -2 Methane 2018 634.4571 -3 Nitrous oxide 2018 434.5286 -4 Fluorinated gases 2018 182.7824 -5 Total 2018 6676.6496 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - ghg_df.to_rdata( - out_file, file_format=rtype, index=False, engine="rscript", mode=mode - ) - - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -# ASCII - - -@pytest.mark.slow -def test_write_ascii_output(rtype, mode, ascii): - expected = """\ -$pandas_dataframe - rownames gas year emissions -1 141 Carbon dioxide 2018 5424.8815 -2 142 Methane 2018 634.4571 -3 143 Nitrous oxide 2018 434.5286 -4 144 Fluorinated gases 2018 182.7824 -5 145 Total 2018 6676.6496 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - ghg_df.to_rdata( - out_file, file_format=rtype, engine="rscript", mode=mode, ascii=ascii - ) - - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -@td.skip_if_windows -def test_write_read_ascii(rtype): - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - - ghg_df.to_rdata( - out_file, - file_format=rtype, - engine="rscript", - index=False, - ascii=True, - compress=False, - ) - - with open(out_file, newline="") as f: - r_dfs = read_rdata(f, file_format=rtype, engine="rscript", rownames=False) - - expected = ghg_df.reset_index(drop=True) - output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - 
output["year"] = output["year"].astype("int64") - - tm.assert_frame_equal(output, expected) - - -# COMPRESS - - -@pytest.mark.slow -def test_write_compress_types(rtype, mode, comp): - expected = """\ -$pandas_dataframe - rownames year mo data.type region extent area -1 1012 2016 12 Goddard S 8.28 5.51 -2 1013 2017 12 Goddard S 9.48 6.23 -3 1014 2018 12 Goddard S 9.19 5.59 -4 1015 2019 12 Goddard S 9.41 6.59 -5 1016 2020 12 NRTSI-G S 10.44 6.50 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - sea_ice_df.to_rdata( - out_file, file_format=rtype, engine="rscript", mode=mode, compress=comp - ) - - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -def test_write_wrong_comp(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises(ValueError, match=("not a supported value for compress")): - ghg_df.to_rdata(path, engine="rscript", compress="zip") - - -def test_write_none_comp(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises(RScriptError, match=("invalid 'compress' argument")): - ghg_df.to_rdata(path, engine="rscript", compress=None) - - -# OTHER_FRAMES - - -@pytest.mark.slow -def test_write_other_frames(mode): - expected = """\ -$ghg_df - rownames gas year emissions -1 141 Carbon dioxide 2018 5424.8815 -2 142 Methane 2018 634.4571 -3 143 Nitrous oxide 2018 434.5286 -4 144 Fluorinated gases 2018 182.7824 -5 145 Total 2018 6676.6496 - -$plants_df - rownames plant_group status count -1 16 Pteridophytes Data Deficient 398 -2 17 Pteridophytes Extinct 65 -3 18 Pteridophytes Not Threatened 1294 -4 19 Pteridophytes Possibly Threatened 408 -5 20 Pteridophytes Threatened 1275 - -$sea_ice_df - rownames year mo data.type region extent area -1 1012 2016 12 Goddard S 8.28 5.51 -2 1013 2017 12 Goddard S 9.48 6.23 -3 1014 2018 12 Goddard S 9.19 5.59 -4 1015 2019 12 Goddard S 9.41 6.59 -5 1016 2020 12 NRTSI-G S 10.44 6.50 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.rda") - r_code = os.path.join(tmp_dir, "r_test.R") - - ghg_df.to_rdata( - out_file, - engine="rscript", - other_frames=[plants_df, sea_ice_df], - rda_names=["ghg_df", "plants_df", "sea_ice_df"], - mode=mode, - ) - - cmds = ["Rscript", r_code, out_file, "rda", ""] - output = run_rscript(cmds) - - assert output == expected - - -def test_write_other_frames_wrong_type(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises( - TypeError, match=("objects in other_frames is not a DataFrame") - ): - ghg_df.to_rdata( - path, engine="rscript", other_frames=plants_df, rda_names=["plants_df"] - ) - - -def test_write_read_other_frames(datapath): - with tm.ensure_clean("test.rda") as path: - ghg_df.to_rdata( - path, - engine="rscript", - other_frames=[plants_df, sea_ice_df], - rda_names=["ghg_df", "plants_df", "sea_ice_df"], - ) - r_dfs = read_rdata(path, engine="rscript") - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - -# RDA NAMES - - -def test_write_mismatched_names_frames(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises( - ValueError, - match=("does not match number of current DataFrame and other_frames"), - ): - ghg_df.to_rdata( - path, - engine="rscript", - other_frames=[plants_df, sea_ice_df], - rda_names=["plants_df", "sea_ice_df"], - ) diff --git a/pandas/tests/io/rdata/test_pyreadr.py b/pandas/tests/io/test_rdata.py similarity index 73% rename from 
pandas/tests/io/rdata/test_pyreadr.py rename to pandas/tests/io/test_rdata.py index fbcc9b06523fc..129764e69596a 100644 --- a/pandas/tests/io/rdata/test_pyreadr.py +++ b/pandas/tests/io/test_rdata.py @@ -77,13 +77,8 @@ def rtype(request): return request.param -@pytest.fixture(params=[None, False, "gzip"]) -def ok_comp(request): - return request.param - - -@pytest.fixture(params=[True, "bzip2", "xz"]) -def bad_comp(request): +@pytest.fixture(params=[None, "gzip", "bz2", "xz"]) +def comp(request): return request.param @@ -115,7 +110,7 @@ def adj_int(df): def test_read_rds_file(datapath): filename = datapath("io", "data", "rdata", "ghg_df.rds") - r_df = read_rdata(filename, engine="pyreadr") + r_df = read_rdata(filename) output = adj_int(r_df).tail() tm.assert_frame_equal(ghg_df, output) @@ -123,7 +118,7 @@ def test_read_rds_file(datapath): def test_read_rda_file(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="pyreadr") + r_dfs = read_rdata(filename) r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} @@ -138,7 +133,7 @@ def test_bytes_read_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") with open(filename, "rb") as f: - r_df = read_rdata(f, file_format="rds", engine="pyreadr") + r_df = read_rdata(f, file_format="rds") output = adj_int(r_df).tail() @@ -149,7 +144,7 @@ def test_bytes_read_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") with open(filename, "rb") as f: - r_dfs = read_rdata(f, file_format="rda", engine="pyreadr") + r_dfs = read_rdata(f, file_format="rda") r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} @@ -165,7 +160,7 @@ def test_bytesio_rds(datapath): with open(filename, "rb") as f: with BytesIO(f.read()) as b_io: - r_df = read_rdata(b_io, file_format="rds", engine="pyreadr") + r_df = read_rdata(b_io, file_format="rds") output = adj_int(r_df).tail() @@ -177,7 +172,7 @@ def test_bytesio_rda(datapath): with open(filename, "rb") as f: with BytesIO(f.read()) as b_io: - r_dfs = read_rdata(b_io, file_format="rda", engine="pyreadr") + r_dfs = read_rdata(b_io, file_format="rda") r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} @@ -194,13 +189,13 @@ def test_bytesio_rda(datapath): def test_read_wrong_format(datapath): with pytest.raises(ValueError, match="not a valid value for file_format"): filename = datapath("io", "data", "rdata", "plants_df.rds") - read_rdata(filename, engine="pyreadr", file_format="r") + read_rdata(filename, file_format="r") def test_read_wrong_file(): with pytest.raises(FileNotFoundError, match="file cannot be found"): filename = os.path.join("data", "rdata", "plants_df.rda") - read_rdata(filename, engine="pyreadr") + read_rdata(filename) def test_read_rds_non_df(datapath): @@ -211,7 +206,7 @@ def test_read_rds_non_df(datapath): match="Invalid file, or file has unsupported features", ): filename = datapath("io", "data", "rdata", "ppm_ts.rds") - read_rdata(filename, engine="pyreadr") + read_rdata(filename) def test_read_rda_non_dfs(datapath): @@ -222,7 +217,7 @@ def test_read_rda_non_dfs(datapath): match="Invalid file, or file has unsupported features", ): filename = datapath("io", "data", "rdata", "env_data_non_dfs.rda") - read_rdata(filename, engine="pyreadr") + read_rdata(filename) def test_read_not_rda_file(datapath): @@ -232,7 +227,7 @@ def test_read_not_rda_file(datapath): custom_errors.LibrdataError, match="The file contains an unrecognized object" ): filename = datapath("io", "data", "rdata", "ppm_df.csv") - 
read_rdata(filename, file_format="rda", engine="pyreadr") + read_rdata(filename, file_format="rda") def test_bytes_read_infer_rds(datapath): @@ -240,7 +235,7 @@ def test_bytes_read_infer_rds(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f, engine="pyreadr") + read_rdata(f) def test_bytes_read_infer_rda(datapath): @@ -248,7 +243,7 @@ def test_bytes_read_infer_rda(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f, engine="pyreadr") + read_rdata(f) # URL @@ -272,7 +267,7 @@ def test_read_rda_url(): url = ( "https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" ) - r_dfs = read_rdata(url, file_format="rda", engine="pyreadr") + r_dfs = read_rdata(url, file_format="rda") tm.assert_frame_equal(url_df, r_dfs["airlines"].head()) @@ -284,14 +279,14 @@ def test_read_unable_infer_format(): "https://github.com/hadley/nycflights13/" "blob/master/data/airlines.rda?raw=true" ) - read_rdata(url, engine="pyreadr") + read_rdata(url) @tm.network def test_read_wrong_url(): with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): url = "https://example.com/data.rdata" - read_rdata(url, engine="pyreadr") + read_rdata(url) # S3 @@ -317,7 +312,7 @@ def test_read_rda_s3(): "Proline": {1: 1050, 2: 1185, 3: 1480, 4: 735, 5: 1450}, } ).rename_axis("rownames") - r_dfs = read_rdata(s3, engine="pyreadr") + r_dfs = read_rdata(s3) r_dfs["wine"] = adj_int(r_dfs["wine"]) # pyreadr remove dots in colnames @@ -331,38 +326,25 @@ def test_read_rda_s3(): def test_read_rds_df_output(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="pyreadr") + r_df = read_rdata(filename) assert isinstance(r_df, DataFrame) def test_read_rda_dict_output(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="pyreadr") + r_dfs = read_rdata(filename) assert isinstance(r_dfs, dict) assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] -def test_read_wrong_engine(datapath): - with pytest.raises(ValueError, match="not a supported engine"): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - read_rdata(filename, engine="rpy2") - - -# MODE - -# IGNORED OPTION FOR pyreadr ENGINE - - -# USE_OBJECTS +# SELECT_FRAMES def test_read_select_frames_rda_dfs(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata( - filename, engine="pyreadr", select_frames=["ghg_df", "sea_ice_df"] - ) + r_dfs = read_rdata(filename, select_frames=["ghg_df", "sea_ice_df"]) assert "plants_df" not in list(r_dfs.keys()) assert "ghg_df" in list(r_dfs.keys()) @@ -372,11 +354,7 @@ def test_read_select_frames_rda_dfs(datapath): def test_read_wrong_select_frames(datapath): with pytest.raises(TypeError, match="not a valid type for select_frames"): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - read_rdata( - filename, - engine="pyreadr", - select_frames="plants_df", # type: ignore[arg-type] - ) + read_rdata(filename, select_frames="plants_df") # ROWNAMES @@ -384,7 +362,7 @@ def test_read_wrong_select_frames(datapath): def test_read_rownames_true_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="pyreadr", rownames=True) + r_df = read_rdata(filename, rownames=True) if isinstance(r_df, DataFrame): assert r_df.index.name 
== "rownames" @@ -392,7 +370,7 @@ def test_read_rownames_true_rds(datapath): def test_read_rownames_false_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="pyreadr", rownames=False) + r_df = read_rdata(filename, rownames=False) if isinstance(r_df, DataFrame): assert r_df.index.name != "rownames" @@ -400,7 +378,7 @@ def test_read_rownames_false_rds(datapath): def test_read_rownames_true_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="pyreadr", rownames=True) + r_dfs = read_rdata(filename, rownames=True) assert r_dfs["ghg_df"].index.name == "rownames" assert r_dfs["plants_df"].index.name == "rownames" @@ -409,7 +387,7 @@ def test_read_rownames_true_rda(datapath): def test_read_rownames_false_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="pyreadr", rownames=False) + r_dfs = read_rdata(filename, rownames=False) assert r_dfs["ghg_df"].index.name != "rownames" assert r_dfs["plants_df"].index.name != "rownames" @@ -422,7 +400,7 @@ def test_read_rownames_false_rda(datapath): def test_non_utf8_data(datapath, rtype): filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") with pytest.raises(UnicodeDecodeError, match=("'utf-8' codec can't decode byte")): - read_rdata(filename, engine="pyreadr") + read_rdata(filename) # RDA WRITER @@ -432,8 +410,8 @@ def test_non_utf8_data(datapath, rtype): def test_write_read_file(rtype): with tm.ensure_clean("test.out") as path: - ghg_df.to_rdata(path, file_format=rtype, engine="pyreadr", index=False) - r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr", rownames=False) + ghg_df.to_rdata(path, file_format=rtype, index=False) + r_dfs = read_rdata(path, file_format=rtype, rownames=False) expected = ghg_df.reset_index(drop=True) output = ( @@ -448,10 +426,8 @@ def test_write_read_pathlib(rtype): with tm.ensure_clean_dir() as tmp_dir: tmp_file = Path(tmp_dir).joinpath("test.out") - sea_ice_df.to_rdata(tmp_file, file_format=rtype, engine="pyreadr", index=False) - r_dfs = read_rdata( - tmp_file, file_format=rtype, engine="pyreadr", rownames=False - ) + sea_ice_df.to_rdata(tmp_file, file_format=rtype, index=False) + r_dfs = read_rdata(tmp_file, file_format=rtype, rownames=False) expected = sea_ice_df.reset_index(drop=True) output = ( @@ -463,11 +439,10 @@ def test_write_read_pathlib(rtype): def test_write_read_filelike(rtype): with BytesIO() as b_io: - sea_ice_df.to_rdata(b_io, file_format=rtype, engine="pyreadr", index=False) + sea_ice_df.to_rdata(b_io, file_format=rtype, index=False) r_dfs = read_rdata( - b_io.getvalue(), # type: ignore[arg-type] + b_io.getvalue(), file_format=rtype, - engine="pyreadr", rownames=False, ) @@ -485,7 +460,7 @@ def test_write_read_filelike(rtype): def test_write_wrong_format(): with tm.ensure_clean("test.rda") as path: with pytest.raises(ValueError, match=("not a valid value for file_format")): - ghg_df.to_rdata(path, engine="pyreadr", file_format="csv") + ghg_df.to_rdata(path, file_format="csv") def test_write_unable_to_infer(): @@ -493,21 +468,7 @@ def test_write_unable_to_infer(): with pytest.raises( ValueError, match=("Unable to infer file format from file name") ): - ghg_df.to_rdata(path, engine="pyreadr") - - -# ENGINE - - -def test_write_wrong_engine(): - with tm.ensure_clean("test.rda") as path: - with pytest.raises(ValueError, match=("not a supported engine")): - ghg_df.to_rdata(path, engine="rpy2") - - 
-# MODE - -# IGNORED OPTION FOR pyreadr ENGINE + ghg_df.to_rdata(path) # INDEX @@ -515,10 +476,8 @@ def test_write_wrong_engine(): def test_index_true(rtype): with tm.ensure_clean("test.out") as path: - plants_df.rename_axis(None).to_rdata( - path, file_format=rtype, engine="pyreadr", index=True - ) - r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr") + plants_df.rename_axis(None).to_rdata(path, file_format=rtype, index=True) + r_dfs = read_rdata(path, file_format=rtype) r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] @@ -528,10 +487,8 @@ def test_index_true(rtype): def test_index_false(rtype): with tm.ensure_clean("test.out") as path: - plants_df.rename_axis(None).to_rdata( - path, file_format=rtype, engine="pyreadr", index=False - ) - r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr") + plants_df.rename_axis(None).to_rdata(path, file_format=rtype, index=False) + r_dfs = read_rdata(path, file_format=rtype) r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] @@ -539,20 +496,13 @@ def test_index_false(rtype): assert "index" not in r_df.columns -# ASCII - -# IGNORED OPTION FOR pyreadr ENGINE - - # COMPRESS -def test_compress_ok_comp(rtype, ok_comp): +def test_compress_ok_comp(rtype, comp): with tm.ensure_clean("test.out") as path: - ghg_df.to_rdata( - path, file_format=rtype, engine="pyreadr", compress=ok_comp, index=False - ) - r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr", rownames=False) + ghg_df.to_rdata(path, file_format=rtype, compression=comp, index=False) + r_dfs = read_rdata(path, file_format=rtype, rownames=False) expected = ghg_df.reset_index(drop=True) output = ( @@ -562,34 +512,10 @@ def test_compress_ok_comp(rtype, ok_comp): tm.assert_frame_equal(output, expected) -def test_compress_bad_comp(rtype, bad_comp): - from pyreadr import custom_errors - - with tm.ensure_clean("test.out") as path: - with pytest.raises( - custom_errors.PyreadrError, - match=(f"compression {bad_comp} not implemented!"), - ): - ghg_df.to_rdata( - path, - file_format=rtype, - engine="pyreadr", - index=False, - compress=bad_comp, - ) - - def test_compress_zip(rtype): with tm.ensure_clean("test.out") as path: - with pytest.raises(ValueError, match=("not a supported value for compress")): - ghg_df.to_rdata( - path, file_format=rtype, engine="pyreadr", index=False, compress="zip" - ) - - -# OTHER_FRAMES - -# IGNORED OPTION FOR pyreadr ENGINE + with pytest.raises(ValueError, match=("not a supported value for compression")): + ghg_df.to_rdata(path, file_format=rtype, index=False, compression="zip") # RDA_NAMES @@ -597,13 +523,7 @@ def test_compress_zip(rtype): def test_new_rda_name(): with tm.ensure_clean("test.rda") as path: - ghg_df.to_rdata(path, engine="pyreadr", rda_names=["py_df"]) - r_dfs = read_rdata(path, engine="pyreadr") + ghg_df.to_rdata(path, rda_name="py_df") + r_dfs = read_rdata(path) assert "py_df" in list(r_dfs.keys()) - - -def test_type_rda_name(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises(TypeError, match=("not a valid type for rda_names")): - ghg_df.to_rdata(path, engine="rscript", rda_names="py)df") From a5983e006022a4afa17b02f3c604138689c688cc Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 14 Apr 2021 07:08:05 -0500 Subject: [PATCH 7/8] Fix duplicate entry in ci dep yaml --- ci/deps/azure-windows-37.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 2dde2e2892f99..6e7be62cdc56f 100644 --- a/ci/deps/azure-windows-37.yaml +++ 
b/ci/deps/azure-windows-37.yaml @@ -38,7 +38,6 @@ dependencies: - xlwt - pyreadstat - pyreadr - - pyreadr - pip - pip: - pyxlsb From e78bf6ed22dfa5804b2ee9fc85b620ffc97103ba Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 15 Apr 2021 22:29:00 -0500 Subject: [PATCH 8/8] Refactor to handle binary content, add datetime notes in docs --- doc/source/user_guide/io.rst | 48 +++++++++++--------------- pandas/io/rdata.py | 48 ++++++++++++++++---------- pandas/tests/io/data/rdata/ppm_df.rds | Bin 0 -> 14315 bytes pandas/tests/io/test_rdata.py | 4 +-- 4 files changed, 52 insertions(+), 48 deletions(-) create mode 100644 pandas/tests/io/data/rdata/ppm_df.rds diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 0023f70e699bd..f4bbde8efcd92 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -6012,11 +6012,11 @@ To read from a file-like object, read object in argument, ``path_or_buffer``: .. ipython:: python - rds_file = os.path.join(file_path, "sea_ice_df.rds") + rds_file = os.path.join(file_path, "plants_df.rds") with open(rds_file, "rb") as f: - sea_ice_df = pd.read_rdata(f.read(), file_format="rds") + plants_df = pd.read_rdata(f.read(), file_format="rds") - sea_ice_df + plants_df To read from URL, pass link directly into method: @@ -6035,8 +6035,7 @@ another issue. Any R data encoded in non utf-8 is currently not supported: In [608]: ghcran = pd.read_rdata("s3://public-r-data/ghcran.Rdata") ... - UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 45: -invalid continuation byte + UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 45: invalid continuation byte Also, remember if R data files do not contain any data frame object, a parsing error will occur: @@ -6050,6 +6049,17 @@ will occur: .. _io.rdata_writer: +Please note R's ``Date`` (without time component) will translate to ``object`` type +in pandas. Also, R's date/time field type, ``POSIXct``, will translate to UTC time +in pandas. + +.. ipython:: python + + ppm_df = pd.read_rdata(os.path.join(file_path, "ppm_df.rds")) + ppm_df.head() + ppm_df.tail() + ppm_df.dtypes + Writing R data '''''''''''''' @@ -6069,7 +6079,7 @@ and optionally give it a name: .. ipython:: python - plants_df.to_rdata("plants_df.rda", rda_name="plants_df") + ghg_df.to_rdata("ghg_df.rda", rda_name="ghg_df") While RData and rda types can hold multiple R objects, this method currently only supports writing out a single DataFrame. @@ -6079,7 +6089,7 @@ Even write to a buffer and read its content: .. ipython:: python with BytesIO() as b_io: - sea_ice_df.to_rdata(b_io, file_format="rda", index=False) + env_dfs["sea_ice_df"].to_rdata(b_io, file_format="rda", index=False) print( pd.read_rdata( b_io.getvalue(), @@ -6134,8 +6144,8 @@ Like other IO methods, ``storage_options`` are enabled to write to those platfor :suppress: os.remove("ghg_df.rds") + os.remove("ghg_df.rda") os.remove("plants_df.rds") - os.remove("plants_df.rda") os.remove("plants_df_gz.rds") os.remove("plants_df_bz2.rds") os.remove("plants_df_xz.rds") @@ -6147,18 +6157,7 @@ loaded in R: .. 
code-block:: r

     plants_df <- readRDS("plants_df.rds")
-    tail(plants_df, 5)
-      plant_group              status count
-    16 Pteridophytes      Data Deficient   398
-    17 Pteridophytes             Extinct    65
-    18 Pteridophytes      Not Threatened  1294
-    19 Pteridophytes Possibly Threatened   408
-    20 Pteridophytes          Threatened  1275
-
-
-    load("env_dfs.rda")
-    eapply(.GlobalEnv, tail, 5)
-    $plants_df
+    plants_df
       plant_group              status count
     16 Pteridophytes      Data Deficient   398
     17 Pteridophytes             Extinct    65
     18 Pteridophytes      Not Threatened  1294
     19 Pteridophytes Possibly Threatened   408
     20 Pteridophytes          Threatened  1275
 
-    $sea_ice_df
-         year mo data.type region extent area
-    1012 2016 12   Goddard      S   8.28 5.51
-    1013 2017 12   Goddard      S   9.48 6.23
-    1014 2018 12   Goddard      S   9.19 5.59
-    1015 2019 12   Goddard      S   9.41 6.59
-    1016 2020 12   NRTSI-G      S  10.44 6.50
+    load("ghg_df.rda")
+    mget(ls())
 
     $ghg_df
                    gas year emissions
     141    Carbon dioxide 2018 5424.8815
     142           Methane 2018  634.4571
     143     Nitrous oxide 2018  434.5286
     144 Fluorinated gases 2018  182.7824
     145             Total 2018 6676.6496
 
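For reviewers who want to verify the same round trip from the Python side, here is a
minimal sketch using only the API added in this series; the file name and sample
frame are illustrative, and ``pandas`` is assumed to include this patch:

.. code-block:: python

    import pandas as pd
    import pandas._testing as tm

    ghg_df = pd.DataFrame({"gas": ["Carbon dioxide"], "emissions": [5424.88]})

    # write a single DataFrame under a custom name, then read it back;
    # rda files return a dict of DataFrames keyed by R object name
    ghg_df.to_rdata("ghg_df.rda", rda_name="ghg_df", index=False)
    r_dfs = pd.read_rdata("ghg_df.rda", rownames=False)

    tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"])
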
diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py
index 91852f5bd281a..4114b6d1f8349 100644
--- a/pandas/io/rdata.py
+++ b/pandas/io/rdata.py
@@ -1,4 +1,3 @@
-from datetime import datetime
 import io
 import os
 from tempfile import TemporaryDirectory
@@ -52,7 +51,7 @@ def read_rdata(
         Any valid file path is acceptable. The string could be a URL. Valid
         URL schemes include http, ftp, s3, and file.
 
-    file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer'
+    file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer'
         R serialization type as output from R's base::save or base::saveRDS
         commands. Default 'infer' will use extension in file name to
         determine the format type.
@@ -269,7 +268,7 @@ class _RDataReader:
         Any valid string path is acceptable. The string could be a URL. Valid
         URL schemes include http, ftp, s3, and file.
 
-    file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer'
+    file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer'
         R serialization type.
 
     select_frames : list, default None
@@ -318,7 +317,13 @@ def verify_params(self) -> None:
         and raise appropriate errors.
         """
 
+        path_ext: Optional[str] = (
+            os.path.splitext(self.path_or_buffer.lower())[1][1:]
+            if isinstance(self.path_or_buffer, str)
+            else None
+        )
+
         if self.file_format not in ["infer", "rdata", "rda", "rds"]:
             raise ValueError(
                 f"'{self.file_format}' is not a valid value for file_format"
             )
@@ -326,15 +331,15 @@ def verify_params(self) -> None:
         if (
             self.file_format == "infer"
             and isinstance(self.path_or_buffer, str)
-            and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds"))
+            and path_ext not in ["rdata", "rda", "rds"]
         ) or (self.file_format == "infer" and not isinstance(self.path_or_buffer, str)):
             raise ValueError(
                 f"Unable to infer file format from file name: {self.path_or_buffer}. "
                 "Please use known R data type (rdata, rda, rds)."
             )
 
-        if self.file_format == "infer":
-            self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:]
+        if self.file_format == "infer" and isinstance(path_ext, str):
+            self.file_format = path_ext
 
         if self.select_frames is not None and not is_list_like(self.select_frames):
             raise TypeError(
@@ -360,9 +365,9 @@ def buffer_to_disk(self, tmp_dir: str) -> str:
         )
 
         with _preprocess_data(handle_data) as r_data:
-            mode = "wb" if isinstance(r_data, io.BytesIO) else "w"
-            with open(r_temp, mode) as f:
-                f.write(r_data.read())
+            if isinstance(r_data, io.BytesIO):
+                with open(r_temp, "wb") as f:
+                    f.write(r_data.read())
 
         return r_temp
 
@@ -412,10 +417,9 @@ def handle_rownames(self, df) -> DataFrame:
     def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]:
         from pyreadr import read_r
 
-        tz = datetime.now().astimezone().tzinfo
         with TemporaryDirectory() as tmp_dir:
             r_temp = self.buffer_to_disk(tmp_dir)
-            rdata = read_r(r_temp, use_objects=self.select_frames, timezone=tz)
+            rdata = read_r(r_temp, use_objects=self.select_frames)
 
             rdata = {k: self.handle_rownames(df) for k, df in rdata.items()}
             rdata = rdata[None] if self.file_format == "rds" else dict(rdata)
@@ -432,7 +436,7 @@ class RDataWriter:
     path_or_buffer : a valid str, path object or file-like object
         Any valid string path is acceptable.
 
-    file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer'
+    file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer'
         R serialization type.
 
     rda_name : str, default "pandas_dataframe"
@@ -487,7 +491,13 @@ def verify_params(self) -> None:
         and raise appropriate errors.
         """
 
+        path_ext: Optional[str] = (
+            os.path.splitext(self.path_or_buffer.lower())[1][1:]
+            if isinstance(self.path_or_buffer, str)
+            else None
+        )
+
         if self.file_format not in ["infer", "rdata", "rda", "rds"]:
             raise ValueError(
                 f"{self.file_format} is not a valid value for file_format."
             )
 
         if (
             self.file_format == "infer"
             and isinstance(self.path_or_buffer, str)
-            and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds"))
+            and path_ext not in ["rdata", "rda", "rds"]
         ):
             raise ValueError(
                 f"Unable to infer file format from file name: {self.path_or_buffer}"
-                "Please use known R data type (.rda, .rdata, .rds)."
+                ". Please use known R data type (rdata, rda, rds)."
             )
 
-        if self.file_format == "infer" and isinstance(self.path_or_buffer, str):
-            self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:]
+        if self.file_format == "infer" and isinstance(path_ext, str):
+            self.file_format = path_ext
 
         if self.compression is not None and self.compression not in [
             "gzip",
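One piece of the refactor above is the shared ``path_ext`` inference in the reader
and writer. A standalone sketch of the same rule may help review; it mirrors the
patched logic, assuming only the standard library and that non-string buffers can
never carry an extension:

.. code-block:: python

    import os
    from typing import Optional

    def infer_r_format(path_or_buffer) -> Optional[str]:
        # mirror of verify_params above: lowercase the path, split off the
        # extension, and drop the leading dot; buffers cannot be inferred
        if not isinstance(path_or_buffer, str):
            return None
        ext = os.path.splitext(path_or_buffer.lower())[1][1:]
        return ext if ext in ["rdata", "rda", "rds"] else None

    print(infer_r_format("data/ghg_df.RDS"))   # rds
    print(infer_r_format("wine.RData"))        # rdata
    print(infer_r_format(b"binary content"))   # None; caller must set file_format
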
@@ -495,15 +505,15 @@ def verify_params(self) -> None:
         if (
             self.file_format == "infer"
             and isinstance(self.path_or_buffer, str)
-            and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds"))
+            and path_ext not in ["rdata", "rda", "rds"]
         ):
             raise ValueError(
                 f"Unable to infer file format from file name: {self.path_or_buffer}"
-                "Please use known R data type (.rda, .rdata, .rds)."
+                ". Please use known R data type (rdata, rda, rds)."
             )

-        if self.file_format == "infer" and isinstance(self.path_or_buffer, str):
-            self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:]
+        if self.file_format == "infer" and isinstance(path_ext, str):
+            self.file_format = path_ext

         if self.compression is not None and self.compression not in [
             "gzip",

diff --git a/pandas/tests/io/data/rdata/ppm_df.rds b/pandas/tests/io/data/rdata/ppm_df.rds
new file mode 100644
index 0000000000000000000000000000000000000000..242a3e2b112367d0b7adfaa29679057819489cfb
GIT binary patch
literal 14315
[base85-encoded binary payload of ppm_df.rds omitted]
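Per the ``RDataWriter`` docstring above, the proposed writer would be called as follows. This is a sketch of the API added by this patch, not released pandas, and the file names are illustrative:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"plant_group": ["Pteridophytes"], "count": [1275]})

    # file_format inferred from the .rds extension
    df.to_rdata("plants_df.rds")

    # .rda stores a named object, hence rda_name; compression must be one
    # of the values verify_params accepts (e.g. "gzip")
    df.to_rdata("plants_df.rda", rda_name="plants_df", compression="gzip")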
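The reader counterpart, again sketched from this patch's docstrings: an .rds file yields a single ``DataFrame``, an .rda file a ``dict`` keyed by object name, and ``select_frames`` limits which objects are parsed. The ``r_blob`` path below is a hypothetical extension-less file:

.. code-block:: python

    import pandas as pd

    plants_df = pd.read_rdata("plants_df.rds")  # single DataFrame

    # dict of DataFrames, restricted to the named objects
    frames = pd.read_rdata("env_dfs.rda", select_frames=["ghg_df"])
    ghg_df = frames["ghg_df"]

    # when the extension is missing, pass file_format explicitly;
    # otherwise verify_params raises ValueError
    df = pd.read_rdata("r_blob", file_format="rds")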