
Implementation of ParquetFile.write_row_groups() & ParquetFile._sort_part_names() #712

Merged: 23 commits, Jan 10, 2022

Changes from 1 commit

Commits (23)
593057c
Re-ordering of 'write' to separate 'append' part and make it availabl…
yohplala Nov 21, 2021
1d24a2b
Reverting formatting changes.
yohplala Dec 3, 2021
6d79021
Reverting some other formatting changes.
yohplala Dec 3, 2021
ea8d8fe
Reverting further formatting changes.
yohplala Dec 3, 2021
2ffc94d
Rephrasing docstring of 'reset_row_idx()' + 'stabilizing' test case '…
yohplala Dec 3, 2021
822fb11
Renaming 'append_as_row_groups()' into 'write_row_groups()' + Pending…
yohplala Dec 8, 2021
b41bd91
Fix conflicts.
yohplala Dec 8, 2021
bc30ba4
Refactoring of 'overwrite' mode in 'write': source has been extracted…
yohplala Dec 12, 2021
3e70a16
'_sort_part_names()' implemented and tested.
yohplala Dec 13, 2021
61b7c8b
'update()' test case with file name shuffling.
yohplala Dec 14, 2021
714d116
Restore 'find_max_part()' in 'writer.py'.
yohplala Dec 14, 2021
550ec06
Same as previous commit, except is working now.
yohplala Dec 14, 2021
511f218
Fix compatibility with python 3.7.
yohplala Dec 14, 2021
b7d88ab
Superthrift conflict fixing - work-in-progress.
yohplala Dec 15, 2021
2307c09
'pf.fmd.row_groups[0].columns[0].file_path' is kept as bytes whenever…
yohplala Dec 15, 2021
c900f3b
Forgotten commented out lines. Now removed.
yohplala Dec 15, 2021
3b29e3a
Fix Dask test.
yohplala Dec 15, 2021
357c507
Fix 2 Dask test.
yohplala Dec 15, 2021
4e7378e
Fix 3 Dask test.
yohplala Dec 15, 2021
786101c
'pf.fmd.row_groups[].columns[].file_path' is string, whatever, whenev…
yohplala Dec 16, 2021
d7140e2
Revert some formatting changes + adds links to some discussions in repo.
yohplala Dec 29, 2021
2bb1a87
Merge branch 'main' of https://github.com/dask/fastparquet into appen…
yohplala Jan 3, 2022
aa44f48
Removed 'rename' parameter + revert 'file_path.decode()' for all colu…
yohplala Jan 3, 2022
101 changes: 93 additions & 8 deletions fastparquet/api.py
@@ -12,9 +12,10 @@
from .core import read_thrift
from .thrift_structures import parquet_thrift
from . import core, schema, converted_types, encoding, dataframe
from .util import (default_open, default_remove, ParquetException, val_to_num, ops,
ensure_bytes, check_column_names, metadata_from_many,
ex_from_sep, json_decoder, _strip_path_tail)
from .util import (default_open, default_remove, ParquetException, val_to_num,
ops, ensure_bytes, check_column_names, metadata_from_many,
ex_from_sep, json_decoder, _strip_path_tail,
reset_row_idx)


class ParquetFile(object):
@@ -342,7 +343,7 @@ def iter_row_groups(self, filters=None, **kwargs):
if not df.empty:
yield df

def remove_row_groups(self, rgs, write_fmd:bool = True,
def remove_row_groups(self, rgs, write_fmd:bool=True,
open_with=default_open, remove_with=None):
"""
Remove list of row groups from disk. `ParquetFile` metadata are
@@ -372,11 +373,11 @@ def remove_row_groups(self, rgs, write_fmd:bool = True,
remove_with = default_remove
if not isinstance(rgs, list):
rgs = [rgs]
rgs_to_remove = row_groups_map(rgs)
rgs_to_remove = map_row_groups(rgs)
if "fastparquet" not in self.created_by or self.file_scheme=='flat':
# Check if some files contain row groups both to be removed and to
# be kept.
all_rgs = row_groups_map(self.row_groups)
all_rgs = map_row_groups(self.row_groups)
for file in rgs_to_remove:
if len(rgs_to_remove[file]) < len(all_rgs[file]):
raise ValueError(f'File {file} contains row groups both \
@@ -391,10 +392,94 @@
except IOError:
pass
self._set_attrs()

if write_fmd:
self._write_common_metadata(open_with)

def append_as_row_groups(self, data, row_group_offsets=None,
compression=None, write_fmd:bool=True,
open_with=default_open, mkdirs=None, append=True,
stats=True):
"""
Append data as new row groups to disk. Updated `ParquetFile` metadata
Member:

Top of docstring should be kept to one line.

Member:

Reminds me: how is this method different from write(..., append=True)?

Author (@yohplala, Dec 3, 2021):
What append_as_row_groups does over write:

  • it is used on an already instantiated ParquetFile (if the instance is already loaded, you don't re-load it, as opposed to write);
  • there are fewer checks on the ParquetFile instance and it requires fewer parameters, as some of them are taken directly from the instance attributes (partition_on for instance);
  • it leaves the option to write or not write common metadata to disk (write_fmd=False);
  • it 'returns' the modified ParquetFile instance (allowing further modifications).

After this PR, I would like to prepare a new one containing documentation and an illustrative test case.
This will hopefully better illustrate the value of completing the set of utilities to manage row groups of a ParquetFile instance.

This documentation would be a kind of 'tutorial' / example about how to use these functions together to update a parquet dataset.

Assuming:

  • the user defines a way to identify overlaps between row groups.
  • the user defines a way to identify duplicates between rows.

Then the following update methodology with fastparquet is possible:

# Retrieve overlaps between existing row group and new data.
rgs_overlaps = filter_row_groups()

# Concat overlapping row groups with new data, and drop duplicates in rows.
df_to_add = pd.concat([new_data, pf[rgs_overlaps].to_pandas()]).drop_duplicates()

# Remove overlaps from disk.
pf.remove_row_groups(rgs_overlaps, write_fmd=False)

# Add new data, along with data from overlapping row groups.
pf.append_as_row_groups(df_to_add, write_fmd=False)

# Sort row groups if relevant.
pf.fmd.row_groups = sorted(pf.row_groups, key=...)

# Finally write common metadata.
pf._write_common_metadata()

I would like to provide 2 application test cases:

  • one with overlaps defined by partition. For instance:
    filter_row_groups(pf, filter=[('name', 'in', ['Yoh', 'Fred'])])
  • the other one with overlaps defined by an ordered column, for instance a time series:
    filter_row_groups(pf, filter=[('timestamp', '>=', pd.Timestamp('2021/01/01')), ('timestamp', '<=', pd.Timestamp('2021/01/04'))])

and then roll out the procedure proposed above (see the sketch just below).
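
For concreteness, here is a minimal, self-contained sketch of this procedure on a tiny time-series dataset (the path, the values and the way overlapping row groups are identified are illustrative; a helper such as filter_row_groups would normally do that selection):

import pandas as pd
from fastparquet import write, ParquetFile

# Tiny dataset: 6 daily rows split into 3 row groups ('hive' scheme, one file each).
ts = pd.date_range("2021/01/01", periods=6, freq="D")
write("dataset_parq", pd.DataFrame({"timestamp": ts, "value": range(6)}),
      file_scheme="hive", row_group_offsets=[0, 2, 4])
pf = ParquetFile("dataset_parq")

# New data overlapping the last row group (2021/01/05 onwards).
new_data = pd.DataFrame(
    {"timestamp": pd.date_range("2021/01/05", periods=3, freq="D"),
     "value": [50, 60, 70]})

# Identify overlapping row groups; here we simply know it is the last one.
# In practice this is where a filter_row_groups(pf, ...) call would be used.
rgs_overlaps = pf.row_groups[-1:]
old = pf[-1:].to_pandas()           # row-group subset, as in the sketch above

# Concat overlapping row groups with the new data; new data wins on duplicates.
df_to_add = (pd.concat([new_data, old])
             .drop_duplicates("timestamp", keep="first"))

pf.remove_row_groups(rgs_overlaps, write_fmd=False)   # remove overlaps from disk
pf.append_as_row_groups(df_to_add, write_fmd=False)   # add refreshed data
pf._write_common_metadata()                           # finally write common metadata
print(len(pf.to_pandas()))                            # 7 rows: Jan 1-7, Jan 5-6 refreshed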

Compared to append='overwrite', this approach:

  • pros:
    • does not force the user to use partitions.
    • in case the user is using partitions, it does not limit them to a single row group per partition.
  • cons:
    • requires more knowledge about these different row group utilities (hence I think a documentation page is worthwhile).
    • (similarly) more lines of code.

I introduced this in #676; I am sorry if I am writing a lot of things that are not necessarily very clear.
Near the end of the 1st post, the bullet points briefly summarize the main requirements of this 'generic' update feature. They correspond to the different utilities I have implemented:

  • remove_row_groups,
  • _write_common_metadata,
  • append_as_row_groups.

[Screenshot attached: 2021-12-05 07-46-38]

Author (@yohplala):

(made some updates to complete the above answer)

Author (@yohplala, Dec 7, 2021):
Hi @martindurant
Replying here to: "By the way, is 'insert' possible (i.e., shuffling the other row-groups and filenames as necessary)?"

I rephrased this question in my mind into:
Could your work be extended so as to make insertion of row groups ('shuffling' other row-groups and filenames as necessary)?

To this question, my answer is yes, and I think it is actually a good idea!
It has not been my target so far (as it is possible to do it by using sorted on the row group list directly), but I think doing insertion directly in append_as_row_groups (which I will rename to write_row_groups as per our discussion above) is a neater solution.

I think I would also change the interface of write_row_groups() so that it accepts an iterable of dataframes (e.g. a list or generator of dataframes, each individual dataframe defining a row group) instead of a single dataframe to be split.
For each of these row groups, the list of indexes at which they are to be inserted could optionally be provided (if not provided, the behaviour is the usual append); see the sketch below.
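
For reference, a sketch of the 'manual' equivalent that the pieces of this PR already allow (append, then reorder the row-group list before persisting metadata); the interface proposed here would fold this into a single call. Path and data are illustrative:

import pandas as pd
from fastparquet import write, ParquetFile
from fastparquet.api import statistics

write("insert_parq", pd.DataFrame({"x": [2, 3]}), file_scheme="hive")
pf = ParquetFile("insert_parq")
pf.append_as_row_groups(pd.DataFrame({"x": [0, 1]}), write_fmd=False)

mins = statistics(pf)["min"]["x"]                       # per-row-group minimum of 'x'
order = sorted(range(len(mins)), key=mins.__getitem__)  # desired row-group order
pf.fmd.row_groups = [pf.fmd.row_groups[i] for i in order]
pf._write_common_metadata()                             # persist the reordered list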

With these changes, we can make the existing overwrite feature a separate function, 'external' in the same way as the existing merge, and as efficient as it is now (its current implementation is entangled with write).

As I see it, this will bring us more modular code, easier to read and maintain, so yes, I am keen on investing time in this!
I will work out something in the coming days, and push updates, probably by the end of this week.
Thanks for your constructive insights!
Best,

are written on disk accordingly (optional).

Parameters
----------
data: pandas dataframe
Data to append.
row_group_offsets: int or list of ints
If int, row-groups will be approximately this many rows, rounded
down to make row groups about the same size;
If a list, the explicit index values to start new row groups;
If `None`, set to 50000000.
compression: str, dict, None
compression to apply to each column, e.g. ``GZIP`` or ``SNAPPY`` or
a ``dict`` like ``{"col1": "SNAPPY", "col2": None}`` to specify per
column compression types.
By default, do not compress.
write_fmd: bool, True
Write updated common metadata to disk.
open_with: function
When called with a f(path, mode), returns an open file-like object.
mkdirs: function
When called with a path/URL, creates any necessary directories to
make that location writable, e.g., ``os.makedirs``. This is not
necessary if using the simple file scheme.
append: bool (True) or 'overwrite'
If False, construct data-set from scratch;
If True, add new row-group(s) to existing data-set. In the latter
case, the data-set must exist, and the schema must match the input
data;
If 'overwrite', existing partitions will be replaced in-place where the
new data has any rows within an existing partition. To enable this, in
addition to appending to a partitioned data-set with 'hive' scheme,
row_group_offsets has to be set to `[0]` (meaning a single row group is
written per partition).
stats: True|False|list(str)
Whether to calculate and write summary statistics.
If True (default), do it for every column;
If False, never do;
If a list of str, do it only for those specified columns.
"""
from .writer import write_simple, write_multi
if self._get_index():
# Adjust index of pandas dataframe.
data = reset_row_idx(data)
if (self.file_scheme == 'simple'
or (self.file_scheme == 'empty' and self.fn[-9:] != '_metadata')):
# Case 'simple'.
if sorted(self.columns) != sorted(data.columns):
raise ValueError('File schema is not compatible with '
'existing file schema.')
if append == 'overwrite':
raise ValueError("Not possible to overwrite with simple file \
scheme.")
write_simple(self.fn, data, self.fmd, row_group_offsets,
compression, open_with, append, stats)
else:
# Case 'hive' or 'drill'.
partition_on = list(self.cats)
if append == 'overwrite':
if (row_group_offsets != [0]) and (row_group_offsets != 0):
raise ValueError("When overwriting partitions, writing \
several row groups per partition is not possible. Please, force writing a \
single row group per partition by setting `row_group_offsets=[0]`.")
if not partition_on:
raise ValueError("No partitioning column has been set in \
existing data-set. Overwrite of partitions is not possible.")
exist_rgps = [rg.columns[0].file_path
for rg in self.row_groups]
if len(exist_rgps) > len(_strip_path_tail(exist_rgps)):
# Some row groups are in the same folder (partition).
raise ValueError("Some partition folders contain several \
row groups. This situation is not allowed with use of `append='overwrite'`.")
write_multi(self.basepath, data, self.fmd, row_group_offsets,
compression, self.file_scheme, write_fmd=write_fmd,
open_with=open_with, mkdirs=mkdirs,
partition_on=partition_on, append=append, stats=stats)
self._set_attrs()
return

def _write_common_metadata(self, open_with=default_open):
"""
Write common metadata to disk.
@@ -1224,7 +1309,7 @@ def filter_not_in(values, vmin=None, vmax=None):
return False


def row_groups_map(rgs: list) -> dict:
def map_row_groups(rgs: list) -> dict:
"""
Returns row group lists sorted by parquet files.

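A minimal usage sketch of the append_as_row_groups method added above (illustrative path and data; per the commit list, the method was later renamed write_row_groups):

import pandas as pd
from fastparquet import write, ParquetFile

df = pd.DataFrame({"a": range(6), "b": ["x", "y", "z"] * 2})
write("example_parq", df[:3], file_scheme="hive")   # initial dataset, one row group
pf = ParquetFile("example_parq")
pf.append_as_row_groups(df[3:])                     # adds a row group, updates metadata on disk
print(len(pf.row_groups))                           # 2 row groups after the append
print(len(pf.to_pandas()))                          # 6 rows in total
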
53 changes: 49 additions & 4 deletions fastparquet/test/test_api.py
@@ -18,7 +18,7 @@
import fastparquet
from fastparquet import write, ParquetFile
from fastparquet.api import (statistics, sorted_partitioned_columns, filter_in,
filter_not_in, row_groups_map)
filter_not_in, map_row_groups)
from fastparquet.util import join_path

TEST_DATA = "test-data"
Expand Down Expand Up @@ -1228,7 +1228,7 @@ def test_remove_rgs_partitioned_pyarrow_multi(tempdir):
with pytest.raises(ValueError, match="^File b=hi/a97cc141d16f4014a59e5b234dddf07c.parquet"):
pf.remove_row_groups(pf.row_groups[0])
# Removing all row groups of a same file is ok.
files_rgs = row_groups_map(pf.row_groups) # sort row groups per file
files_rgs = map_row_groups(pf.row_groups) # sort row groups per file
file = list(files_rgs)[0]
pf.remove_row_groups(files_rgs[file])
assert len(pf.row_groups) == 2 # check row group list updated (4 initially)
@@ -1248,10 +1248,55 @@ def test_remove_rgs_simple_merge(tempdir):
with pytest.raises(ValueError, match="^File fn1.parquet"):
pf.remove_row_groups(pf.row_groups[0])
# Removing all row groups of a same file is ok.
files_rgs = row_groups_map(pf.row_groups) # sort row groups per file
files_rgs = map_row_groups(pf.row_groups) # sort row groups per file
file = list(files_rgs)[0]
pf.remove_row_groups(files_rgs[file])
assert len(pf.row_groups) == 2 # check row group list updated (4 initially)
df_ref = pd.DataFrame({'a':range(4), 'b':['lo']*2+['hi']*2})
assert pf.to_pandas().equals(df_ref)



def test_append_rgs_simple(tempdir):
fn = os.path.join(tempdir, 'test.parq')
write(fn, df_remove_rgs[:2], file_scheme='simple')
pf = ParquetFile(fn)
pf.append_as_row_groups(df_remove_rgs[2:])
pf2 = ParquetFile(fn)
assert pf.fmd == pf2.fmd # metadata are updated in-place.
assert pf.to_pandas().equals(df_remove_rgs)


def test_append_rgs_simple_no_index(tempdir):
fn = os.path.join(tempdir, 'test.parq')
df = df_remove_rgs.reset_index(drop=True)
write(fn, df[:2], file_scheme='simple')
pf = ParquetFile(fn)
pf.append_as_row_groups(df[2:])
pf2 = ParquetFile(fn)
assert pf.fmd == pf2.fmd # metadata are updated in-place.
assert pf.to_pandas().equals(df)


def test_append_rgs_hive(tempdir):
dn = os.path.join(tempdir, 'test_parq')
write(dn, df_remove_rgs[:3], file_scheme='hive', row_group_offsets=[0,2])
pf = ParquetFile(dn)
pf.append_as_row_groups(df_remove_rgs[3:], [0, 1])
assert len(pf.row_groups) == 4
pf2 = ParquetFile(dn)
assert pf.fmd == pf2.fmd # metadata are updated in-place.
assert pf.to_pandas().equals(df_remove_rgs)


def test_append_rgs_hive_partitions(tempdir):
dn = os.path.join(tempdir, 'test_parq')
write(dn, df_remove_rgs[:3], file_scheme='hive', row_group_offsets=[0,2],
partition_on=['country'])
pf = ParquetFile(dn)
pf.append_as_row_groups(df_remove_rgs[3:], [0, 1])
assert len(pf.row_groups) == 4
pf2 = ParquetFile(dn)
assert pf.fmd == pf2.fmd # metadata are updated in-place.
df = df_remove_rgs.sort_index()
df['country'] = df['country'].astype('category')
assert pf.to_pandas().sort_index().equals(df)
40 changes: 28 additions & 12 deletions fastparquet/test/test_overwrite.py
@@ -10,16 +10,14 @@


def test_write_with_rgp_by_date_as_index(tempdir):

# Step 1 - Writing of a 1st df, with `row_group_offsets=0`,
# `file_scheme=hive` and `partition_on=['location', 'color`].
df1 = pd.DataFrame({'humidity': [0.3, 0.8, 0.9],
'pressure': [1e5, 1.1e5, 0.95e5],
'location': ['Paris', 'Paris', 'Milan'],
'color': ['red', 'black', 'blue']})
write(tempdir, df1, row_group_offsets=0, file_scheme='hive',
write(tempdir, df1, row_group_offsets=[0], file_scheme='hive',
partition_on=['location', 'color'])

# Step 2 - Overwriting with a 2nd df having overlapping data, in
# 'overwrite' mode:
# `row_group_offsets=0`, `file_scheme=hive`,
@@ -28,10 +26,8 @@ def test_write_with_rgp_by_date_as_index(tempdir):
'pressure': [9e4, 1e5, 1.1e5, 1.1e5, 0.95e5],
'location': ['Milan', 'Paris', 'Paris', 'Paris', 'Paris'],
'color': ['red', 'black', 'black', 'green', 'green' ]})

write(tempdir, df2, row_group_offsets=0, file_scheme='hive', append='overwrite',
write(tempdir, df2, row_group_offsets=[0], file_scheme='hive', append='overwrite',
partition_on=['location', 'color'])

expected = pd.DataFrame({'humidity': [0.9, 0.5, 0.3, 0.4, 0.8, 1.1, 0.3],
'pressure': [9.5e4, 9e4, 1e5, 1.1e5, 1.1e5, 9.5e4, 1e5],
'location': ['Milan', 'Milan', 'Paris', 'Paris', 'Paris', 'Paris', 'Paris'],
@@ -44,18 +40,38 @@ def test_write_with_rgp_by_date_as_index(tempdir):
# df1. Total resulting number of rows is 7.
assert expected.equals(recorded)

def test_several_existing_parts_in_folder_exception(tempdir):

def test_exception_1(tempdir):
df1 = pd.DataFrame({'humidity': [0.3, 0.8, 0.9, 0.7],
'pressure': [1e5, 1.1e5, 0.95e5, 1e5],
'location': ['Paris', 'Paris', 'Milan', 'Paris'],
'exterior': ['yes', 'no', 'yes', 'yes']})

write(tempdir, df1, row_group_offsets = 1, file_scheme='hive',
# Several existing parts in folder exception.
write(tempdir, df1, row_group_offsets=1, file_scheme='hive',
write_index=False, partition_on=['location', 'exterior'])

with pytest.raises(ValueError, match="^Some partition folders"):
write(tempdir, df1, row_group_offsets = 0, file_scheme='hive',
write(tempdir, df1, row_group_offsets=0, file_scheme='hive',
write_index=False, partition_on=['location', 'exterior'],
append='overwrite')
with pytest.raises(ValueError, match="^Some partition folders"):
write(tempdir, df1, row_group_offsets=[0], file_scheme='hive',
write_index=False, partition_on=['location', 'exterior'],
append='overwrite')
# A non-zero row group offset is not accepted.
with pytest.raises(ValueError, match="^When overwriting"):
write(tempdir, df1, row_group_offsets=1, file_scheme='hive',
write_index=False, partition_on=['location', 'exterior'],
append='overwrite')


def test_exception_2(tempdir):
df1 = pd.DataFrame({'humidity': [0.3, 0.8, 0.9, 0.7],
'pressure': [1e5, 1.1e5, 0.95e5, 1e5],
'location': ['Paris', 'Paris', 'Milan', 'Paris'],
'exterior': ['yes', 'no', 'yes', 'yes']})
# No partitions.
write(tempdir, df1, row_group_offsets=1, file_scheme='hive',
write_index=False)
with pytest.raises(ValueError, match="^No partitioning"):
write(tempdir, df1, row_group_offsets=0, file_scheme='hive',
write_index=False, append='overwrite')

52 changes: 52 additions & 0 deletions fastparquet/util.py
@@ -120,6 +120,58 @@ def check_column_names(columns, *args):
"" % (missing, arg, columns))


def reset_row_idx(data: pd.DataFrame) -> pd.DataFrame:
"""
Shift the row index into columns of the DataFrame, so that it can be stored
to parquet.

Parameters
----------
data: pd.DataFrame

Returns
-------
data: pd.DataFrame
"""
if isinstance(data.index, pd.MultiIndex):
    for name, cats, codes in zip(data.index.names, data.index.levels,
                                 data.index.codes):
        data = data.assign(**{name: pd.Categorical.from_codes(codes,
                                                              cats)})
    data = data.reset_index(drop=True)
else:
    data = data.reset_index()
return data
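
A small illustrative check of the single-index branch, assuming the helper is imported from fastparquet.util as in the api.py import hunk above:

import pandas as pd
from fastparquet.util import reset_row_idx

df = pd.DataFrame({"v": [1, 2]}, index=pd.Index(["a", "b"], name="k"))
print(list(reset_row_idx(df).columns))   # ['k', 'v']: the named index is now a column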


def to_rg_offsets(row_group_offset: int, n_rows: int):
"""
Express 'row_group_offset' as a list of row indexes.

Parameters
----------
row_group_offset: int
Row-groups will be approximately this many rows, rounded down to make
row groups about the same size
n_rows: int
Total number of rows in dataset.

Returns
-------
row_group_offsets: List[int]
List of row indexes marking the start of each row group.
"""
# TODO
# Could be extended so that instead of a target number of rows per
# row group, it accepts a target size (MB or GB) per row group.
if not row_group_offset:  # covers 'row_group_offset' equal to 0 (or None).
row_group_offsets = [0]
else:
nparts = max((n_rows - 1) // row_group_offset + 1, 1)
chunksize = max(min((n_rows - 1) // nparts + 1, n_rows), 1)
row_group_offsets = list(range(0, n_rows, chunksize))
return row_group_offsets
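
A quick check of the arithmetic above, following the expressions in to_rg_offsets for n_rows=10 and a target of roughly 4 rows per row group:

n_rows, row_group_offset = 10, 4
nparts = max((n_rows - 1) // row_group_offset + 1, 1)        # 3 parts
chunksize = max(min((n_rows - 1) // nparts + 1, n_rows), 1)  # 4 rows per part
print(list(range(0, n_rows, chunksize)))                     # [0, 4, 8]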


def metadata_from_many(file_list, verify_schema=False, open_with=default_open,
root=False, fs=None):
"""