From 2442fda56de896b2fabee4fe82fbc573e39d58b9 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Wed, 3 Apr 2024 10:10:13 +0800 Subject: [PATCH 1/3] Session.virtualfile_to_dataset: Add 'strings' output type for the array of trailing texts --- pygmt/clib/session.py | 29 +++++++++++++++++++++++------ pygmt/datatypes/dataset.py | 31 ++++++++++++++++++++----------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/pygmt/clib/session.py b/pygmt/clib/session.py index a9ea8c65f27..3616ca56d48 100644 --- a/pygmt/clib/session.py +++ b/pygmt/clib/session.py @@ -1775,7 +1775,7 @@ def read_virtualfile( def virtualfile_to_dataset( self, vfname: str, - output_type: Literal["pandas", "numpy", "file"] = "pandas", + output_type: Literal["pandas", "numpy", "file", "strings"] = "pandas", column_names: list[str] | None = None, dtype: type | dict[str, type] | None = None, index_col: str | int | None = None, @@ -1796,6 +1796,7 @@ def virtualfile_to_dataset( - ``"pandas"`` will return a :class:`pandas.DataFrame` object. - ``"numpy"`` will return a :class:`numpy.ndarray` object. - ``"file"`` means the result was saved to a file and will return ``None``. + - ``"strings"`` will return the trailing text only as a list of strings. column_names The column names for the :class:`pandas.DataFrame` output. dtype @@ -1841,6 +1842,16 @@ def virtualfile_to_dataset( ... assert result is None ... assert Path(outtmp.name).stat().st_size > 0 ... + ... # strings output + ... with Session() as lib: + ... with lib.virtualfile_out(kind="dataset") as vouttbl: + ... lib.call_module("read", f"{tmpfile.name} {vouttbl} -Td") + ... outstr = lib.virtualfile_to_dataset( + ... vfname=vouttbl, output_type="strings" + ... ) + ... assert isinstance(outstr, np.ndarray) + ... assert outstr.dtype.kind in ("S", "U") + ... ... # numpy output ... with Session() as lib: ... with lib.virtualfile_out(kind="dataset") as vouttbl: @@ -1869,6 +1880,9 @@ def virtualfile_to_dataset( ... 
column_names=["col1", "col2", "col3", "coltext"], ... ) ... assert isinstance(outpd2, pd.DataFrame) + >>> outstr + array(['TEXT1 TEXT23', 'TEXT4 TEXT567', 'TEXT8 TEXT90', + 'TEXT123 TEXT456789'], dtype='<U18') >>> outnp array([[1.0, 2.0, 3.0, 'TEXT1 TEXT23'], [4.0, 5.0, 6.0, 'TEXT4 TEXT567'], @@ -1890,11 +1904,14 @@ def virtualfile_to_dataset( if output_type == "file": # Already written to file, so return None return None - # Read the virtual file as a GMT dataset and convert to pandas.DataFrame - result = self.read_virtualfile(vfname, kind="dataset").contents.to_dataframe( - column_names=column_names, - dtype=dtype, - index_col=index_col, + # Read the virtual file as a GMT dataset + result = self.read_virtualfile(vfname, kind="dataset").contents + + if output_type == "strings": # strings output + return result.to_strings() + + result = result.to_dataframe( + column_names=column_names, dtype=dtype, index_col=index_col ) if output_type == "numpy": # numpy.ndarray output return result.to_numpy() diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index 7a61b7f3d91..a14e6af495c 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -144,6 +144,19 @@ class _GMT_DATASEGMENT(ctp.Structure): # noqa: N801 ("hidden", ctp.c_void_p), ] + def to_strings(self) -> np.ndarray[Any, np.dtype[np.str_]]: + """ + Convert the trailing text column to an array of strings. + """ + textvector = [] + for itbl in range(self.n_tables): + dtbl = self.table[itbl].contents + for iseg in range(dtbl.n_segments): + dseg = dtbl.segment[iseg].contents + if dseg.text: + textvector.extend(dseg.text[: dseg.n_rows]) + return np.char.decode(textvector) if textvector else np.array([], dtype=str) + def to_dataframe( self, column_names: pd.Index | None = None, @@ -194,7 +207,11 @@ def to_dataframe( ... with lib.virtualfile_out(kind="dataset") as vouttbl: ... lib.call_module("read", f"{tmpfile.name} {vouttbl} -Td") ... ds = lib.read_virtualfile(vouttbl, kind="dataset") + ... 
text = ds.contents.to_strings() ... df = ds.contents.to_dataframe() + >>> text + array(['TEXT1 TEXT23', 'TEXT4 TEXT567', 'TEXT8 TEXT90', + 'TEXT123 TEXT456789'], dtype='<U18') >>> df 0 1 2 3 0 1.0 2.0 3.0 TEXT1 TEXT23 @@ -218,17 +235,9 @@ def to_dataframe( vectors.append(pd.Series(data=np.concatenate(colvector))) # Deal with trailing text column - textvector = [] - for itbl in range(self.n_tables): - dtbl = self.table[itbl].contents - for iseg in range(dtbl.n_segments): - dseg = dtbl.segment[iseg].contents - if dseg.text: - textvector.extend(dseg.text[: dseg.n_rows]) - if textvector: - vectors.append( - pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype()) - ) + textvector = self.to_strings() + if len(textvector) != 0: + vectors.append(pd.Series(data=textvector, dtype=pd.StringDtype())) if len(vectors) == 0: # Return an empty DataFrame if no columns are found. From bafe0f30b4b8ac1cf5d4e4d8daa775b0c592fd15 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Wed, 3 Apr 2024 12:41:43 +0800 Subject: [PATCH 2/3] Apply suggestions from code review --- pygmt/clib/session.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygmt/clib/session.py b/pygmt/clib/session.py index 3616ca56d48..eedb4572992 100644 --- a/pygmt/clib/session.py +++ b/pygmt/clib/session.py @@ -1796,7 +1796,7 @@ def virtualfile_to_dataset( - ``"pandas"`` will return a :class:`pandas.DataFrame` object. - ``"numpy"`` will return a :class:`numpy.ndarray` object. - ``"file"`` means the result was saved to a file and will return ``None``. - - ``"strings"`` will return the trailing text only as a list of strings. + - ``"strings"`` will return the trailing text only as an array of strings. column_names The column names for the :class:`pandas.DataFrame` output. 
dtype @@ -1904,7 +1904,7 @@ def virtualfile_to_dataset( if output_type == "file": # Already written to file, so return None return None - # Read the virtual file as a GMT dataset + # Read the virtual file as a _GMT_DATASET object result = self.read_virtualfile(vfname, kind="dataset").contents if output_type == "strings": # strings output From e996f56a0037e7961346de4ec5746fca0e7df927 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Wed, 3 Apr 2024 20:12:45 +0800 Subject: [PATCH 3/3] Simplify nested for loops --- pygmt/datatypes/dataset.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index a14e6af495c..7d0b1d469db 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -149,12 +149,10 @@ def to_strings(self) -> np.ndarray[Any, np.dtype[np.str_]]: Convert the trailing text column to an array of strings. """ textvector = [] - for itbl in range(self.n_tables): - dtbl = self.table[itbl].contents - for iseg in range(dtbl.n_segments): - dseg = dtbl.segment[iseg].contents - if dseg.text: - textvector.extend(dseg.text[: dseg.n_rows]) + for table in self.table[: self.n_tables]: + for segment in table.contents.segment[: table.contents.n_segments]: + if segment.contents.text: + textvector.extend(segment.contents.text[: segment.contents.n_rows]) return np.char.decode(textvector) if textvector else np.array([], dtype=str) def to_dataframe( @@ -224,14 +222,13 @@ def to_dataframe( vectors = [] # Deal with numeric columns for icol in range(self.n_columns): - colvector = [] - for itbl in range(self.n_tables): - dtbl = self.table[itbl].contents - for iseg in range(dtbl.n_segments): - dseg = dtbl.segment[iseg].contents - colvector.append( - np.ctypeslib.as_array(dseg.data[icol], shape=(dseg.n_rows,)) - ) + colvector = [ + np.ctypeslib.as_array( + seg.contents.data[icol], shape=(seg.contents.n_rows,) + ) + for tbl in self.table[: self.n_tables] + for seg in 
tbl.contents.segment[: tbl.contents.n_segments] + ] vectors.append(pd.Series(data=np.concatenate(colvector))) # Deal with trailing text column