From 2442fda56de896b2fabee4fe82fbc573e39d58b9 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Wed, 3 Apr 2024 10:10:13 +0800
Subject: [PATCH 1/3] Session.virtualfile_to_dataset: Add 'strings' output type
 for the array of trailing texts

---
 pygmt/clib/session.py      | 29 +++++++++++++++++++++++------
 pygmt/datatypes/dataset.py | 31 ++++++++++++++++++++-----------
 2 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/pygmt/clib/session.py b/pygmt/clib/session.py
index a9ea8c65f27..3616ca56d48 100644
--- a/pygmt/clib/session.py
+++ b/pygmt/clib/session.py
@@ -1775,7 +1775,7 @@ def read_virtualfile(
     def virtualfile_to_dataset(
         self,
         vfname: str,
-        output_type: Literal["pandas", "numpy", "file"] = "pandas",
+        output_type: Literal["pandas", "numpy", "file", "strings"] = "pandas",
         column_names: list[str] | None = None,
         dtype: type | dict[str, type] | None = None,
         index_col: str | int | None = None,
@@ -1796,6 +1796,7 @@ def virtualfile_to_dataset(
             - ``"pandas"`` will return a :class:`pandas.DataFrame` object.
             - ``"numpy"`` will return a :class:`numpy.ndarray` object.
             - ``"file"`` means the result was saved to a file and will return ``None``.
+            - ``"strings"`` will return the trailing text only as a list of strings.
         column_names
             The column names for the :class:`pandas.DataFrame` output.
         dtype
@@ -1841,6 +1842,16 @@ def virtualfile_to_dataset(
         ...                 assert result is None
         ...                 assert Path(outtmp.name).stat().st_size > 0
         ...
+        ...     # strings output
+        ...     with Session() as lib:
+        ...         with lib.virtualfile_out(kind="dataset") as vouttbl:
+        ...             lib.call_module("read", f"{tmpfile.name} {vouttbl} -Td")
+        ...             outstr = lib.virtualfile_to_dataset(
+        ...                 vfname=vouttbl, output_type="strings"
+        ...             )
+        ...     assert isinstance(outstr, np.ndarray)
+        ...     assert outstr.dtype.kind in ("S", "U")
+        ...
         ...     # numpy output
         ...     with Session() as lib:
         ...         with lib.virtualfile_out(kind="dataset") as vouttbl:
@@ -1869,6 +1880,9 @@ def virtualfile_to_dataset(
         ...                 column_names=["col1", "col2", "col3", "coltext"],
         ...             )
         ...     assert isinstance(outpd2, pd.DataFrame)
+        >>> outstr
+        array(['TEXT1 TEXT23', 'TEXT4 TEXT567', 'TEXT8 TEXT90',
+               'TEXT123 TEXT456789'], dtype='<U18')
         >>> outnp
         array([[1.0, 2.0, 3.0, 'TEXT1 TEXT23'],
                [4.0, 5.0, 6.0, 'TEXT4 TEXT567'],
@@ -1890,11 +1904,14 @@ def virtualfile_to_dataset(
         if output_type == "file":  # Already written to file, so return None
             return None
 
-        # Read the virtual file as a GMT dataset and convert to pandas.DataFrame
-        result = self.read_virtualfile(vfname, kind="dataset").contents.to_dataframe(
-            column_names=column_names,
-            dtype=dtype,
-            index_col=index_col,
+        # Read the virtual file as a GMT dataset
+        result = self.read_virtualfile(vfname, kind="dataset").contents
+
+        if output_type == "strings":  # strings output
+            return result.to_strings()
+
+        result = result.to_dataframe(
+            column_names=column_names, dtype=dtype, index_col=index_col
         )
         if output_type == "numpy":  # numpy.ndarray output
             return result.to_numpy()
diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index 7a61b7f3d91..a14e6af495c 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -144,6 +144,19 @@ class _GMT_DATASEGMENT(ctp.Structure):  # noqa: N801
         ("hidden", ctp.c_void_p),
     ]
 
+    def to_strings(self) -> np.ndarray[Any, np.dtype[np.str_]]:
+        """
+        Convert the trailing text column to an array of strings.
+        """
+        textvector = []
+        for itbl in range(self.n_tables):
+            dtbl = self.table[itbl].contents
+            for iseg in range(dtbl.n_segments):
+                dseg = dtbl.segment[iseg].contents
+                if dseg.text:
+                    textvector.extend(dseg.text[: dseg.n_rows])
+        return np.char.decode(textvector) if textvector else np.array([], dtype=str)
+
     def to_dataframe(
         self,
         column_names: pd.Index | None = None,
@@ -194,7 +207,11 @@ def to_dataframe(
         ...         with lib.virtualfile_out(kind="dataset") as vouttbl:
         ...             lib.call_module("read", f"{tmpfile.name} {vouttbl} -Td")
         ...             ds = lib.read_virtualfile(vouttbl, kind="dataset")
+        ...             text = ds.contents.to_strings()
         ...             df = ds.contents.to_dataframe()
+        >>> text
+        array(['TEXT1 TEXT23', 'TEXT4 TEXT567', 'TEXT8 TEXT90',
+               'TEXT123 TEXT456789'], dtype='<U18')
         >>> df
               0     1     2                   3
         0   1.0   2.0   3.0        TEXT1 TEXT23
@@ -218,17 +235,9 @@ def to_dataframe(
             vectors.append(pd.Series(data=np.concatenate(colvector)))
 
         # Deal with trailing text column
-        textvector = []
-        for itbl in range(self.n_tables):
-            dtbl = self.table[itbl].contents
-            for iseg in range(dtbl.n_segments):
-                dseg = dtbl.segment[iseg].contents
-                if dseg.text:
-                    textvector.extend(dseg.text[: dseg.n_rows])
-        if textvector:
-            vectors.append(
-                pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype())
-            )
+        textvector = self.to_strings()
+        if len(textvector) != 0:
+            vectors.append(pd.Series(data=textvector, dtype=pd.StringDtype()))
 
         if len(vectors) == 0:
             # Return an empty DataFrame if no columns are found.

From bafe0f30b4b8ac1cf5d4e4d8daa775b0c592fd15 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Wed, 3 Apr 2024 12:41:43 +0800
Subject: [PATCH 2/3] Apply suggestions from code review

---
 pygmt/clib/session.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pygmt/clib/session.py b/pygmt/clib/session.py
index 3616ca56d48..eedb4572992 100644
--- a/pygmt/clib/session.py
+++ b/pygmt/clib/session.py
@@ -1796,7 +1796,7 @@ def virtualfile_to_dataset(
             - ``"pandas"`` will return a :class:`pandas.DataFrame` object.
             - ``"numpy"`` will return a :class:`numpy.ndarray` object.
             - ``"file"`` means the result was saved to a file and will return ``None``.
-            - ``"strings"`` will return the trailing text only as a list of strings.
+            - ``"strings"`` will return the trailing text only as an array of strings.
         column_names
             The column names for the :class:`pandas.DataFrame` output.
         dtype
@@ -1904,7 +1904,7 @@ def virtualfile_to_dataset(
         if output_type == "file":  # Already written to file, so return None
             return None
 
-        # Read the virtual file as a GMT dataset
+        # Read the virtual file as a _GMT_DATASET object
         result = self.read_virtualfile(vfname, kind="dataset").contents
 
         if output_type == "strings":  # strings output

From e996f56a0037e7961346de4ec5746fca0e7df927 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Wed, 3 Apr 2024 20:12:45 +0800
Subject: [PATCH 3/3] Simplify nested for loops

---
 pygmt/datatypes/dataset.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index a14e6af495c..7d0b1d469db 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -149,12 +149,10 @@ def to_strings(self) -> np.ndarray[Any, np.dtype[np.str_]]:
         Convert the trailing text column to an array of strings.
         """
         textvector = []
-        for itbl in range(self.n_tables):
-            dtbl = self.table[itbl].contents
-            for iseg in range(dtbl.n_segments):
-                dseg = dtbl.segment[iseg].contents
-                if dseg.text:
-                    textvector.extend(dseg.text[: dseg.n_rows])
+        for table in self.table[: self.n_tables]:
+            for segment in table.contents.segment[: table.contents.n_segments]:
+                if segment.contents.text:
+                    textvector.extend(segment.contents.text[: segment.contents.n_rows])
         return np.char.decode(textvector) if textvector else np.array([], dtype=str)
 
     def to_dataframe(
@@ -224,14 +222,13 @@ def to_dataframe(
         vectors = []
         # Deal with numeric columns
         for icol in range(self.n_columns):
-            colvector = []
-            for itbl in range(self.n_tables):
-                dtbl = self.table[itbl].contents
-                for iseg in range(dtbl.n_segments):
-                    dseg = dtbl.segment[iseg].contents
-                    colvector.append(
-                        np.ctypeslib.as_array(dseg.data[icol], shape=(dseg.n_rows,))
-                    )
+            colvector = [
+                np.ctypeslib.as_array(
+                    seg.contents.data[icol], shape=(seg.contents.n_rows,)
+                )
+                for tbl in self.table[: self.n_tables]
+                for seg in tbl.contents.segment[: tbl.contents.n_segments]
+            ]
             vectors.append(pd.Series(data=np.concatenate(colvector)))
 
         # Deal with trailing text column