chore: add ipython startup scripts

tablegpt · Nov 13, 2024 · 99b32d2 · 99b32d2
1 parent 3a64cce
commit 99b32d2
Show file tree

Hide file tree

Showing 5 changed files with 179 additions and 0 deletions.
diff --git a/ipython/README.md b/ipython/README.md
@@ -0,0 +1,33 @@
+# TableGPT IPython Kernel
+
+This kernel is used to execute code generated by `tablegpt-agent` and has been equipped with data analysis and Chinese font support.
+
+## Installation
+
+You can install the dependencies required for the `IPython Kernel` using the following command:
+
+```sh
+pip install -r requirements.txt
+```
+
+## Usage
+
+If you're using the local executor (`pybox.LocalPyBoxManager`), you need to copy the scripts in the `ipython-startup-scripts` folder to `$HOME/.ipython/profile_default/startup/`. These scripts provide the functions and configuration required for performing data analysis with `tablegpt-agent`.
+
+Note: The `~/.ipython` directory must be writable by the process launching the kernel; otherwise IPython emits the warning `UserWarning: IPython dir '/home/jovyan/.ipython' is not a writable location, using a temp directory.` and the startup scripts won't take effect.
+
+## Official documentation
+
+The official description of the startup directory, from `~/.ipython/profile_default/startup/README`:
+
+> This is the IPython startup directory
+>
+> .py and .ipy files in this directory will be run *prior* to any code or files specified
+> via the exec_lines or exec_files configurables whenever you load this profile.
+>
+> Files will be run in lexicographical order, so you can control the execution order of files
+> with a prefix, e.g.::
+>
+>     00-first.py
+>     50-middle.py
+>     99-last.ipy
diff --git a/ipython/ipython-startup-scripts/00-pandas.py b/ipython/ipython-startup-scripts/00-pandas.py
@@ -0,0 +1,9 @@
+import pandas as pd
+
+pd.set_option("display.width", 2048)
+# 8 is the minimum value to display `df.describe()`. We have other truncation mechanisms so it's OK to flex this a bit.
+pd.set_option("display.max_rows", 8)
+pd.set_option("display.max_columns", 40)
+pd.set_option("display.max_colwidth", 40)
+pd.set_option("display.precision", 3)
+pd.set_option("future.no_silent_downcasting", True)
diff --git a/ipython/ipython-startup-scripts/98-udfs.py b/ipython/ipython-startup-scripts/98-udfs.py
@@ -0,0 +1,120 @@
+import os
+from pathlib import Path
+from typing import NamedTuple, cast
+
+import pandas as pd
+import concurrent.futures
+
+
+class FileEncoding(NamedTuple):
+    """File encoding as the NamedTuple."""
+
+    encoding: str | None
+    """The encoding of the file."""
+    confidence: float
+    """The confidence of the encoding."""
+    language: str | None
+    """The language of the file."""
+
+
+def detect_file_encodings(
+    file_path: str | Path, timeout: int = 5
+) -> list[FileEncoding]:
+    """Try to detect the file encoding.
+
+    Returns a list of `FileEncoding` tuples with the detected encodings ordered
+    by confidence.
+
+    Args:
+        file_path: The path to the file to detect the encoding for.
+        timeout: The timeout in seconds for the encoding detection.
+    """
+    import chardet
+
+    file_path = str(file_path)
+
+    def read_and_detect(file_path: str) -> list[dict]:
+        with open(file_path, "rb") as f:
+            rawdata = f.read()
+        return cast(list[dict], chardet.detect_all(rawdata))
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future = executor.submit(read_and_detect, file_path)
+        try:
+            encodings = future.result(timeout=timeout)
+        except concurrent.futures.TimeoutError:
+            raise TimeoutError(
+                f"Timeout reached while detecting encoding for {file_path}"
+            )
+
+    if all(encoding["encoding"] is None for encoding in encodings):
+        raise RuntimeError(f"Could not detect encoding for {file_path}")
+    return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
+
+
+def path_from_uri(uri: str) -> Path:
+    """Return a new path from the given 'file' URI.
+    This is implemented in Python 3.13.
+    See <https://github.com/python/cpython/pull/107640>
+    and <https://github.com/python/cpython/pull/107640/files#diff-fa525485738fc33d05b06c159172ff1f319c26e88d8c6bb39f7dbaae4dc4105c>
+    TODO: remove when we migrate to Python 3.13"""
+    if not uri.startswith("file:"):
+        raise ValueError(f"URI does not start with 'file:': {uri!r}")
+    path = uri[5:]
+    if path[:3] == "///":
+        # Remove empty authority
+        path = path[2:]
+    elif path[:12] == "//localhost/":
+        # Remove 'localhost' authority
+        path = path[11:]
+    if path[:3] == "///" or (path[:1] == "/" and path[2:3] in ":|"):
+        # Remove slash before DOS device/UNC path
+        path = path[1:]
+    if path[1:2] == "|":
+        # Replace bar with colon in DOS drive
+        path = path[:1] + ":" + path[2:]
+    from urllib.parse import unquote_to_bytes
+
+    path = Path(os.fsdecode(unquote_to_bytes(path)))
+    if not path.is_absolute():
+        raise ValueError(f"URI is not absolute: {uri!r}")
+    return path
+
+
+def file_extention(file: str) -> str:
+    path = Path(file)
+    return path.suffix
+
+
+def read_df(uri: str, autodetect_encoding: bool = True, **kwargs) -> pd.DataFrame:
+    """A simple wrapper to read different file formats into DataFrame."""
+    try:
+        return _read_df(uri, **kwargs)
+    except UnicodeDecodeError as e:
+        if autodetect_encoding:
+            detected_encodings = detect_file_encodings(path_from_uri(uri), timeout=30)
+            for encoding in detected_encodings:
+                try:
+                    return _read_df(uri, encoding=encoding.encoding, **kwargs)
+                except UnicodeDecodeError:
+                    continue
+        # Either we ran out of detected encoding, or autodetect_encoding is False,
+        # we should raise encoding error
+        raise ValueError(f"不支持的文件编码{e.encoding}，请转换成 utf-8 后重试")
+
+
+def _read_df(uri: str, encoding: str = "utf-8", **kwargs) -> pd.DataFrame:
+    """A simple wrapper to read different file formats into DataFrame."""
+    ext = file_extention(uri).lower()
+    if ext == ".csv":
+        df = pd.read_csv(uri, encoding=encoding, **kwargs)
+    elif ext == ".tsv":
+        df = pd.read_csv(uri, sep="\t", encoding=encoding, **kwargs)
+    elif ext in [".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"]:
+        # read_excel does not support 'encoding' arg, also it seems that it does not need it.
+        df = pd.read_excel(uri, **kwargs)
+    else:
+        raise ValueError(
+            f"TableGPT 目前支持 csv、tsv 以及 xlsx 文件，您上传的文件格式 {ext} 暂不支持。"
+        )
+    return df
diff --git a/ipython/ipython-startup-scripts/99-cfont.py b/ipython/ipython-startup-scripts/99-cfont.py
@@ -0,0 +1,5 @@
+import seaborn as sns
+from mplfonts import use_font
+
+use_font("Noto Serif CJK SC")
+sns.set_theme(font="Noto Serif CJK SC")
diff --git a/ipython/requirements.txt b/ipython/requirements.txt
@@ -0,0 +1,12 @@
+pandas >=2.2,<3.0.0
+scipy >=1.13.0,<2.0.0
+tabulate >=0.9.0,<1.0.0
+scikit-learn >=1.0.0,<2.0.0
+statsmodels >=0.10.0,<1.0.0
+matplotlib >=3.8.4,<4.0.0
+seaborn >=0.13.1,<1.0.0
+mplfonts >=0.0.8,<1.0.0
+numexpr >=2.8.4
+openpyxl >=3.1.2,<4.0.0  # read xlsx files
+xlrd >= 2.0.1  # read xls files
+odfpy  # read ods files