Skip to content

Commit

Permalink
chore: add ipython startup scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
weekenthralling committed Nov 13, 2024
1 parent 3a64cce commit 99b32d2
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 0 deletions.
33 changes: 33 additions & 0 deletions ipython/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# TableGPT IPython Kernel

This kernel executes the code generated by `tablegpt-agent`. It comes equipped with data analysis libraries and Chinese font support.

## Installation

You can install the dependencies required for the `IPython Kernel` using the following command:

```sh
pip install -r requirements.txt
```

## Usage

If you're using the local executor (`pybox.LocalPyBoxManager`), you need to copy the code from the `ipython-startup-scripts` folder to `$HOME/.ipython/profile_default/startup/`. This directory contains functions and configurations required for performing data analysis with `tablegpt-agent`.

Note: The `~/.ipython` directory must be writable by the process that launches the kernel. Otherwise IPython emits the warning `UserWarning: IPython dir '/home/jovyan/.ipython' is not a writable location, using a temp directory.` and the startup scripts won't take effect.

## Official documentation

The official documentation, found at `~/.ipython/profile_default/startup/README`, reads:

> This is the IPython startup directory
>
> .py and .ipy files in this directory will be run *prior* to any code or files specified
> via the exec_lines or exec_files configurables whenever you load this profile.
>
> Files will be run in lexicographical order, so you can control the execution order of files
> with a prefix, e.g.::
>
> 00-first.py
> 50-middle.py
> 99-last.ipy
9 changes: 9 additions & 0 deletions ipython/ipython-startup-scripts/00-pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import pandas as pd

# Pandas display defaults applied when this startup script runs in the kernel.
# Use a very wide display width (2048) for printed frames.
pd.set_option("display.width", 2048)
# 8 is the minimum value to display `df.describe()`. We have other truncation mechanisms so it's OK to flex this a bit.
pd.set_option("display.max_rows", 8)
# Limit printed output to 40 columns.
pd.set_option("display.max_columns", 40)
# Limit the rendered width of a single column to 40.
pd.set_option("display.max_colwidth", 40)
# Use a display precision of 3 for floating point output.
pd.set_option("display.precision", 3)
# Opt in to the future pandas behaviour named by this flag (no silent downcasting).
pd.set_option("future.no_silent_downcasting", True)
120 changes: 120 additions & 0 deletions ipython/ipython-startup-scripts/98-udfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import os
from pathlib import Path
from typing import NamedTuple, cast

import pandas as pd
import concurrent.futures


class FileEncoding(NamedTuple):
"""File encoding as the NamedTuple."""

encoding: str | None
"""The encoding of the file."""
confidence: float
"""The confidence of the encoding."""
language: str | None
"""The language of the file."""


def detect_file_encodings(
    file_path: str | Path, timeout: int = 5
) -> list[FileEncoding]:
    """Try to detect the file encoding.

    The file is read in binary mode and passed to ``chardet.detect_all`` on a
    worker thread, so the caller waits at most ``timeout`` seconds for the
    result. ``chardet`` is a third-party package that is imported lazily the
    first time this function is called.

    Args:
        file_path: The path to the file to detect the encoding for.
        timeout: The timeout in seconds for the encoding detection.

    Returns:
        A list of `FileEncoding` tuples with the detected encodings ordered
        by confidence. Candidates whose encoding is ``None`` are dropped.

    Raises:
        TimeoutError: If detection does not finish within ``timeout`` seconds.
        RuntimeError: If no candidate with a non-``None`` encoding is found.
    """
    import chardet

    file_path = str(file_path)

    def read_and_detect(file_path: str) -> list[dict]:
        with open(file_path, "rb") as f:
            rawdata = f.read()
        return cast(list[dict], chardet.detect_all(rawdata))

    # Do not use the executor as a context manager: its __exit__ calls
    # shutdown(wait=True), which would block until the worker finishes and
    # therefore defeat the timeout below.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(read_and_detect, file_path)
        try:
            encodings = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError as exc:
            raise TimeoutError(
                f"Timeout reached while detecting encoding for {file_path}"
            ) from exc
    finally:
        # Return immediately; a worker that is already running cannot be
        # interrupted and is left to finish in the background.
        executor.shutdown(wait=False, cancel_futures=True)

    detected = [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
    if not detected:
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return detected


def path_from_uri(uri: str) -> Path:
    """Return a new path from the given 'file' URI.

    This is implemented in Python 3.13.
    See <https://github.com/python/cpython/pull/107640>
    and <https://github.com/python/cpython/pull/107640/files#diff-fa525485738fc33d05b06c159172ff1f319c26e88d8c6bb39f7dbaae4dc4105c>
    TODO: remove when we migrate to Python 3.13

    Args:
        uri: A URI that starts with ``file:``. An empty authority or a
            ``localhost`` authority is stripped, and percent-escapes are
            decoded.

    Returns:
        The absolute path the URI refers to.

    Raises:
        ValueError: If ``uri`` does not start with ``file:`` or the resulting
            path is not absolute.
    """
    if not uri.startswith("file:"):
        raise ValueError(f"URI does not start with 'file:': {uri!r}")
    path = uri[5:]
    if path[:3] == "///":
        # Remove empty authority
        path = path[2:]
    elif path[:12] == "//localhost/":
        # Remove 'localhost' authority
        path = path[11:]
    # Compare against a tuple rather than the string ":|": an empty slice is
    # "in" every string, which would strip the leading slash from a
    # one-character path such as "/a" and make it look relative.
    if path[:3] == "///" or (path[:1] == "/" and path[2:3] in (":", "|")):
        # Remove slash before DOS device/UNC path
        path = path[1:]
    if path[1:2] == "|":
        # Replace bar with colon in DOS drive
        path = path[:1] + ":" + path[2:]
    from urllib.parse import unquote_to_bytes

    # Decode percent-escapes to bytes first, then to str with the filesystem encoding.
    path = Path(os.fsdecode(unquote_to_bytes(path)))
    if not path.is_absolute():
        raise ValueError(f"URI is not absolute: {uri!r}")
    return path


def file_extention(file: str) -> str:
    """Return the last suffix of ``file`` including its leading dot, or "" if it has none."""
    return Path(file).suffix


def read_df(uri: str, autodetect_encoding: bool = True, **kwargs) -> pd.DataFrame:
    """A simple wrapper to read different file formats into DataFrame.

    The file is first read with the arguments as given. If that fails with a
    ``UnicodeDecodeError`` and ``autodetect_encoding`` is true, the file's
    encoding is detected and each candidate is tried in turn.

    Args:
        uri: Location of the file. Encoding autodetection resolves it with
            ``path_from_uri``, so it must be a ``file:`` URI for the fallback
            to work.
        autodetect_encoding: Whether to retry with detected encodings after a
            decoding failure.
        **kwargs: Forwarded to the underlying reader.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If the file cannot be decoded with any attempted encoding
            (or autodetection is disabled), or the format is unsupported.
    """
    try:
        return _read_df(uri, **kwargs)
    except UnicodeDecodeError as e:
        if autodetect_encoding:
            import codecs

            # A caller-supplied encoding has just failed; drop it so the
            # retries do not pass `encoding` twice (which is a TypeError).
            retry_kwargs = {k: v for k, v in kwargs.items() if k != "encoding"}
            detected_encodings = detect_file_encodings(path_from_uri(uri), timeout=30)
            for encoding in detected_encodings:
                try:
                    # Skip candidates Python has no codec for instead of
                    # aborting the whole fallback.
                    codecs.lookup(encoding.encoding)
                except LookupError:
                    continue
                try:
                    return _read_df(uri, encoding=encoding.encoding, **retry_kwargs)
                except UnicodeDecodeError:
                    continue
        # Either we ran out of detected encoding, or autodetect_encoding is False,
        # we should raise encoding error
        raise ValueError(f"不支持的文件编码{e.encoding},请转换成 utf-8 后重试") from e


def _read_df(uri: str, encoding: str = "utf-8", **kwargs) -> pd.DataFrame:
    """Load ``uri`` into a DataFrame, choosing the pandas reader by file extension.

    ``encoding`` is only passed to the CSV/TSV readers; extra keyword
    arguments are forwarded to whichever reader is selected. A ``ValueError``
    is raised for any other extension.
    """
    ext = file_extention(uri).lower()

    if ext == ".csv":
        return pd.read_csv(uri, encoding=encoding, **kwargs)

    if ext == ".tsv":
        return pd.read_csv(uri, sep="\t", encoding=encoding, **kwargs)

    if ext in {".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"}:
        # read_excel takes no 'encoding' argument, and does not appear to need one.
        return pd.read_excel(uri, **kwargs)

    raise ValueError(
        f"TableGPT 目前支持 csv、tsv 以及 xlsx 文件,您上传的文件格式 {ext} 暂不支持。"
    )
5 changes: 5 additions & 0 deletions ipython/ipython-startup-scripts/99-cfont.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import seaborn as sns
from mplfonts import use_font

# Select the "Noto Serif CJK SC" font through mplfonts; intended to let plots
# render Chinese text.
# NOTE(review): presumably the font must already be available to mplfonts — confirm.
use_font("Noto Serif CJK SC")
# Give seaborn's theme the same font name as above.
sns.set_theme(font="Noto Serif CJK SC")
12 changes: 12 additions & 0 deletions ipython/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
pandas >=2.2,<3.0.0
scipy >=1.13.0,<2.0.0
tabulate >=0.9.0,<1.0.0
scikit-learn >=1.0.0,<2.0.0
statsmodels >=0.10.0,<1.0.0
matplotlib >=3.8.4,<4.0.0
seaborn >=0.13.1,<1.0.0
mplfonts >=0.0.8,<1.0.0
numexpr >=2.8.4
openpyxl >=3.1.2,<4.0.0 # read xlsx files
xlrd >= 2.0.1 # read xls files
odfpy # read ods files

0 comments on commit 99b32d2

Please sign in to comment.