Commit 99b32d2 (parent: 3a64cce). Showing 5 changed files with 179 additions and 0 deletions.

@@ -0,0 +1,33 @@

# TableGPT IPython Kernel

This kernel is used to execute code generated by `tablegpt-agent` and has been equipped with data analysis and Chinese font support.

## Installation

You can install the dependencies required for the `IPython Kernel` using the following command:

```sh
pip install -r requirements.txt
```

## Usage

If you're using the local executor (`pybox.LocalPyBoxManager`), you need to copy the code from the `ipython-startup-scripts` folder to `$HOME/.ipython/profile_default/startup/`. This directory contains the functions and configurations required for performing data analysis with `tablegpt-agent`.
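As a concrete sketch (assuming the commands are run from the repository root, where the `ipython-startup-scripts` folder lives), the copy step might look like this:

```sh
# Create the IPython startup directory if it does not exist yet, then copy the scripts into it.
mkdir -p "$HOME/.ipython/profile_default/startup/"
cp ipython-startup-scripts/* "$HOME/.ipython/profile_default/startup/"
```
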
Note: The `~/.ipython` directory must be writable by the process launching the kernel; otherwise you will see the warning `UserWarning: IPython dir '/home/jovyan/.ipython' is not a writable location, using a temp directory.` and the startup scripts won't take effect.
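A quick way to verify this before launching the kernel (a sketch; adjust the path if your IPython directory is not the default `~/.ipython`):

```sh
# The startup scripts are only picked up if this directory is writable by the kernel's user.
test -w "$HOME/.ipython" && echo "writable" || echo "not writable"
```
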
## Official documentation

The official documentation at `~/.ipython/profile_default/startup/README` reads:

> This is the IPython startup directory
>
> .py and .ipy files in this directory will be run *prior* to any code or files specified
> via the exec_lines or exec_files configurables whenever you load this profile.
>
> Files will be run in lexicographical order, so you can control the execution order of files
> with a prefix, e.g.::
>
> 00-first.py
> 50-middle.py
> 99-last.ipy

@@ -0,0 +1,9 @@

import pandas as pd

pd.set_option("display.width", 2048)
# 8 is the minimum value to display `df.describe()`. We have other truncation mechanisms so it's OK to flex this a bit.
pd.set_option("display.max_rows", 8)
pd.set_option("display.max_columns", 40)
pd.set_option("display.max_colwidth", 40)
pd.set_option("display.precision", 3)
pd.set_option("future.no_silent_downcasting", True)
@@ -0,0 +1,120 @@

import concurrent.futures
import os
from pathlib import Path
from typing import NamedTuple, cast

import pandas as pd


class FileEncoding(NamedTuple):
    """File encoding as a NamedTuple."""

    encoding: str | None
    """The encoding of the file."""
    confidence: float
    """The confidence of the encoding."""
    language: str | None
    """The language of the file."""


def detect_file_encodings(
    file_path: str | Path, timeout: int = 5
) -> list[FileEncoding]:
    """Try to detect the file encoding.

    Returns a list of `FileEncoding` tuples with the detected encodings ordered
    by confidence.

    Args:
        file_path: The path to the file to detect the encoding for.
        timeout: The timeout in seconds for the encoding detection.
    """
    import chardet

    file_path = str(file_path)

    def read_and_detect(file_path: str) -> list[dict]:
        with open(file_path, "rb") as f:
            rawdata = f.read()
        return cast(list[dict], chardet.detect_all(rawdata))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(read_and_detect, file_path)
        try:
            encodings = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            raise TimeoutError(
                f"Timeout reached while detecting encoding for {file_path}"
            )

    if all(encoding["encoding"] is None for encoding in encodings):
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]


def path_from_uri(uri: str) -> Path:
    """Return a new path from the given 'file' URI.

    This is implemented in Python 3.13.
    See <https://github.com/python/cpython/pull/107640>
    and <https://github.com/python/cpython/pull/107640/files#diff-fa525485738fc33d05b06c159172ff1f319c26e88d8c6bb39f7dbaae4dc4105c>

    TODO: remove when we migrate to Python 3.13.
    """
    if not uri.startswith("file:"):
        raise ValueError(f"URI does not start with 'file:': {uri!r}")
    path = uri[5:]
    if path[:3] == "///":
        # Remove empty authority
        path = path[2:]
    elif path[:12] == "//localhost/":
        # Remove 'localhost' authority
        path = path[11:]
    if path[:3] == "///" or (path[:1] == "/" and path[2:3] in ":|"):
        # Remove slash before DOS device/UNC path
        path = path[1:]
    if path[1:2] == "|":
        # Replace bar with colon in DOS drive
        path = path[:1] + ":" + path[2:]
    from urllib.parse import unquote_to_bytes

    path = Path(os.fsdecode(unquote_to_bytes(path)))
    if not path.is_absolute():
        raise ValueError(f"URI is not absolute: {uri!r}")
    return path


def file_extention(file: str) -> str:
    path = Path(file)
    return path.suffix


def read_df(uri: str, autodetect_encoding: bool = True, **kwargs) -> pd.DataFrame:
    """A simple wrapper to read different file formats into a DataFrame."""
    try:
        return _read_df(uri, **kwargs)
    except UnicodeDecodeError as e:
        if autodetect_encoding:
            detected_encodings = detect_file_encodings(path_from_uri(uri), timeout=30)
            for encoding in detected_encodings:
                try:
                    return _read_df(uri, encoding=encoding.encoding, **kwargs)
                except UnicodeDecodeError:
                    continue
        # Either we ran out of detected encodings, or autodetect_encoding is False;
        # raise an encoding error.
        # Message (zh-CN): "Unsupported file encoding {e.encoding}; please convert the file to utf-8 and try again."
        raise ValueError(f"不支持的文件编码{e.encoding},请转换成 utf-8 后重试")


def _read_df(uri: str, encoding: str = "utf-8", **kwargs) -> pd.DataFrame:
    """A simple wrapper to read different file formats into a DataFrame."""
    ext = file_extention(uri).lower()
    if ext == ".csv":
        df = pd.read_csv(uri, encoding=encoding, **kwargs)
    elif ext == ".tsv":
        df = pd.read_csv(uri, sep="\t", encoding=encoding, **kwargs)
    elif ext in [".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"]:
        # read_excel does not accept an 'encoding' argument, and it does not appear to need one.
        df = pd.read_excel(uri, **kwargs)
    else:
        # Message (zh-CN): "TableGPT currently supports csv, tsv and xlsx files; the uploaded format {ext} is not supported yet."
        raise ValueError(
            f"TableGPT 目前支持 csv、tsv 以及 xlsx 文件，您上传的文件格式 {ext} 暂不支持。"
        )
    return df
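For illustration, once these startup scripts are loaded into the kernel, the reader helpers could be exercised like this; the file URIs are placeholders, not files shipped with this repository:

```python
# Hypothetical usage: read_df is defined by the startup script above.
df = read_df("file:///data/sales.csv")      # retried with chardet-detected encodings if UTF-8 decoding fails
df_tab = read_df("file:///data/sales.tsv")  # .tsv files are read with sep="\t"
print(df.head())
```
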
@@ -0,0 +1,5 @@

import seaborn as sns
from mplfonts import use_font

use_font("Noto Serif CJK SC")
sns.set_theme(font="Noto Serif CJK SC")
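To sanity-check the Chinese font configuration, a plot with CJK labels can be rendered; this is an illustrative sketch (made-up data and labels), and it assumes the Noto Serif CJK SC font is available via `mplfonts`:

```python
import matplotlib.pyplot as plt
import seaborn as sns
from mplfonts import use_font

# Same configuration the startup script applies; repeated here so the snippet stands alone.
use_font("Noto Serif CJK SC")
sns.set_theme(font="Noto Serif CJK SC")

# Made-up data; the Chinese labels ("月份" = month, "销量" = sales) exercise CJK glyph rendering.
ax = sns.barplot(x=["一月", "二月", "三月"], y=[3, 7, 5])
ax.set_xlabel("月份")
ax.set_ylabel("销量")
plt.savefig("font_check.png")  # labels should render as glyphs rather than empty boxes
```
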
@@ -0,0 +1,12 @@

pandas >=2.2,<3.0.0
scipy >=1.13.0,<2.0.0
tabulate >=0.9.0,<1.0.0
scikit-learn >=1.0.0,<2.0.0
statsmodels >=0.10.0,<1.0.0
matplotlib >=3.8.4,<4.0.0
seaborn >=0.13.1,<1.0.0
mplfonts >=0.0.8,<1.0.0
numexpr >=2.8.4
openpyxl >=3.1.2,<4.0.0 # read xlsx files
xlrd >=2.0.1 # read xls files
odfpy # read ods files