Update doc and fix endpoint address and add examples (#31)

* Support load dataset by sdk * Update doc and fix endpoint address and add examples * handle blank was changed to plus
OpenCSGs · Aug 28, 2024 · 63f4b97 · 63f4b97
1 parent 5ef8ae6
commit 63f4b97
Show file tree

Hide file tree

Showing 19 changed files with 308 additions and 76 deletions.
diff --git a/README.md b/README.md
@@ -77,7 +77,7 @@ pip install .
 ## 命令行使用示例
 
 ```shell
-export CSG_TOKEN=3b77c98077b415ca381ded189b86d5df226e3776
+export CSG_TOKEN=your_access_token
 
 # 模型下载
 csghub-cli download wanghh2000/myprivate1 
@@ -100,19 +100,19 @@ csghub-cli upload wanghh2000/myds1 abc/4.txt abc/5.txt -t dataset
 
 ```python
 from pycsghub.snapshot_download import snapshot_download
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_type = "model"
 repo_id = 'OpenCSG/csg-wukong-1B'
 cache_dir = '/Users/hhwang/temp/'
-result = snapshot_download(repo_id, cache_dir=cache_dir, endpoint=endpoint, token=token, repo_type=repotype)
+result = snapshot_download(repo_id, repo_type=repo_type, cache_dir=cache_dir, endpoint=endpoint, token=token,)
 ```
 
 ### 数据集下载
 ```python
 from pycsghub.snapshot_download import snapshot_download
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_id = 'AIWizards/tmmluplus'
@@ -127,7 +127,7 @@ result = snapshot_download(repo_id, repo_type=repo_type, cache_dir=cache_dir, en
 
 ```python
 from pycsghub.file_download import file_download
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_type = "model"
@@ -140,7 +140,7 @@ result = file_download(repo_id, file_name='README.md', cache_dir=cache_dir, endp
 
 ```python
 from pycsghub.file_download import http_get
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 url = "https://hub.opencsg.com/api/v1/models/OpenCSG/csg-wukong-1B/resolve/tokenizer.model"
 local_dir = '/home/test/'
@@ -155,7 +155,7 @@ http_get(url=url, token=token, local_dir=local_dir, file_name=file_name, headers
 ```python
 from pycsghub.file_upload import http_upload_file
 
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_type = "model"
@@ -168,7 +168,7 @@ result = http_upload_file(repo_id, endpoint=endpoint, token=token, repo_type='mo
 ```python
 from pycsghub.file_upload import http_upload_file
 
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_type = "model"

diff --git a/README_EN.md b/README_EN.md
@@ -46,7 +46,7 @@ After installation, you can begin using the SDK to connect to your CSGHub server
 import os 
 from pycsghub.repo_reader import AutoModelForCausalLM, AutoTokenizer
 
-os.environ['CSG_TOKEN'] = '3b77c98077b415ca381ded189b86d5df226e3776'
+os.environ['CSG_TOKEN'] = 'your_access_token'
 
 mid = 'OpenCSG/csg-wukong-1B'
 model = AutoModelForCausalLM.from_pretrained(mid)
@@ -77,7 +77,7 @@ pip install .
 ## Use cases of command line
 
 ```shell
-export CSG_TOKEN=3b77c98077b415ca381ded189b86d5df226e3776
+export CSG_TOKEN=your_access_token
 
 # download model
 csghub-cli download wanghh2000/myprivate1
@@ -102,7 +102,7 @@ For more detailed instructions, including API documentation and usage examples,
 
 ```python
 from pycsghub.snapshot_download import snapshot_download
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_id = 'OpenCSG/csg-wukong-1B'
@@ -127,7 +127,7 @@ Use `http_get` function to download single file
 
 ```python
 from pycsghub.file_download import http_get
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 url = "https://hub.opencsg.com/api/v1/models/OpenCSG/csg-wukong-1B/resolve/tokenizer.model"
 local_dir = '/home/test/'
@@ -141,7 +141,7 @@ use `file_download` function to download single file from a repository
 
 ```python
 from pycsghub.file_download import file_download
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_id = 'OpenCSG/csg-wukong-1B'
@@ -154,7 +154,7 @@ result = file_download(repo_id, file_name='README.md', cache_dir=cache_dir, endp
 ```python
 from pycsghub.file_upload import http_upload_file
 
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_type = "model"
@@ -167,7 +167,7 @@ result = http_upload_file(repo_id, endpoint=endpoint, token=token, repo_type='mo
 ```python
 from pycsghub.file_upload import http_upload_file
 
-token = "3b77c98077b415ca381ded189b86d5df226e3776"
+token = "your_access_token"
 
 endpoint = "https://hub.opencsg.com"
 repo_type = "model"

diff --git a/examples/README.md b/examples/README.md
@@ -0,0 +1,25 @@
+# Examples
+
+我们提供了大量示例脚本，用于通过 CSGHub SDK 与 CSGHub 服务器进行交互。
+
+虽然我们努力展示尽可能多的用例，但预计它们不会在您的特定问题上开箱即用，并且您需要更改几行代码以适应您的需求。为了帮助您，大多数示例完全公开了数据的预处理，允许您根据需要进行调整和编辑。
+
+## Important note
+
+**Important**
+
+为了确保您能够成功运行最新版本的示例脚本，您需要**从源代码安装库**。为此，请在新虚拟环境中执行以下步骤：
+
+```shell
+git clone https://github.com/OpenCSGs/csghub-sdk.git
+cd csghub-sdk
+pip install .
+```
+
+运行示例脚本前，请先设置必要的环境变量如下。
+
+```shell
+export HF_ENDPOINT="https://hub.opencsg.com"
+```
+
+你可以根据自己的需求调整脚本。
diff --git a/examples/README_EN.md b/examples/README_EN.md
@@ -0,0 +1,25 @@
+# Examples
+
+We host a wide range of example scripts that use the CSGHub SDK to interact with the CSGHub server.
+
+While we strive to present as many use cases as possible, it is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, most of the examples fully expose the preprocessing of the data, allowing you to tweak and edit them as required.
+
+## Important note
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, you have to **install the library from source**. To do this, execute the following steps in a new virtual environment:
+
+```shell
+git clone https://github.com/OpenCSGs/csghub-sdk.git
+cd csghub-sdk
+pip install .
+```
+
+Before running the example script, please set the necessary environment variables as follows.
+
+```shell
+export HF_ENDPOINT="https://hub.opencsg.com"
+```
+
+You can also adapt the script to your own needs.
diff --git a/examples/download_dataset.py b/examples/download_dataset.py
@@ -0,0 +1,9 @@
+from pycsghub.snapshot_download import snapshot_download
+# token = "your access token"
+token = None
+
+endpoint = "https://hub.opencsg.com"
+repo_id = 'OpenDataLab/CodeExp'
+repo_type = "dataset"
+cache_dir = '/Users/hhwang/temp/'
+result = snapshot_download(repo_id, repo_type=repo_type, cache_dir=cache_dir, endpoint=endpoint, token=token)
diff --git a/examples/download_file.py b/examples/download_file.py
@@ -0,0 +1,9 @@
+from pycsghub.file_download import file_download
+# token = "your access token"
+token = None
+
+endpoint = "https://hub.opencsg.com"
+repo_type = "model"
+repo_id = 'OpenCSG/csg-wukong-1B'
+cache_dir = '/Users/hhwang/temp/'
+result = file_download(repo_id, file_name='README.md', cache_dir=cache_dir, endpoint=endpoint, token=token, repo_type=repo_type)
diff --git a/examples/download_model.py b/examples/download_model.py
@@ -0,0 +1,9 @@
+from pycsghub.snapshot_download import snapshot_download
+# token = "your access token"
+token = None
+
+endpoint = "https://hub.opencsg.com"
+repo_type = "model"
+repo_id = 'OpenCSG/csg-wukong-1B'
+cache_dir = '/Users/hhwang/temp/'
+result = snapshot_download(repo_id, repo_type=repo_type, cache_dir=cache_dir, endpoint=endpoint, token=token)
diff --git a/examples/load_dataset.py b/examples/load_dataset.py
@@ -0,0 +1,11 @@
+# from datasets.load import load_dataset
+from pycsghub.repo_reader import load_dataset
+
+dsPath = "wanghh2000/glue"
+dsName = "mrpc"
+
+# access_token = "your_access_token"
+access_token = None
+
+raw_datasets = load_dataset(path=dsPath, name=dsName, token=access_token)
+print('raw_datasets', raw_datasets)
diff --git a/examples/run_finetune_bert.py b/examples/run_finetune_bert.py
@@ -0,0 +1,65 @@
+from typing import Any
+import pandas as pd
+
+from transformers import DataCollatorWithPadding
+from transformers import TrainingArguments
+from transformers import Trainer
+
+from pycsghub.repo_reader import load_dataset
+from pycsghub.repo_reader import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
+
+model_id_or_path = "wanghh2000/bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=True)
+model = AutoModelForSequenceClassification.from_pretrained(model_id_or_path)
+
+dsPath = "wanghh2000/glue"
+dsName = "mrpc"
+# access_token = "your_access_token"
+access_token = None
+raw_datasets = load_dataset(dsPath, dsName, token=access_token)
+
+def get_data_proprocess() -> Any:
+    def preprocess_function(examples: pd.DataFrame):            
+        ret = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=100)
+        ret = {**examples, **ret}
+        return pd.DataFrame.from_dict(ret)
+    return preprocess_function
+
+train_dataset = raw_datasets["train"].select(range(20)).map(get_data_proprocess(), batched=True)
+eval_dataset = raw_datasets["validation"].select(range(20)).map(get_data_proprocess(), batched=True)
+
+def data_collator() -> Any:
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+    return data_collator
+
+outputDir = "/Users/hhwang/temp/ff"
+args = TrainingArguments(
+    outputDir,
+    evaluation_strategy="steps",
+    save_strategy="steps",
+    logging_strategy="steps",
+    logging_steps = 2,
+    save_steps = 10,
+    eval_steps = 2,
+    learning_rate=2e-5,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
+    num_train_epochs=2,
+    weight_decay=0.01,
+    push_to_hub=False,
+    disable_tqdm=False,  # keep tqdm progress bars enabled (set to True to declutter the output)
+    use_cpu=True,  # you need to explicitly set use_cpu if you want to train on CPUs
+    remove_unused_columns=True,
+)
+
+trainer = Trainer(
+    model,
+    args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    tokenizer=tokenizer,
+)
+
+trainResult = trainer.train()
+trainer.save_model()
+print(f"save model to {outputDir}")
diff --git a/examples/run_wukong_inference.py b/examples/run_wukong_inference.py
@@ -0,0 +1,10 @@
+import os 
+from pycsghub.repo_reader import AutoModelForCausalLM, AutoTokenizer
+
+mid = 'OpenCSG/csg-wukong-1B'
+model = AutoModelForCausalLM.from_pretrained(mid)
+tokenizer = AutoTokenizer.from_pretrained(mid)
+
+inputs = tokenizer.encode("Write a short story", return_tensors="pt")
+outputs = model.generate(inputs)
+print('result: ',tokenizer.batch_decode(outputs))
diff --git a/examples/upload_file.py b/examples/upload_file.py
@@ -0,0 +1,8 @@
+from pycsghub.file_upload import http_upload_file
+
+token = "your_access_token"
+
+endpoint = "https://hub.opencsg.com"
+repo_type = "model"
+repo_id = 'wanghh2000/myprivate1'
+result = http_upload_file(repo_id, endpoint=endpoint, token=token, repo_type='model', file_path='README.md')
diff --git a/pycsghub/file_download.py b/pycsghub/file_download.py
@@ -79,7 +79,7 @@ def file_download(
                 " online, set 'local_files_only' to False.")
         return cache.get_root_location()
     else:
-        download_endpoint = endpoint if endpoint is not None else get_endpoint()
+        download_endpoint = get_endpoint(endpoint=endpoint)
         # make headers
         # todo need to add cookies？
         repo_info = utils.get_repo_info(repo_id=repo_id,
@@ -205,10 +205,9 @@ def http_get(*,
 
 
 if __name__ == '__main__':
-    token = "f3a7b9c1d6e5f8e2a1b5d4f9e6a2b8d7c3a4e2b1d9f6e7a8d2c5a7b4c1e3f5b8a1d4f9" + \
-            "b7d6e2f8a5d3b1e7f9c6a8b2d1e4f7d5b6e9f2a4b3c8e1d7f995hd82hf"
+    token = "your_access_token"
 
-    url = "https://hub-stg.opencsg.com/api/v1/models/wayne0019/lwfmodel/resolve/lfsfile.bin"
+    url = "https://hub.opencsg.com/api/v1/models/wayne0019/lwfmodel/resolve/lfsfile.bin"
     local_dir = '/home/test/'
     file_name = 'test.txt'
     headers = None

diff --git a/pycsghub/repo_reader/__init__.py b/pycsghub/repo_reader/__init__.py
@@ -1 +1,2 @@
-from .model.huggingface.model_auto import *
+from .model.huggingface.model_auto import *
+from .dataset.huggingface.load import *