FEAT: Homework Submission V4 #97

Open · wants to merge 1 commit into main
19 changes: 19 additions & 0 deletions selected_homework/openai-translator_v4/README.md
@@ -0,0 +1,19 @@
### Assignment Requirements
Implement an openai-translator with a graphical interface, backed by ChatGLM2-6B.

### Assignment Summary
+ [openai_api_demo](openai_api_demo)

Adapted the api_demo from the ChatGLM repository into a standalone server component; as a result, ChatGLM3-6B is used instead of ChatGLM2-6B.

To run it you need to:
1. git clone https://www.modelscope.cn/ZhipuAI/chatglm3-6b.git
2. Make sure the machine has enough resources (mine doesn't, so this part has not been run yet).

+ [ai_translator](ai_translator)

Replaced the previous
from langchain_openai import ChatOpenAI
with
from langchain.llms import ChatGLM
and matched the parameters to what ChatGLM expects (a sketch of the swap follows below).
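
A minimal sketch of that swap (illustrative only; the `endpoint_url` and `max_token` values are taken from `translation_chain.py` in this PR, while the prompt and input text are placeholders):

```python
# Sketch of the ChatOpenAI -> ChatGLM swap described above.
from langchain.llms import ChatGLM
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Before: llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# After: point LangChain at the locally served ChatGLM endpoint instead.
llm = ChatGLM(
    endpoint_url="http://127.0.0.1:8000",  # local api_server.py from openai_api_demo
    max_token=8000,                        # note: ChatGLM takes max_token, not max_tokens
)

prompt = PromptTemplate.from_template("Translate the following English text to Chinese: {text}")
chain = LLMChain(llm=llm, prompt=prompt)
print(chain.run({"text": "The quick brown fox jumps over the lazy dog."}))
```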
Empty file.
@@ -0,0 +1,3 @@
from .book import Book
from .page import Page
from .content import ContentType, Content, TableContent
@@ -0,0 +1,9 @@
from .page import Page

class Book:
    def __init__(self, pdf_file_path):
        self.pdf_file_path = pdf_file_path
        self.pages = []

    def add_page(self, page: Page):
        self.pages.append(page)
@@ -0,0 +1,85 @@
import pandas as pd

from enum import Enum, auto
from PIL import Image as PILImage
from utils import LOG
from io import StringIO

class ContentType(Enum):
    TEXT = auto()
    TABLE = auto()
    IMAGE = auto()

class Content:
    def __init__(self, content_type, original, translation=None):
        self.content_type = content_type
        self.original = original
        self.translation = translation
        self.status = False

    def set_translation(self, translation, status):
        if not self.check_translation_type(translation):
            raise ValueError(f"Invalid translation type. Expected {self.content_type}, but got {type(translation)}")
        self.translation = translation
        self.status = status

    def check_translation_type(self, translation):
        if self.content_type == ContentType.TEXT and isinstance(translation, str):
            return True
        elif self.content_type == ContentType.TABLE and isinstance(translation, list):
            return True
        elif self.content_type == ContentType.IMAGE and isinstance(translation, PILImage.Image):
            return True
        return False

    def __str__(self):
        return self.original


class TableContent(Content):
    def __init__(self, data, translation=None):
        df = pd.DataFrame(data)

        # Verify that the number of rows and columns in the extracted data and the DataFrame match
        if len(data) != len(df) or len(data[0]) != len(df.columns):
            raise ValueError("The number of rows and columns in the extracted table data and DataFrame object do not match.")

        super().__init__(ContentType.TABLE, df)

    def set_translation(self, translation, status):
        try:
            if not isinstance(translation, str):
                raise ValueError(f"Invalid translation type. Expected str, but got {type(translation)}")

            LOG.debug(f"[translation]\n{translation}")
            # Extract column names from the first bracketed group
            header = translation.split(']')[0][1:].split(', ')
            # Extract data rows from the remaining bracketed groups
            data_rows = translation.split('] ')[1:]
            # Strip the enclosing brackets and split each row into a list of values
            data_rows = [row[1:-1].split(', ') for row in data_rows]
            # Create a DataFrame using the extracted header and data
            translated_df = pd.DataFrame(data_rows, columns=header)
            LOG.debug(f"[translated_df]\n{translated_df}")
            self.translation = translated_df
            self.status = status
        except Exception as e:
            LOG.error(f"An error occurred during table translation: {e}")
            self.translation = None
            self.status = False

    def __str__(self):
        return self.original.to_string(header=False, index=False)

    def iter_items(self, translated=False):
        target_df = self.translation if translated else self.original
        for row_idx, row in target_df.iterrows():
            for col_idx, item in enumerate(row):
                yield (row_idx, col_idx, item)

    def update_item(self, row_idx, col_idx, new_value, translated=False):
        target_df = self.translation if translated else self.original
        target_df.at[row_idx, col_idx] = new_value

    def get_original_as_str(self):
        return self.original.to_string(header=False, index=False)
@@ -0,0 +1,8 @@
from .content import Content

class Page:
    def __init__(self):
        self.contents = []

    def add_content(self, content: Content):
        self.contents.append(content)
@@ -0,0 +1,5 @@
model_name: "chatglm2-6b"
input_file: "tests/test.pdf"
output_file_format: "markdown"
source_language: "English"
target_language: "Chinese"
@@ -0,0 +1,71 @@
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from flask import Flask, request, send_file, jsonify
from translator import PDFTranslator, TranslationConfig
from utils import ArgumentParser, LOG

app = Flask(__name__)

TEMP_FILE_DIR = "flask_temps/"

@app.route('/translation', methods=['POST'])
def translation():
    try:
        input_file = request.files['input_file']
        source_language = request.form.get('source_language', 'English')
        target_language = request.form.get('target_language', 'Chinese')

        LOG.debug(f"[input_file]\n{input_file}")
        LOG.debug(f"[input_file.filename]\n{input_file.filename}")

        if input_file and input_file.filename:
            # Create a temporary file (ensure the temp directory exists first)
            os.makedirs(TEMP_FILE_DIR, exist_ok=True)
            input_file_path = TEMP_FILE_DIR + input_file.filename
            LOG.debug(f"[input_file_path]\n{input_file_path}")

            input_file.save(input_file_path)

            # Call the translation function
            output_file_path = Translator.translate_pdf(
                input_file=input_file_path,
                source_language=source_language,
                target_language=target_language)

            # Remove the temporary file
            # os.remove(input_file_path)

            # Build the full output file path
            output_file_path = os.getcwd() + "/" + output_file_path
            LOG.debug(output_file_path)

            # Return the translated file
            return send_file(output_file_path, as_attachment=True)
    except Exception as e:
        response = {
            'status': 'error',
            'message': str(e)
        }
        return jsonify(response), 400


def initialize_translator():
    # Parse command-line arguments
    argument_parser = ArgumentParser()
    args = argument_parser.parse_arguments()

    # Initialize the configuration singleton
    config = TranslationConfig()
    config.initialize(args)
    # Instantiate the PDFTranslator class; translate_pdf() is invoked per request
    global Translator
    Translator = PDFTranslator(config.model_name)


if __name__ == "__main__":
    # Initialize the translator
    initialize_translator()
    # Start the Flask web server
    app.run(host="0.0.0.0", port=5000, debug=True)
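
Not part of this PR's diff: a minimal client sketch for exercising the /translation endpoint above, assuming the Flask server is running locally on port 5000 and a test.pdf exists in the working directory (the output filename is also an assumption):

```python
# Minimal client for the /translation endpoint (illustrative sketch).
import requests

with open("test.pdf", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:5000/translation",
        files={"input_file": f},
        data={"source_language": "English", "target_language": "Chinese"},
    )

if resp.ok:
    # The endpoint returns the translated file as an attachment.
    with open("test_translated.md", "wb") as out:
        out.write(resp.content)
else:
    print(resp.status_code, resp.json())
```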
@@ -0,0 +1,59 @@
import sys
import os
import gradio as gr


sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils import ArgumentParser, LOG
from translator import PDFTranslator, TranslationConfig


def translation(input_file, source_language, target_language, translate_style):
    LOG.debug(
        f"[Translation task]\nSource file: {input_file.name}\nSource language: {source_language}\nTarget language: {target_language}\nTranslation style: {translate_style}")

    output_file_path = Translator.translate_pdf(
        input_file.name, source_language=source_language,
        target_language=target_language, translate_style=translate_style
    )

    return output_file_path

def launch_gradio():

    iface = gr.Interface(
        fn=translation,
        title="[Homework] OpenAI-Translator v4 (backed by ChatGLM3-6B)",
        inputs=[
            gr.File(label="Upload PDF file"),
            gr.Textbox(label="Source language (default: English)", placeholder="English", value="English"),
            gr.Textbox(label="Target language (default: Chinese)", placeholder="Chinese", value="Chinese"),
            gr.Radio(["Normal people", "Children", "Professor"], label="Translation style", value="Normal people"),
        ],
        outputs=[
            gr.File(label="Download translated file")
        ],
        allow_flagging="never"
    )

    iface.launch(share=True, server_name="0.0.0.0")

def initialize_translator():
    # Parse command-line arguments
    argument_parser = ArgumentParser()
    args = argument_parser.parse_arguments()

    # Initialize the configuration singleton
    config = TranslationConfig()
    config.initialize(args)
    # Instantiate the PDFTranslator class used by the Gradio callback
    global Translator
    Translator = PDFTranslator(config.model_name)


if __name__ == "__main__":
    # Initialize the translator
    initialize_translator()
    # Launch the Gradio service
    launch_gradio()
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,19 @@
测试数据
这个数据集包含了由OpenAI的AI语言模型ChatGPT提供的两个测试样本。
这些样本包括一个Markdown表格和一个英文文本段落,可以用来测试支持文本和表格格式的英译中翻译软件。
文本测试
快速的棕色狐狸跳过懒狗。这个句子包含了英语字母表中的每个字母至少一次。句子是经常用来测试字体、键盘和其他与文本相关的工具的。除了英语,其他许多语言也有句子。由于语言的独特特点,有些句子更难构造。

| 水果 | 颜色 | 价格(美元) |
| --- | --- | --- |
| 苹果 | 红色 | 1.2 |
| 香蕉 | 黄色 | 0.5 |
| 橙子 | 橙色 | 0.8 |
| 草莓 | 红色 | 2.5 |
| 蓝莓 | 蓝色 | 3.0 |
| 猕猴桃 | 绿色 | 1.0 |
| 芒果 | 橙色 | 1.5 |
| 葡萄 | 紫色 | 2.00 |

---

@@ -0,0 +1,2 @@
from .pdf_translator import PDFTranslator
from .translation_config import TranslationConfig
@@ -0,0 +1,5 @@
class PageOutOfRangeException(Exception):
    def __init__(self, book_pages, requested_pages):
        self.book_pages = book_pages
        self.requested_pages = requested_pages
        super().__init__(f"Page out of range: Book has {book_pages} pages, but {requested_pages} pages were requested.")
@@ -0,0 +1,58 @@
import pdfplumber
from typing import Optional
from book import Book, Page, Content, ContentType, TableContent
from translator.exceptions import PageOutOfRangeException
from utils import LOG


class PDFParser:
    def __init__(self):
        pass

    def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book:
        book = Book(pdf_file_path)

        with pdfplumber.open(pdf_file_path) as pdf:
            if pages is not None and pages > len(pdf.pages):
                raise PageOutOfRangeException(len(pdf.pages), pages)

            if pages is None:
                pages_to_parse = pdf.pages
            else:
                pages_to_parse = pdf.pages[:pages]

            for pdf_page in pages_to_parse:
                page = Page()

                # Store the original text content
                raw_text = pdf_page.extract_text()
                tables = pdf_page.extract_tables()

                # Remove each cell's content from the original text
                for table_data in tables:
                    for row in table_data:
                        for cell in row:
                            if cell:  # pdfplumber may return None for empty cells
                                raw_text = raw_text.replace(cell, "", 1)

                # Handle text
                if raw_text:
                    # Remove empty lines and leading/trailing whitespace
                    raw_text_lines = raw_text.splitlines()
                    cleaned_raw_text_lines = [line.strip() for line in raw_text_lines if line.strip()]
                    cleaned_raw_text = "\n".join(cleaned_raw_text_lines)

                    text_content = Content(content_type=ContentType.TEXT, original=cleaned_raw_text)
                    page.add_content(text_content)
                    LOG.debug(f"[raw_text]\n {cleaned_raw_text}")

                # Handle tables
                if tables:
                    table = TableContent(tables)
                    page.add_content(table)
                    LOG.debug(f"[table]\n{table}")

                book.add_page(page)

        return book
@@ -0,0 +1,30 @@
from typing import Optional
from translator.pdf_parser import PDFParser
from translator.writer import Writer
from translator.translation_chain import TranslationChain

class PDFTranslator:
    def __init__(self, model_name: str):
        self.translate_chain = TranslationChain(model_name)
        self.pdf_parser = PDFParser()
        self.writer = Writer()

    def translate_pdf(self,
                      input_file: str,
                      output_file_format: str = 'markdown',
                      source_language: str = "English",
                      target_language: str = 'Chinese',
                      translate_style: str = "Normal Style",
                      pages: Optional[int] = None):

        self.book = self.pdf_parser.parse_pdf(input_file, pages)

        for page_idx, page in enumerate(self.book.pages):
            for content_idx, content in enumerate(page.contents):
                # Translate content.original
                translation, status = self.translate_chain.run(
                    content, source_language, target_language, translate_style)
                # Update the content in self.book.pages directly
                self.book.pages[page_idx].contents[content_idx].set_translation(translation, status)

        return self.writer.save_translated_book(self.book, output_file_format)
@@ -0,0 +1,54 @@
from typing import Tuple

from langchain.llms import ChatGLM
from langchain.chains import LLMChain

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

from utils import LOG

class TranslationChain:
    def __init__(self, model_name: str = "chatglm2-6b", verbose: bool = True):

        # The translation instruction is always carried by the System role
        template = (
            """You are a translation expert, proficient in various languages. \n
            Translates {source_language} to {target_language}. \n
            Speak like {translate_style}."""
        )
        system_message_prompt = SystemMessagePromptTemplate.from_template(template)

        # The text to translate is provided by the Human role
        human_template = "{text}"
        human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

        # Build the ChatPromptTemplate from the System and Human prompt templates
        chat_prompt_template = ChatPromptTemplate.from_messages(
            [system_message_prompt, human_message_prompt]
        )

        # The actual model is determined by the locally served endpoint
        chat = ChatGLM(
            endpoint_url="http://127.0.0.1:8000",
            max_token=8000
        )

        self.chain = LLMChain(llm=chat, prompt=chat_prompt_template, verbose=verbose)

    def run(self, text: str,
            source_language: str,
            target_language: str,
            translate_style: str) -> Tuple[str, bool]:
        result = ""
        try:
            result = self.chain.run({
                "text": text,
                "source_language": source_language,
                "target_language": target_language,
                "translate_style": translate_style
            })
        except Exception as e:
            LOG.error(f"An error occurred during translation: {e}")
            return result, False
        return result, True
@@ -0,0 +1,29 @@
import yaml

class TranslationConfig:
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(TranslationConfig, cls).__new__(cls)
            cls._instance._config = None
        return cls._instance

    def initialize(self, args):
        with open(args.config_file, "r") as f:
            config = yaml.safe_load(f)

        # Use the argparse Namespace to override values from the config file
        overridden_values = {
            key: value for key, value in vars(args).items() if key in config and value is not None
        }
        config.update(overridden_values)

        # Store the merged config dictionary
        self._instance._config = config

    def __getattr__(self, name):
        # Fall back to the _config dictionary for unknown attributes
        if self._instance._config and name in self._instance._config:
            return self._instance._config[name]
        raise AttributeError(f"'TranslationConfig' object has no attribute '{name}'")
@@ -0,0 +1,114 @@
import os
from reportlab.lib import colors, pagesizes, units
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
)

from book import Book, ContentType
from utils import LOG

class Writer:
    def __init__(self):
        pass

    def save_translated_book(self, book: Book, output_file_format: str):
        LOG.debug(output_file_format)

        if output_file_format.lower() == "pdf":
            output_file_path = self._save_translated_book_pdf(book)
        elif output_file_format.lower() == "markdown":
            output_file_path = self._save_translated_book_markdown(book)
        else:
            LOG.error(f"Unsupported file format: {output_file_format}")
            return ""

        LOG.info(f"Translation finished. File saved to: {output_file_path}")

        return output_file_path


    def _save_translated_book_pdf(self, book: Book, output_file_path: str = None):

        output_file_path = book.pdf_file_path.replace('.pdf', '_translated.pdf')

        LOG.info(f"Exporting to: {output_file_path}")

        # Register a Chinese font
        font_path = "../fonts/simsun.ttc"  # Replace this with the path to your font file
        pdfmetrics.registerFont(TTFont("SimSun", font_path))

        # Create a new ParagraphStyle with the SimSun font
        simsun_style = ParagraphStyle('SimSun', fontName='SimSun', fontSize=12, leading=14)

        # Create a PDF document
        doc = SimpleDocTemplate(output_file_path, pagesize=pagesizes.letter)
        styles = getSampleStyleSheet()
        story = []

        # Iterate over the pages and contents
        for page in book.pages:
            for content in page.contents:
                if content.status:
                    if content.content_type == ContentType.TEXT:
                        # Add translated text to the PDF
                        text = content.translation
                        para = Paragraph(text, simsun_style)
                        story.append(para)

                    elif content.content_type == ContentType.TABLE:
                        # Add a table to the PDF
                        table = content.translation
                        table_style = TableStyle([
                            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                            ('FONTNAME', (0, 0), (-1, 0), 'SimSun'),  # Use "SimSun" for the header row
                            ('FONTSIZE', (0, 0), (-1, 0), 14),
                            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                            ('FONTNAME', (0, 1), (-1, -1), 'SimSun'),  # Use "SimSun" for the table body
                            ('GRID', (0, 0), (-1, -1), 1, colors.black)
                        ])
                        pdf_table = Table(table.values.tolist())
                        pdf_table.setStyle(table_style)
                        story.append(pdf_table)
            # Add a page break after each page except the last one
            if page != book.pages[-1]:
                story.append(PageBreak())

        # Save the translated book as a new PDF file
        doc.build(story)
        return output_file_path


    def _save_translated_book_markdown(self, book: Book, output_file_path: str = None):
        output_file_path = book.pdf_file_path.replace('.pdf', '_translated.md')

        LOG.info(f"Exporting to: {output_file_path}")
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            # Iterate over the pages and contents
            for page in book.pages:
                for content in page.contents:
                    if content.status:
                        if content.content_type == ContentType.TEXT:
                            # Add translated text to the Markdown file
                            text = content.translation
                            output_file.write(text + '\n\n')

                        elif content.content_type == ContentType.TABLE:
                            # Add a table to the Markdown file
                            table = content.translation
                            header = '| ' + ' | '.join(str(column) for column in table.columns) + ' |' + '\n'
                            separator = '| ' + ' | '.join(['---'] * len(table.columns)) + ' |' + '\n'
                            body = '\n'.join(['| ' + ' | '.join(str(cell) for cell in row) + ' |' for row in table.values.tolist()]) + '\n\n'
                            output_file.write(header + separator + body)

                # Add a page break (horizontal rule) after each page except the last one
                if page != book.pages[-1]:
                    output_file.write('---\n\n')

        return output_file_path
@@ -0,0 +1,2 @@
from .argument_parser import ArgumentParser
from .logger import LOG
@@ -0,0 +1,15 @@
import argparse

class ArgumentParser:
    def __init__(self):
        self.parser = argparse.ArgumentParser(description='A translation tool that supports translations in any language pair.')
        self.parser.add_argument('--config_file', type=str, default='config.yaml', help='Configuration file with model and API settings.')
        self.parser.add_argument('--model_name', type=str, help='Name of the Large Language Model.')
        self.parser.add_argument('--input_file', type=str, help='PDF file to translate.')
        self.parser.add_argument('--output_file_format', type=str, help='The file format of the translated book. Currently PDF and Markdown are supported.')
        self.parser.add_argument('--source_language', type=str, help='The language of the original book to be translated.')
        self.parser.add_argument('--target_language', type=str, help='The target language for translating the original book.')

    def parse_arguments(self):
        args = self.parser.parse_args()
        return args
@@ -0,0 +1,32 @@
from loguru import logger
import os
import sys

LOG_FILE = "translation.log"
ROTATION_TIME = "02:00"

class Logger:
    def __init__(self, name="translation", log_dir="logs", debug=False):
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        log_file_path = os.path.join(log_dir, LOG_FILE)

        # Remove the default loguru handler
        logger.remove()

        # Add a console handler with a specific log level
        level = "DEBUG" if debug else "INFO"
        logger.add(sys.stdout, level=level)
        # Add a file handler with a specific log level and timed rotation
        logger.add(log_file_path, rotation=ROTATION_TIME, level="DEBUG")
        self.logger = logger

LOG = Logger(debug=True).logger

if __name__ == "__main__":
    log = Logger().logger

    log.debug("This is a debug message.")
    log.info("This is an info message.")
    log.warning("This is a warning message.")
    log.error("This is an error message.")
549 changes: 549 additions & 0 deletions selected_homework/openai-translator_v4/openai_api_demo/api_server.py

Large diffs are not rendered by default.

@@ -0,0 +1,44 @@
version: "3.6"

services:
  glm3_api:
    image: python:3.10.13-slim
    restart: unless-stopped
    working_dir: /glm3
    container_name: glm3_api
    env_file: ./.env
    networks:
      - v_glm3
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    environment:
      - MODEL_PATH=/models/chatglm3-6b
      - EMBEDDING_PATH=/models/bge-large-zh-v1.5
      - TZ=Asia/Shanghai
      - PYTHONDONTWRITEBYTECODE=1
      - PYTHONUNBUFFERED=1
      - DOCKER=True
    ports:
      - 8100:8000
    volumes:
      - ./:/glm3
      - ${LOCAL_MODEL_PATH}:/models/chatglm3-6b
      - ${LOCAL_EMBEDDING_MODEL_PATH}:/models/bge-large-zh-v1.5
    command:
      - sh
      - -c
      - |
        sed -i s/deb.debian.org/mirrors.tencentyun.com/g /etc/apt/sources.list
        sed -i s/security.debian.org/mirrors.tencentyun.com/g /etc/apt/sources.list
        apt-get update
        python -m pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple --upgrade pip
        pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
        python api_server.py
networks:
  v_glm3:
    driver: bridge
@@ -0,0 +1,55 @@
"""
This script is designed for interacting with a local GLM3 AI model using the `ChatGLM3` class
from the `langchain_community` library. It facilitates continuous dialogue with the GLM3 model.
1. Start the local model service: before running this script, execute the `api_server.py` script
to start the GLM3 model's service.
2. Run the script: it initializes the LLMChain object and obtains AI responses,
allowing the user to input questions and receive AI answers.
3. This demo does not support streaming.
"""
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage
from langchain_community.llms.chatglm3 import ChatGLM3


def initialize_llm_chain(messages: list):
    template = "{input}"
    prompt = PromptTemplate.from_template(template)

    endpoint_url = "http://127.0.0.1:8000/v1/chat/completions"
    llm = ChatGLM3(
        endpoint_url=endpoint_url,
        max_tokens=4096,
        prefix_messages=messages,
        top_p=0.9
    )
    return LLMChain(prompt=prompt, llm=llm)


def get_ai_response(llm_chain, user_message):
    ai_response = llm_chain.invoke({"input": user_message})
    return ai_response


def continuous_conversation():
    messages = [
        SystemMessage(content="You are an intelligent AI assistant, named ChatGLM3."),
    ]
    while True:
        user_input = input("Human (or 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        llm_chain = initialize_llm_chain(messages=messages)
        ai_response = get_ai_response(llm_chain, user_input)
        print("ChatGLM3: ", ai_response["text"])
        messages += [
            HumanMessage(content=user_input),
            AIMessage(content=ai_response["text"]),
        ]


if __name__ == "__main__":
    continuous_conversation()
@@ -0,0 +1,99 @@
"""
This script is an example of using the OpenAI API to create various interactions with a ChatGLM3 model.
It includes functions to:
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to tell a short story.
3. Retrieve and print embeddings for a given text input.
Each function demonstrates a different aspect of the API's capabilities, showcasing how to make requests
and handle responses.
"""

from openai import OpenAI

base_url = "http://127.0.0.1:8000/v1/"
client = OpenAI(api_key="EMPTY", base_url=base_url)


def function_chat():
    messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    },
                    "required": ["location"],
                },
            },
        }
    ]

    response = client.chat.completions.create(
        model="chatglm3-6b",
        messages=messages,
        tools=tools,
        tool_choice="auto",
    )
    if response:
        content = response.choices[0].message.content
        print(content)
    else:
        print("Error:", response.status_code)


def simple_chat(use_stream=True):
    messages = [
        {
            "role": "system",
            "content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's "
                       "instructions carefully. Respond using markdown.",
        },
        {
            "role": "user",
            "content": "你好,请你用生动的话语给我讲一个小故事吧"
        }
    ]
    response = client.chat.completions.create(
        model="chatglm3-6b",
        messages=messages,
        stream=use_stream,
        max_tokens=256,
        temperature=0.8,
        presence_penalty=1.1,
        top_p=0.8)
    if response:
        if use_stream:
            for chunk in response:
                print(chunk.choices[0].delta.content)
        else:
            content = response.choices[0].message.content
            print(content)
    else:
        print("Error:", response.status_code)


def embedding():
    response = client.embeddings.create(
        model="bge-large-zh-1.5",
        input=["你好,给我讲一个故事,大概100字"],
    )
    embeddings = response.data[0].embedding
    print("Embedding complete. Dimension:", len(embeddings))


if __name__ == "__main__":
    simple_chat(use_stream=False)
    simple_chat(use_stream=True)
    embedding()
    function_chat()
@@ -0,0 +1,34 @@

"""
Description: You can customize the developed langchain tool overview information here,
just like the sample code already given in this script.
"""


tool_param_start_with = "```python\ntool_call"


""" Fill this dictionary with the mapping from tool class names to tool classes that you defined.
Like:
    from tools.Calculator import Calculator
    tool_class = {"Calculator": Calculator, ...}
It is required that your customized tool class must define the format for the langchain tool
and implement the parameter verification function in the class:
    parameter_validation(self, para: str) -> bool
Tool class definition reference: ChatGLM3/langchain_demo/tools.
"""
tool_class = {}


""" Describe your tool names and parameters in this dictionary.
Like:
    tool_def = [
        {"name": "Calculator",
         "description": "A calculator for solving math problems",
         "parameters": {"type": "object", "properties": {"symbol": {"description": "the math expression to evaluate"}}, "required": []}
        },...
    ]
"""
tool_def = []
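
For illustration only (not part of this PR's diff): a hypothetical Calculator tool filled into the registration format the docstrings above describe. The class name, fields, and validation logic are assumptions; only the `parameter_validation(self, para: str) -> bool` contract comes from the docstring.

```python
# Hypothetical tool registration sketch, under the contract described above.
from langchain.tools import BaseTool

class Calculator(BaseTool):
    name: str = "Calculator"
    description: str = "A calculator for solving math problems"

    def _run(self, symbol: str) -> str:
        # Evaluate a simple arithmetic expression; a real tool should use a
        # safe expression parser instead of eval().
        return str(eval(symbol))

    def parameter_validation(self, para: str) -> bool:
        # Accept only characters that can appear in a basic arithmetic expression.
        return bool(para) and all(c in "0123456789+-*/(). " for c in para)

tool_class = {"Calculator": Calculator}
tool_def = [
    {"name": "Calculator",
     "description": "A calculator for solving math problems",
     "parameters": {"type": "object",
                    "properties": {"symbol": {"description": "the math expression to evaluate"}},
                    "required": []}}
]
```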
192 changes: 192 additions & 0 deletions selected_homework/openai-translator_v4/openai_api_demo/utils.py
@@ -0,0 +1,192 @@
import gc
import json
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers.generation.logits_process import LogitsProcessor
from typing import Union, Tuple


class InvalidScoreLogitsProcessor(LogitsProcessor):
    def __call__(
            self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            scores.zero_()
            scores[..., 5] = 5e4
        return scores


def process_response(output: str, use_tool: bool = False) -> Union[str, dict]:
    content = ""
    for response in output.split("<|assistant|>"):
        metadata, content = response.split("\n", maxsplit=1)
        if not metadata.strip():
            content = content.strip()
            content = content.replace("[[训练时间]]", "2023年")
        else:
            if use_tool:
                content = "\n".join(content.split("\n")[1:-1])

                # eval(content) below expects a tool_call(...) expression from
                # the model; this local function captures its keyword arguments.
                def tool_call(**kwargs):
                    return kwargs

                parameters = eval(content)
                content = {
                    "name": metadata.strip(),
                    "arguments": json.dumps(parameters, ensure_ascii=False)
                }
            else:
                content = {
                    "name": metadata.strip(),
                    "content": content
                }
    return content


@torch.inference_mode()
def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
    messages = params["messages"]
    tools = params["tools"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    max_new_tokens = int(params.get("max_tokens", 256))
    echo = params.get("echo", True)
    messages = process_chatglm_messages(messages, tools=tools)
    query, role = messages[-1]["content"], messages[-1]["role"]

    inputs = tokenizer.build_chat_input(query, history=messages[:-1], role=role)
    inputs = inputs.to(model.device)
    input_echo_len = len(inputs["input_ids"][0])

    if input_echo_len >= model.config.seq_length:
        print(f"Input length larger than {model.config.seq_length}")

    eos_token_id = [
        tokenizer.eos_token_id,
        tokenizer.get_command("<|user|>"),
        tokenizer.get_command("<|observation|>")
    ]

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True if temperature > 1e-5 else False,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "logits_processor": [InvalidScoreLogitsProcessor()],
    }
    if temperature > 1e-5:
        gen_kwargs["temperature"] = temperature

    total_len = 0
    for total_ids in model.stream_generate(**inputs, eos_token_id=eos_token_id, **gen_kwargs):
        total_ids = total_ids.tolist()[0]
        total_len = len(total_ids)
        if echo:
            output_ids = total_ids[:-1]
        else:
            output_ids = total_ids[input_echo_len:-1]

        response = tokenizer.decode(output_ids)
        if response and response[-1] != "�":
            response, stop_found = apply_stopping_strings(response, ["<|observation|>"])

            yield {
                "text": response,
                "usage": {
                    "prompt_tokens": input_echo_len,
                    "completion_tokens": total_len - input_echo_len,
                    "total_tokens": total_len,
                },
                "finish_reason": "function_call" if stop_found else None,
            }

            if stop_found:
                break

    # Only the last stream result contains finish_reason; we set it to "stop"
    ret = {
        "text": response,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": total_len - input_echo_len,
            "total_tokens": total_len,
        },
        "finish_reason": "stop",
    }
    yield ret

    gc.collect()
    torch.cuda.empty_cache()


def process_chatglm_messages(messages, tools=None):
    _messages = messages
    messages = []
    msg_has_sys = False
    if tools:
        messages.append(
            {
                "role": "system",
                "content": "Answer the following questions as best as you can. You have access to the following tools:",
                "tools": tools
            }
        )
        msg_has_sys = True

    for m in _messages:
        role, content, func_call = m.role, m.content, m.function_call
        if role == "function":
            messages.append(
                {
                    "role": "observation",
                    "content": content
                }
            )

        elif role == "assistant" and func_call is not None:
            for response in content.split("<|assistant|>"):
                metadata, sub_content = response.split("\n", maxsplit=1)
                messages.append(
                    {
                        "role": role,
                        "metadata": metadata,
                        "content": sub_content.strip()
                    }
                )
        else:
            if role == "system" and msg_has_sys:
                msg_has_sys = False
                continue
            messages.append({"role": role, "content": content})
    return messages


def generate_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
    # Drain the stream generator and return the final (complete) result
    response = None
    for response in generate_stream_chatglm3(model, tokenizer, params):
        pass
    return response


def apply_stopping_strings(reply, stop_strings) -> Tuple[str, bool]:
    stop_found = False
    for string in stop_strings:
        idx = reply.find(string)
        if idx != -1:
            reply = reply[:idx]
            stop_found = True
            break

    if not stop_found:
        # If something like "\nYo" is generated just before "\nYou:" is completed, trim it
        for string in stop_strings:
            for j in range(len(string) - 1, 0, -1):
                if reply[-j:] == string[:j]:
                    reply = reply[:-j]
                    break
            else:
                continue

            break

    return reply, stop_found
@@ -0,0 +1,100 @@
"""
This script is an example of using the Zhipu API to create various interactions with a ChatGLM3 model. It includes
functions to:
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to introduce itself.
3. Retrieve and print embeddings for a given text input.
Each function demonstrates a different aspect of the API's capabilities,
showcasing how to make requests and handle responses.
Note: Make sure your Zhipu API key is set as an environment variable formatted as xxx.xxx
(it is only checked for format here; a real key is not needed).
"""

from zhipuai import ZhipuAI

base_url = "http://127.0.0.1:8000/v1/"
client = ZhipuAI(api_key="EMP.TY", base_url=base_url)


def function_chat():
    messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    },
                    "required": ["location"],
                },
            },
        }
    ]

    response = client.chat.completions.create(
        model="chatglm3_6b",
        messages=messages,
        tools=tools,
        tool_choice="auto",
    )
    if response:
        content = response.choices[0].message.content
        print(content)
    else:
        print("Error:", response.status_code)


def simple_chat(use_stream=True):
    messages = [
        {
            "role": "system",
            "content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow "
                       "the user's instructions carefully. Respond using markdown.",
        },
        {
            "role": "user",
            "content": "你好,请你介绍一下chatglm3-6b这个模型"
        }
    ]
    response = client.chat.completions.create(
        model="chatglm3_6b",
        messages=messages,
        stream=use_stream,
        max_tokens=256,
        temperature=0.8,
        top_p=0.8)
    if response:
        if use_stream:
            for chunk in response:
                print(chunk.choices[0].delta.content)
        else:
            content = response.choices[0].message.content
            print(content)
    else:
        print("Error:", response.status_code)


def embedding():
    response = client.embeddings.create(
        model="bge-large-zh-1.5",
        input=["ChatGLM3-6B 是一个大型的中英双语模型。"],
    )
    embeddings = response.data[0].embedding
    print("Embedding complete. Dimension:", len(embeddings))


if __name__ == "__main__":
    simple_chat(use_stream=False)
    simple_chat(use_stream=True)
    embedding()
    function_chat()