diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..48e7b8d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/venv
+/__pycache__
+/.idea
+*/__pycache__
\ No newline at end of file
diff --git a/README.md b/README.md
index 950ba8a..b765afb 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,151 @@
 # PassageSummary
-PassageSummary is a GPT-based API that provides passage summarization, topic extraction, and several other summarization features
+PassageSummary is a GPT-based API that provides passage summarization, topic extraction, and several other summarization features.
+
+# Constraints
+
+ApiService provides basic text caching: uploading the exact same text again reuses the cached entry. When ApiService returns an error, the response carries two fields, `errno` and `message`, holding a globally unique error code and the error message respectively.
+
+All ApiService responses are JSON. The `token` parameter is the caller's OpenAI API key; the service queries the `text-davinci-003` model.
+
+# Api
+
+Get ApiService's working status:
+
+```
+GET /
+{
+    "message":"This server is working normally."
+}
+```
+
+Upload a passage:
+
+```
+POST /passage
+Request:
+{
+    "content":"This is a content",
+    "token":"sk-ss"
+}
+
+Response:
+On success:
+{
+    "hash":"d622ac64268ce69eef0f3dc8277d06a9182f71c7"
+}
+On error:
+{
+    "errno":1,
+    "message":"Text conversion failed"
+}
+```
+
+Ask a question about a passage:
+
+```
+POST /passage/{hash}
+Request:
+{
+    "action":"ask",
+    "param":"What does this passage mainly describe?",
+    "token":"sk-xxx"
+}
+
+Response:
+On success:
+{
+    "content":"This passage mainly describes ...."
+}
+On error:
+{
+    "errno":10001,
+    "message":"The file for this hash does not exist, or it could not be read"
+}
+```
+
+Get the topics of a passage:
+
+```
+POST /passage/{hash}
+Request:
+{
+    "action":"topic",
+    "token":"sk-xxx"
+}
+
+Response:
+On success:
+{
+    "topics":
+    [
+        {
+            "topic":"Genshin Impact",
+            "relative":"0.2"
+        }
+    ]
+}
+//topic is the extracted topic; relative is its relevance to the passage
+On error:
+{
+    "errno":10001,
+    "message":"The file for this hash does not exist, or it could not be read"
+}
+```
+
+Rate how closely a passage relates to given topics:
+
+```
+POST /passage/{hash}
+Request:
+{
+    "action":"getTopicRelative",
+    "param":"Genshin Impact,Genshin fans",
+    "token":"sk-xxx"
+}
+
+Response:
+On success:
+{
+    "topics":
+    [
+        {
+            "topic":"Genshin Impact",
+            "relative":"0.2"
+        },
+        {
+            "topic":"Genshin fans",
+            "relative":"0.9"
+        }
+    ]
+}
+//topic is the given topic; relative is its relevance to the passage
+On error:
+{
+    "errno":10001,
+    "message":"The file for this hash does not exist, or it could not be read"
+}
+```
+
+Summarize a passage:
+
+```
+POST /passage/{hash}
+Request:
+{
+    "action":"summary",
+    "token":"sk-xxx"
+}
+
+Response:
+On success:
+{
+    "content":"This passage tells the story of a Genshin fan turning into a Star Rail fan."
+}
+
+On error:
+{
+    "errno":10001,
+    "message":"The file for this hash does not exist, or it could not be read"
+}
+```
+
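Below is a minimal end-to-end sketch of the flow the README describes, using Python's `requests` package against a local instance; the base URL matches the unit tests in `unitTests/`, while the token value, the uploaded text, and the printed output are placeholders rather than real credentials or captured results.

```python
import requests

BASE = "http://127.0.0.1:8000"
TOKEN = "sk-xxx"  # placeholder; supply a real OpenAI API key

# Upload a passage; the service responds with a cache hash.
resp = requests.post(f"{BASE}/passage", json={
    "content": "This is a content",
    "token": TOKEN,
})
passage_hash = resp.json()["hash"]

# Ask for a summary of the cached passage.
resp = requests.post(f"{BASE}/passage/{passage_hash}", json={
    "action": "summary",
    "token": TOKEN,
})
print(resp.json())  # {"content": "..."} on success, {"errno": ..., "message": ...} on error
```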
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..088f39c
--- /dev/null
+++ b/main.py
@@ -0,0 +1,32 @@
+import os
+
+from fastapi import FastAPI
+
+from models import request
+from passages import passageAnalysis, savePassage
+
+# On startup, make sure the cache directory for uploaded passages exists.
+if not os.path.exists('cache'):
+    os.mkdir('cache')
+
+app = FastAPI()
+
+
+@app.get("/")
+async def root():
+    return {"message": "This server is working normally."}
+
+
+@app.post("/passage")
+async def upload_passage(req: request.SavePassageRequest):
+    return savePassage.save_passage(req.content, req.token)
+
+
+@app.post("/passage/{hash}")
+async def action(hash: str, req: request.PassageRequest):
+    if not os.path.exists(os.path.join('cache', hash)):
+        return {
+            "errno": 10001,
+            "message": "The file for this hash does not exist, or it could not be read"
+        }
+    return passageAnalysis.dispatch_action(req, hash)
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/request.py b/models/request.py
new file mode 100644
index 0000000..8af5360
--- /dev/null
+++ b/models/request.py
@@ -0,0 +1,12 @@
+from pydantic import BaseModel
+
+
+class SavePassageRequest(BaseModel):
+    content: str
+    token: str
+
+
+class PassageRequest(BaseModel):
+    action: str
+    param: str | None = None  # optional: "topic" and "summary" requests carry no param
+    token: str
diff --git a/passages/__init__.py b/passages/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/passages/passageAnalysis.py b/passages/passageAnalysis.py
new file mode 100644
index 0000000..aec6754
--- /dev/null
+++ b/passages/passageAnalysis.py
@@ -0,0 +1,129 @@
+import os
+
+from langchain import OpenAI
+from llama_index import (
+    GPTSimpleVectorIndex,
+    PromptHelper,
+    LLMPredictor,
+    QuestionAnswerPrompt,
+    ServiceContext
+)
+
+from models import request
+
+
+def dispatch_action(req: request.PassageRequest, hash: str):
+    path = os.path.join('cache', hash)
+    vector = os.path.join(path, 'index.json')
+    match req.action:
+        case "ask":
+            return ask(vector, str(req.param), req.token)
+        case "topic":
+            return get_topics(vector, req.token)
+        case "getTopicRelative":
+            return get_topic_relative(vector, str(req.param), req.token)
+        case "summary":
+            return summary(vector, req.token)
+        case _:
+            # Unknown action: report an error rather than failing silently.
+            return {
+                "errno": 10004,
+                "message": "Unknown action"
+            }
+
+
+def ask(vector: str, ask_question: str, token: str):
+    response = common_ask(vector, ask_question, token)
+    if response.response is None:
+        return {
+            "errno": 10002,
+            "message": "GPT returned no response; please check that the token is valid!"
+        }
+    return {
+        "content": response.response
+    }
+
+
+def summary(vector: str, token: str):
+    response = common_ask(vector, "Summarize this passage in Chinese", token)
+    if response.response is None:
+        return {
+            "errno": 10002,
+            "message": "GPT returned no response; please check that the token is valid!"
+        }
+    return {
+        "content": response.response
+    }
+
+
+def get_topics(vector: str, token: str):
+    response = common_ask(vector, "Analyze this passage and extract its topics or key words. "
+                                  "Return {{xxx#relative}}, where 'xxx' is a topic or key word of the passage and "
+                                  "'relative' is a number between 0 and 1 giving the closeness of "
+                                  "the topic or key word to the text. "
+                                  "For example, return '{{Minecraft#0.2}},{{Game#0.8}}'", token)
+    return get_topic_with_relative(response)
+
+
+def get_topic_relative(vector: str, key_word: str, token: str):
+    response = common_ask(vector, "Analyze this passage and rate how closely the given topics or key words "
+                                  "relate to it. "
+                                  "Return {{xxx#relative}}, where 'xxx' is a given topic or key word and "
+                                  "'relative' is a number between 0 and 1 giving the closeness of "
+                                  "the topic or key word to the text. "
+                                  "For example, given 'TopicA,TopicB' return "
+                                  "'{{TopicA#0.2}},{{TopicB#0.8}}'. The given keywords are: " + key_word, token)
+    return get_topic_with_relative(response)
+
+
+def get_topic_with_relative(response):
+    if response.response is None:
+        return {
+            "errno": 10002,
+            "message": "GPT returned no response; please check that the token is valid!"
+        }
+    topics = []
+    for part in response.response.split(","):
+        cts = part.split("#")
+        if len(cts) != 2:
+            return {
+                "errno": 10003,
+                "message": "GPT returned an invalid response; please retry or drop the request."
+            }
+        topics.append({
+            "topic": str(cts[0]).replace("{{", "").replace("\n", "").replace("{", ""),
+            "relative": str(cts[1]).replace("}}", "").replace("}", "")
+        })
+    return {
+        "topics": topics
+    }
+
+
+def common_ask(vector: str, ask_question: str, token: str,
+               prompt: str = "Please answer the question using the context information"):
+    llm_predictor, prompt_helper = prepare_llama_para(token)
+
+    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
+
+    qa_prompt_impl = (
+        "We have provided context information below. \n"
+        "---------------------\n"
+        "{context_str}"
+        "\n---------------------\n"
+        f"{prompt}: {{query_str}}\n"
+    )
+    qa_prompt = QuestionAnswerPrompt(qa_prompt_impl)
+    index = GPTSimpleVectorIndex.load_from_disk(vector, service_context=service_context)
+    response = index.query(ask_question, response_mode="compact", text_qa_template=qa_prompt)
+    return response
+
+
+def prepare_llama_para(token):
+    os.environ["OPENAI_API_KEY"] = token
+    max_input_size = 4096
+    num_outputs = 1024
+    max_chunk_overlap = 20
+    chunk_size_limit = 1000
+    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=num_outputs))
+    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
+    return llm_predictor, prompt_helper
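The topic actions above depend on the model honouring the `{{topic#relative}}` reply format requested in the prompts. Here is a standalone sketch of that parsing contract, runnable without the service; the sample reply is hand-written to match the format the prompts ask for, not captured model output.

```python
# Illustration only: mirrors the {{topic#relative}} parsing in get_topic_with_relative.
raw_reply = "{{Minecraft#0.2}},{{Game#0.8}}"  # example reply in the requested format

topics = []
for part in raw_reply.split(","):
    topic, relative = part.split("#")
    topics.append({
        "topic": topic.replace("{{", "").replace("{", "").replace("\n", ""),
        "relative": relative.replace("}}", "").replace("}", ""),
    })

print(topics)  # [{'topic': 'Minecraft', 'relative': '0.2'}, {'topic': 'Game', 'relative': '0.8'}]
```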
diff --git a/passages/savePassage.py b/passages/savePassage.py
new file mode 100644
index 0000000..669893f
--- /dev/null
+++ b/passages/savePassage.py
@@ -0,0 +1,42 @@
+import hashlib
+import os
+
+from llama_index import (
+    GPTSimpleVectorIndex,
+    SimpleDirectoryReader,
+    ServiceContext
+)
+
+from passages import passageAnalysis
+
+
+def save_passage(content: str, token: str):
+    # Hash the passage text itself so identical uploads share one cache entry.
+    name = hashlib.sha1(content.encode("utf-8")).hexdigest()
+    res = {
+        "hash": name
+    }
+
+    # Cache permanently: if this passage was already indexed, reuse the existing entry.
+    dir_path = os.path.join('cache', name)
+    if not os.path.exists(dir_path):
+        os.mkdir(dir_path)
+    else:
+        return res
+    file_name = os.path.join(dir_path, 'raw')
+    index_name = os.path.join(dir_path, 'index.json')
+    with open(file_name, "w", encoding="utf-8") as file:
+        file.write(content)
+    llm_predictor, prompt_helper = passageAnalysis.prepare_llama_para(token)
+    documents = SimpleDirectoryReader(dir_path).load_data()
+    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
+    index = GPTSimpleVectorIndex.from_documents(
+        documents, service_context=service_context
+    )
+    index.save_to_disk(index_name)
+    return res
+
+
+def get_passage_content(hash: str):
+    with open(os.path.join('cache', hash, 'raw'), "r", encoding="utf-8") as file:
+        return file.read()
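A quick standalone check of the cache-key scheme used by `save_passage` above: hashing the passage text itself makes the key deterministic, so re-uploading identical text maps to the same cache directory. The helper name `cache_key` is illustrative only.

```python
import hashlib

def cache_key(content: str) -> str:
    # Same scheme as save_passage: SHA-1 over the UTF-8 bytes of the text.
    return hashlib.sha1(content.encode("utf-8")).hexdigest()

a = cache_key("This is a content")
b = cache_key("This is a content")
print(a)       # 40-character hex digest, stable across runs and processes
assert a == b  # identical text always maps to the same cache directory
```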
diff --git a/unitTests/fileAnalysis.http b/unitTests/fileAnalysis.http
new file mode 100644
index 0000000..e539ca0
--- /dev/null
+++ b/unitTests/fileAnalysis.http
@@ -0,0 +1,44 @@
+### Ask a question about the passage
+POST http://127.0.0.1:8000/passage/{{hash}}
+Accept: application/json
+Content-Type: application/json
+
+{
+  "action":"ask",
+  "param":"Which passage written by Lu Xun is mentioned?",
+  "token":"{{token}}"
+}
+
+
+### Get the topics
+POST http://127.0.0.1:8000/passage/{{hash}}
+Accept: application/json
+Content-Type: application/json
+
+{
+  "action":"topic",
+  "token":"{{token}}"
+}
+
+
+### Rate the passage's relevance to given topics
+POST http://127.0.0.1:8000/passage/{{hash}}
+Accept: application/json
+Content-Type: application/json
+
+{
+  "action":"getTopicRelative",
+  "param": "游戏,鲁迅,教育",
+  "token":"{{token}}"
+}
+
+
+### Summarize the passage
+POST http://127.0.0.1:8000/passage/{{hash}}
+Accept: application/json
+Content-Type: application/json
+
+{
+  "action":"summary",
+  "token":"{{token}}"
+}
\ No newline at end of file
diff --git a/unitTests/fileTest.http b/unitTests/fileTest.http
new file mode 100644
index 0000000..b79ef7c
--- /dev/null
+++ b/unitTests/fileTest.http
@@ -0,0 +1,13 @@
+### Check that the server is running normally
+GET http://127.0.0.1:8000
+Accept: application/json
+
+### Upload a passage
+POST http://127.0.0.1:8000/passage
+Accept: application/json
+Content-Type: application/json
+
+{
+  "content":"上过中学的人,应该都听说过这么一个说法,叫中学生有三怕,“一怕文言文,二怕写作文,三怕周树人。”可以说,鲁迅的文章是所有上过中学的人的噩梦。那么为什么鲁迅的文章会那么难懂呢?上过中学的人,应该都听说过这么一个说法,叫中学生有三怕,“一怕文言文,二怕写作文,三怕周树人。”可以说,鲁迅的文章是所有上过中学的人的噩梦。那么为什么鲁迅的文章会那么难懂呢?上过中学的人,应该都听说过这么一个说法,叫中学生有三怕,“一怕文言文,二怕写作文,三怕周树人。”可以说,鲁迅的文章是所有上过中学的人的噩梦。那么为什么鲁迅的文章会那么难懂呢?这篇小说叫《药》。《药》讲了一个什么故事呢?准确的说它将了两个平行的故事。一个是华家的故事,华老栓有个儿子华小栓,华小栓得了肺结核,在晚清时期,这个病就是绝症,所以华老栓四处求医问药都没有效果。然后,有一天他就得了一个偏方,说用人血沾馒头吃了就会好了,所以,他就花重金从刽子手康大叔哪里买了个人血馒头,然后给华小栓趁热吃了。但是小栓还是死了。这个故事体现的是华家的麻木、迷信、愚笨,以及其中带着的一点自私和残忍。另外一个故事是夏家的故事,这是上一个故事背后的潜故事,就是它是在众人的讲述中出现的。大意是夏家的儿子夏瑜要造清廷的反,结果被清廷抓住,并且在菜市口被斩首了,他的血被众人沾了人血馒头。结合上一个故事,你就会知道,华小栓吃的人血馒头的血就是夏瑜的。要理解两个故事背后的寓意,需要看明白鲁迅在这个小说里玩的两个文字游戏。第一个是这两家的姓氏,一家姓华,一家姓夏,连起来就是华夏,我们知道,从古代起,这个词就是用来指代中国人的。而在这个故事中,华家却“吃”了夏家的血,这是在象征什么,应该就不言自明了吧?要理解两个故事背后的寓意,需要看明白鲁迅在这个小说里玩的两个文字游戏。第一个是这两家的姓氏,一家姓华,一家姓夏,连起来就是华夏,我们知道,从古代起,这个词就是用来指代中国人的。而在这个故事中,华家却“吃”了夏家的血,这是在象征什么,应该就不言自明了吧?要理解两个故事背后的寓意,需要看明白鲁迅在这个小说里玩的两个文字游戏。第一个是这两家的姓氏,一家姓华,一家姓夏,连起来就是华夏,我们知道,从古代起,这个词就是用来指代中国人的。而在这个故事中,华家却“吃”了夏家的血,这是在象征什么,应该就不言自明了吧?网上曾经流行一句话,我很喜欢,叫“为众人抱火的,不可使他冻毙于风雪”,然而在晚清时期,像秋瑾这样为众人抱火的,却不仅冻毙于风雪,而且使她冻毙的可能恰恰就是她为之抱火的人。网上曾经流行一句话,我很喜欢,叫“为众人抱火的,不可使他冻毙于风雪”,然而在晚清时期,像秋瑾这样为众人抱火的,却不仅冻毙于风雪,而且使她冻毙的可能恰恰就是她为之抱火的人。用我们今天社会上经常出现的一个小例子,就很好理解了。",
+  "token": "{{token}}"
+}
\ No newline at end of file
diff --git a/unitTests/http-client.env.json b/unitTests/http-client.env.json
new file mode 100644
index 0000000..2905ec0
--- /dev/null
+++ b/unitTests/http-client.env.json
@@ -0,0 +1,5 @@
+{
+  "dev": {
+    "hash": ""
+  }
+}
\ No newline at end of file
diff --git a/unitTests/http-client.private.env.json b/unitTests/http-client.private.env.json
new file mode 100644
index 0000000..df24756
--- /dev/null
+++ b/unitTests/http-client.private.env.json
@@ -0,0 +1,5 @@
+{
+  "dev": {
+    "token": ""
+  }
+}
\ No newline at end of file
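For completeness, one way to launch the service these requests target, assuming `uvicorn` is installed alongside `fastapi`, `langchain`, and `llama-index`; the module path `main:app` refers to the FastAPI instance created in main.py, and the host and port match the URLs in the `.http` files.

```python
# run.py, a minimal launch sketch (the file name is illustrative).
import uvicorn

if __name__ == "__main__":
    # Serve the app on the address the unit tests expect; reload aids local development.
    uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)
```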