diff --git a/docker/yao-knowledge-dev/compose.yml b/docker/yao-knowledge-dev/compose.yml index 31eac0d..03f6ab9 100644 --- a/docker/yao-knowledge-dev/compose.yml +++ b/docker/yao-knowledge-dev/compose.yml @@ -12,7 +12,7 @@ services: - "5080" - --scheme - http - image: semitechnologies/weaviate:1.18.3 + image: semitechnologies/weaviate:latest ports: - 5080:5080 restart: on-failure:0 diff --git a/neo/neo.yml b/neo/neo.yml index 1df105d..b54dc0c 100644 --- a/neo/neo.yml +++ b/neo/neo.yml @@ -18,7 +18,7 @@ prompts: - Answer my questions in Chinese from now on. option: - temperature: 0.8 + temperature: 0.6 allows: - "http://127.0.0.1:8000" diff --git a/scripts/doc.js b/scripts/doc.js index edbd674..ceb8857 100644 --- a/scripts/doc.js +++ b/scripts/doc.js @@ -1,3 +1,8 @@ +/** + * vector database (on Weaviate) + * Will be replaced by the new vector process + */ + /** * Document schema structure */ @@ -581,6 +586,244 @@ function Delete(fingerprint) { return ids; } +/** + * Search the part of file with the input + * yao run scripts.doc.Query '::{"input":"YAO 怎么写关联查询", "distance": 0.2}' 1 2 + * + * @param {*} input + */ +function Query(params, page, pagesize) { + params = params || {}; + page = page ? parseInt(page) : 1; + pagesize = pagesize ? parseInt(pagesize) : 20; + + const input = params.input || ""; + const distance = params.distance || 0.2; + const connector = "openai.text-embedding-ada-002"; + + if (!input || input == "") { + throw new Exception("input is required", 400); + } + + const resp = Process("openai.Embeddings", connector, input); + if (!resp || resp.code || !resp.data) { + const message = resp.message || "openai.Embeddings error"; + const code = resp.code || 500; + throw new Exception(message, code); + } + + const vector = JSON.stringify(resp.data[0].embedding || []); + const offset = page ? (page - 1) * pagesize : 0; + const cfg = setting(); + const url = `${cfg.host}/v1/graphql`; + + let payload = { + query: `{ + Aggregate { + Document ( + nearVector: { + vector: ${vector} + distance: ${distance} + } + ) + { + meta { + count + } + } + } + Get { + Document( + limit: ${pagesize} + offset: ${offset} + nearVector: { + vector: ${vector} + distance: ${distance} + } + ) + { + name + summary + content + path + type + url + part + fingerprint + _additional{ + id + lastUpdateTimeUnix, + distance + } + } + } + }`, + }; + + response = post(url, payload, cfg.key); + if (response && response.code && response.code != 200 && response.message) { + throw new Exception(response.message, response.code); + } + + if (response && response.errors) { + throw new Exception(response.errors[0].message, 500); + } + + let data = response.data || { + Get: { Document: [] }, + Aggregate: { Document: [{ meta: { count: 0 } }] }, + }; + + const meta = + data.Aggregate.Document[0] && data.Aggregate.Document[0].meta + ? data.Aggregate.Document[0].meta + : { count: 0 }; + + let total = meta.count; + let pages = Math.ceil(total / pagesize); + let prev = page > 1 ? page - 1 : 1; + let next = page < pages ? page + 1 : pages; + let items = data.Get.Document || []; + for (let i = 0; i < items.length; i++) { + let item = items[i]; + item.id = item._additional.id; + item.lastUpdateTimeUnix = item._additional.lastUpdateTimeUnix; + item.distance = item._additional.distance; + delete item._additional; + } + + return { + data: items, + next: next, + page: parseInt(page), + pagecnt: pages, + prev: prev, + total: total, + }; +} + +/** + * Search the file of file with the input + * yao run scripts.doc.Search '::{"input":"YAO 怎么写关联查询", "distance": 0.2}' 1 10 + * + * const distance = 0.2; + * const distancePrompts = 2; + * @param {*} input + */ +function Search(params, page, pagesize) { + params = params || {}; + page = page ? parseInt(page) : 1; + pagesize = pagesize ? parseInt(pagesize) : 20; + + const input = params.input || ""; + const distance = params.distance || 0.2; + const connector = "openai.text-embedding-ada-002"; + + if (!input || input == "") { + throw new Exception("input is required", 400); + } + + const resp = Process("openai.Embeddings", connector, input); + if (!resp || resp.code || !resp.data) { + const message = resp.message || "openai.Embeddings error"; + const code = resp.code || 500; + throw new Exception(message, code); + } + + const vector = JSON.stringify(resp.data[0].embedding || []); + const offset = page ? (page - 1) * pagesize : 0; + const cfg = setting(); + const url = `${cfg.host}/v1/graphql`; + + let payload = { + query: `{ + Aggregate { + Document ( + nearVector: { + vector: ${vector} + distance: ${distance} + } + groupBy: ["fingerprint"] + ) + { + meta { + count + } + groupedBy { value path } + } + } + Get { + Document( + limit: ${pagesize} + offset: ${offset} + nearVector: { + vector: ${vector} + distance: ${distance} + } + groupBy: { + path: ["fingerprint"] + groups: ${pagesize} + objectsPerGroup: 9999 + } + ) + { + name + summary + content + path + type + url + part + fingerprint + _additional{ + id + lastUpdateTimeUnix, + distance + group { + groupedBy { value path } + } + } + } + } + }`, + }; + + response = post(url, payload, cfg.key); + if (response && response.code && response.code != 200 && response.message) { + throw new Exception(response.message, response.code); + } + if (response && response.errors) { + throw new Exception(response.errors[0].message, 500); + } + + let data = response.data || { + Get: { Document: [] }, + Aggregate: { Document: [{ meta: { count: 0 } }] }, + }; + + let total = data.Aggregate.Document.length; + let pages = Math.ceil(total / pagesize); + let prev = page > 1 ? page - 1 : 1; + let next = page < pages ? page + 1 : pages; + let items = data.Get.Document || []; + for (let i = 0; i < items.length; i++) { + let item = items[i]; + item.id = item._additional.id; + item.lastUpdateTimeUnix = item._additional.lastUpdateTimeUnix; + item.distance = item._additional.distance; + delete item._additional; + } + + return { + data: items, + next: next, + page: parseInt(page), + pagecnt: pages, + prev: prev, + total: total, + }; +} + /** * Check if schema exists * yao run scripts.doc.SchemaExists diff --git a/scripts/stat.js b/scripts/stat.js deleted file mode 100644 index 64acdbb..0000000 --- a/scripts/stat.js +++ /dev/null @@ -1,94 +0,0 @@ - -/** - * before:data hook - * @param {*} params - * @returns - */ -function BeforeData(params) { - log.Info("[chart] before data hook: %s", JSON.stringify(params)); - return [params]; -} - -/** - * after:data hook - * @param {*} data - * @returns - */ -function AfterData(data) { - log.Info("[chart] after data hook: %s", JSON.stringify(data)); - return data; -} - -/** - * Get Data - * @param {*} params - */ -function Data(params) { - log.Info("[chart] process data query: %s", JSON.stringify(params)); - return { - income: [ - { value: 40300, date: "2022-1-1" }, - { value: 50800, date: "2022-2-1" }, - { value: 31300, date: "2022-3-1" }, - { value: 48800, date: "2022-4-1" }, - { value: 69900, date: "2022-5-1" }, - { value: 37800, date: "2022-6-1" }, - ], - cost: [ - { value: 28100, date: "2022-1-1" }, - { value: 23000, date: "2022-2-1" }, - { value: 29300, date: "2022-3-1" }, - { value: 26700, date: "2022-4-1" }, - { value: 26400, date: "2022-5-1" }, - { value: 31200, date: "2022-6-1" }, - ], - rate: [ - { value: 8.0, date: "2022-1-1" }, - { value: 7.6, date: "2022-2-1" }, - { value: 9.1, date: "2022-3-1" }, - { value: 8.4, date: "2022-4-1" }, - { value: 6.9, date: "2022-5-1" }, - { value: 9.0, date: "2022-6-1" }, - ], - pet_count: 54, - pet_type: 8, - income_monthly: 68900, - doctor_count: 23, - prev_pet_count: { current: 54, prev: 45 }, - prev_pet_type: { current: 8, prev: 13 }, - prev_income_monthly: { current: 68900, prev: 92000 }, - prev_doctor_count: { current: 23, prev: 27 }, - datasource_type: [ - { type: "猫猫", count: 18 }, - { type: "狗狗", count: 6 }, - { type: "其他", count: 3 }, - ], - datasource_status: [ - { status: "已查看", count: 3 }, - { status: "治疗中", count: 12 }, - { status: "已治愈", count: 9 }, - ], - datasource_cost: [ - { name: "毛毛", stay: 3, cost: 2000 }, - { name: "阿布", stay: 6, cost: 4200 }, - { name: "咪咪", stay: 7, cost: 6000 }, - { name: "狗蛋", stay: 1, cost: 1000 }, - ], - }; -} - -/** - * Compute out - * @param {*} field - * @param {*} value - * @param {*} data - * @returns - */ -function Income(field, value, data) { - log.Info( - "[chart] Income Compute: %s", - JSON.stringify({ field: field, value: value, data: data }) - ); - return value; -} - \ No newline at end of file diff --git a/scripts/vector.js b/scripts/vector.js index c606207..03e95a0 100644 --- a/scripts/vector.js +++ b/scripts/vector.js @@ -4,31 +4,17 @@ */ const MaxTokens = 1536; -const distance = 0.2; -const distancePrompts = 2; -const pageSize = 9; /** - * Query content from the vector database + * Match content from the vector database + * yao run scripts.vector.Match '::{"pathname":"/x/Table"}' '::[{"role":"user", "content":"Yao 是什么"}]' + * * @param {*} context * @param {*} messages * @returns */ function Match(context, messages) { - console.log(context, messages); - return [ - { - role: "system", - content: `{"name":"test.pdf", "url":"https://www.google.com"}`, - }, - { - role: "system", - content: ` - - The above content is my knowledge base. - - Please prioritize answering user questions based on my knowledge base provided to you. - `, - }, - ]; + return match(context, messages, 2048); } /** @@ -121,12 +107,12 @@ function Save(payload) { }); // debug - console.log(pages); + // console.log(pages); return { code: 200, message: "ok" }; } /** - * Validate the token size + * Reduce the content size * @param {*} content */ function Reduce(content) { @@ -164,595 +150,75 @@ function ReadFile(file) { } /** - * the schema of the vector database - */ -const DocumentSchema = { - class: "Document", - description: "Used to store documents", - vectorizer: "text2vec-openai", - moduleConfig: { - "text2vec-openai": { model: "ada", modelVersion: "002", type: "text" }, - }, - properties: [ - { - name: "type", - dataType: ["string"], - description: - "The type of the document (e.g. note, ppt, doc, xls, pdf, url, etc.)", - moduleConfig: { - "text2vec-openai": { skip: false, vectorizePropertyName: false }, - }, - }, - { - name: "path", - dataType: ["string"], - description: "the file path of the document", - moduleConfig: { - "text2vec-openai": { skip: true, vectorizePropertyName: false }, - }, - }, - { - name: "user", - dataType: ["string"], - description: "the user of the document", - moduleConfig: { - "text2vec-openai": { skip: true, vectorizePropertyName: false }, - }, - }, - { - name: "url", - dataType: ["string"], - description: "the url of the document", - moduleConfig: { - "text2vec-openai": { skip: true, vectorizePropertyName: false }, - }, - }, - { - name: "summary", - dataType: ["string"], - description: "The summary of the document", - moduleConfig: { - "text2vec-openai": { skip: false, vectorizePropertyName: false }, - }, - }, - { - name: "content", - dataType: ["text"], - description: "The content of the document", - moduleConfig: { - "text2vec-openai": { skip: false, vectorizePropertyName: false }, - }, - }, - ], -}; - -/** - * Create a schema (run when the application setup) - * yao run scripts.vector.SchemaCreate - */ -function SchemaCreate() { - let cfg = setting(); - let url = `${cfg.host}/v1/schema`; - post(url, DocumentSchema, cfg.key); - return "Document"; -} - -/** - * Create a schema (run when the application setup) - * yao run scripts.vector.SchemaDelete - */ -function SchemaDelete() { - let cfg = setting(); - let url = `${cfg.host}/v1/schema/Document`; - let response = http.Delete(url); - if (response.code != 200) { - let errors = response.data.error || response.data; - let message = errors.length > 0 ? errors[0].message : "unknown"; - throw new Exception(message, response.code || 500); - } - - return true; -} - -/** - * Create a schema (run when the application setup) - * yao run scripts.vector.SchemaReset - */ -function SchemaReset() { - SchemaDelete(); - return SchemaCreate(); -} - -/** - * SchemaGet - * yao run scripts.vector.SchemaGet - * @returns - */ -function SchemaGet() { - let cfg = setting(); - let url = `${cfg.host}/v1/schema`; - let data = get(url, {}, cfg.key); - if (data.classes.length < 1) { - throw new Exception("Document not found", 404); - } - return data.classes[0]; -} - -/** - * Make a test - * yao run scripts.vector.Test - */ -function Test() { - console.log("请稍等,这将花费一些时间..."); - let text = testContent(); - let contents = []; - - try { - contents = JSON.parse(text); - } catch (err) { - console.log(err, text); - } - - let ids = []; - contents.forEach((object) => { - let id = Insert(object); - ids.push(id); - }); - return ids; -} - -/** - * Insert Data - * - * yao run scripts.vector.Insert '::{"content": "这是一直测试文档,你需要从这里开始。"}' - * - * @param {*} data - */ -function Insert(data) { - data = data || {}; - if (!data.content) { - throw new Exception("content is required", 400); - } - - let cfg = setting(); - let url = `${cfg.host}/v1/objects?consistency_level=ALL`; - - let properties = {}; - properties.type = data.type || "note"; - properties.path = data.path || ""; - properties.user = data.user || "__public"; - properties.url = data.url || ""; - properties.content = data.content; // required - properties.summary = data.summary || getSummary(data.content); - let res = post(url, { class: "Document", properties: properties }, cfg.key); - return res.id; -} - -/** - * Get Objects - * yao run scripts.vector.Objects - */ -function Objects() { - let cfg = setting(); - let url = `${cfg.host}/v1/objects`; - return get(url, {}, cfg.key); -} - -/** - * Query Data - * yao run scripts.vector.Query 帮我写一份心血管健康研究的报告 张三 - * yao run scripts.vector.Query 帮我写一份心血管健康研究的报告 - * @param {*} input - * @param {*} user + * match the content + * @param {*} context + * @param {*} messages */ -function Query(input, user) { - let vector = getVector(input); - let cfg = setting(); - let url = `${cfg.host}/v1/graphql`; - - let where = `{ - operator: Or, - operands: { - path: ["user"], - operator: Equal, - valueString: "__public" - } - }`; - - if (user) { - where = `{ - operator: Or, - operands: [ - { - path: ["user"], - operator: Equal, - valueString: "${user}" - },{ - path: ["user"], - operator: Equal, - valueString: "__public" - } - ] - }`; - } - - let payload = { - query: `{ - Get { - Document( - limit: 10 - nearVector: { - vector: ${vector} - distance: ${distancePrompts} - } - where: ${where} - ) - { - user - path - type - url - summary - content - _additional{ - id - distance - lastUpdateTimeUnix - } - } - } - }`, - }; - - let response = post(url, payload, cfg.key); - let data = response.data || { Get: { Document: [] } }; - let items = data.Get.Document || []; - for (let i = 0; i < items.length; i++) { - let item = items[i]; - item.id = item._additional.id; - item.lastUpdateTimeUnix = item._additional.lastUpdateTimeUnix; - item.distance = item._additional.distance; - delete item._additional; - } - return items; -} +function match(context, messages, maxTokenSize) { + // ============================================================================= + // You can add your own code here + // Change the code to match your own knowledge base + // ============================================================================== -/** - * - * Search Data - * yao run scripts.vector.Search 帮我写一份心血管健康研究的报告 1 张三 - * yao run scripts.vector.Search 帮我写一份心血管健康研究的报告 - * @param {*} input - * @param {*} page - * @param {*} user - * @returns - */ -function Search(input, page, user) { - page = page ? parseInt(page) : 1; - let offset = page ? (page - 1) * pageSize : 0; - let vector = getVector(input); - let cfg = setting(); - let url = `${cfg.host}/v1/graphql`; - if (!user) { - let info = Process("session.Get", "user") || {}; - user = info.id; + messages = messages || []; + if (messages.length == 0) { + throw new Exception("messages is empty", 400); } - let where = `{ - operator: Or, - operands: { - path: ["user"], - operator: Equal, - valueString: "__public" - } - }`; - - if (user) { - where = `{ - operator: Or, - operands: [ - { - path: ["user"], - operator: Equal, - valueString: "${user}" - },{ - path: ["user"], - operator: Equal, - valueString: "__public" - } - ] - }`; + const input = messages[messages.length - 1].content || ""; + if (input == "") { + throw new Exception("input is empty", 400); } - let payload = { - query: `{ - Aggregate { - Document ( - nearVector: { - vector: ${vector} - distance: ${distance} - } - where: ${where} - ) - { - meta { - count - } - } - } - Get { - Document( - limit: ${pageSize} - offset: ${offset} - nearVector: { - vector: ${vector} - distance: ${distance} - } - where: ${where} - ) - { - user - path - type - url - summary - content - _additional{ - id - lastUpdateTimeUnix, - distance - } - } - } - }`, - }; - - response = post(url, payload, cfg.key); - data = response.data || { - Get: { Document: [] }, - Aggregate: { Document: [{ meta: { count: 0 } }] }, - }; - - let total = data.Aggregate.Document[0].meta.count; - let pages = Math.ceil(total / pageSize); - let prev = page > 1 ? page - 1 : 1; - let next = page < pages ? page + 1 : pages; - let items = data.Get.Document || []; - for (let i = 0; i < items.length; i++) { - let item = items[i]; - item.id = item._additional.id; - item.lastUpdateTimeUnix = item._additional.lastUpdateTimeUnix; - item.distance = item._additional.distance; - delete item._additional; + const payload = { input: input, distance: 1.9 }; + const resp = Process("scripts.doc.Query", payload, 1, 20); + if (resp && resp.code && resp.message) { + throw new Exception(resp.message, resp.code); } - return { - items: items, - total: total, - prev: prev, - next: next, - curr: page, - pages: pages, - }; + const docs = resp.data || []; + return ReduceMessage(messages, docs, maxTokenSize); } -function Find(id) { - let cfg = setting(); - let url = `${cfg.host}/v1/objects/Document/${id}`; - let response = get(url, { consistency_level: "ONE" }, cfg.key); - response = response || {}; - response.properties = response.properties || {}; - for (let key in response.properties) { - response[key] = response.properties[key]; - } - delete response.properties; - return response; -} - -// === utils ================================= - -function getVector(input, user) { - let response = Process( - "openai.Embeddings", - "openai.text-embedding-ada-002", - input, - user - ); - - let data = response.data || []; - let embedding = data.length > 0 ? data[0].embedding : []; - return JSON.stringify(embedding); -} +function ReduceMessage(messages, docs, maxTokenSize) { + // ============================================================================= + // You can add your own code here + // Change the code to match your own knowledge base + // ============================================================================== -function getSummary(content) { - let response = Process("openai.chat.Completions", "openai.gpt-3_5-turbo", [ + maxTokenSize = maxTokenSize == undefined ? MaxTokens : maxTokenSize; + newMessages = [ { role: "system", content: ` - 你只能回复200字的内容摘要。 - 不要解释你的答案,也不要使用标点符号。 - `, + - The above content is my knowledge base. + - The field "content" is the content of the document. + - The field "summary" is the summary of the document. + - The field "name" is the title of the document. + - The field "path" is the file path of the document. + - The field "type" is the type of the document. + - Please prioritize answering user questions based on my knowledge base provided to you. + `, }, - { role: "user", content: content }, - ]); - - let choices = response.choices || []; - if (choices.length < 1) { - throw new Exception("answer error", 400); - } - - let message = choices[0].message || {}; - return message.content; -} + ]; -function makeRequest(question, user, sid) { - sid = sid || Process("utils.str.UUID"); - let history = getHistory(sid); // get the conversation history - let docs = getDocs(question, history, user); // query the knowledge base - let summaries = getSummaries(docs); // get the summaries of the knowledge base - - let messages = []; - let size = 0; - - while (history.messages.length >= 0) { - messages = [ - { - role: "system", - content: ` - - The above content is my knowledge base. - - Please prioritize answering user questions based on my knowledge base provided to you. - `, - }, - ...history.messages, - { role: "user", content: question }, - ]; - - size = Process("scripts.openai.TokenSize", JSON.stringify(messages)); - if (size < MaxTokens - 1000) { + messageText = JSON.stringify(messages); + let tokenSize = 0; + while (tokenSize < maxTokenSize) { + const doc = docs.shift(); + if (!doc) { break; } - - history.messages.shift(); - history.messages.shift(); - } - - // add the documents to the messages - if (docs.length > 0) { - messages = [ - { role: "system", content: JSON.stringify(docs[0]) }, - ...messages, - ]; - - size = Process("scripts.openai.TokenSize", JSON.stringify(messages)); + newMessages.unshift({ role: "system", content: JSON.stringify(doc) }); + const text = JSON.stringify(newMessages) + messageText; + tokenSize = Process("openai.Tiktoken", "gpt-3.5-turbo", text); } - // add the summary to the messages - if (size < MaxTokens && summaries.length > 0) { - messages = [ - { role: "system", content: JSON.stringify(summaries) }, - ...messages, - ]; + console.log("--- Vector Query------", messages); - size = Process("scripts.openai.TokenSize", JSON.stringify(messages)); + if (newMessages.length > 1) { + console.log("--- Vector Match ---", newMessages); } - // add more documents to the messages - if (size < MaxTokens && docs.length > 1) { - for (let i = 1; i < docs.length; i++) { - messages = [ - { role: "system", content: JSON.stringify(docs[i]) }, - ...messages, - ]; - - size = Process("scripts.openai.TokenSize", JSON.stringify(messages)); - if (size >= MaxTokens) { - break; - } - } - } - - return messages; -} - -/** - * Generate test data - * yao run scripts.vector.testContent - */ -function testContent() { - let response = Process("openai.chat.Completions", "openai.gpt-3_5-turbo", [ - { role: "system", content: JSON.stringify(DocumentSchema.properties) }, - { - role: "system", - content: ` - - Generate a set of data according to the data type given to your data structure. - - You can only respond with: [{"":"", ...}...]" - - The type property is required, and the value can only be: "note", "ppt", "doc", "xls", "pdf", "url" - - If the type is url, the url property is required, otherwise, the path property is required. - - The property value should be Chinese generated according to the data type. - - The content property is required. - - The summary property value should be the summary of content property. - - all properties are required, but some properties can be empty. - - Do not explain your answer, and do not use punctuation. - `, - }, - { - role: "user", - content: `Generate 5 items, You must only respond JSON Object.`, - }, - ]); - - let choices = response.choices || []; - if (choices.length < 1) { - throw new Exception("answer error", 400); - } - - let message = choices[0].message || {}; - return message.content; -} - -/** - * Post data - * @param {*} url - * @param {*} payload - * @param {*} key - * @returns - */ -function get(url, query, key) { - let response = http.Get(url, query, null, null, { - "X-OpenAI-Api-Key": key, - }); - - if (response.code == 404) { - throw new Exception(`not found`, 404); - } - - if (response.code != 200) { - if (response.data && response.data.message && response.data.code) { - throw new Exception( - response.data.message.split(":")[0], - response.data.code - ); - } - - let errors = response.data.error || response.data; - let message = errors.length > 0 ? errors[0].message : "unknown"; - throw new Exception(message, response.code || 500); - } - - return response.data; -} - -/** - * Post data - * @param {*} url - * @param {*} payload - * @param {*} key - * @returns - */ -function post(url, payload, key) { - let response = http.Post(url, payload, null, null, { - "X-OpenAI-Api-Key": key, - }); - - if (response.code != 200) { - let errors = response.data.error || response.data; - let message = errors.length > 0 ? errors[0].message : "unknown"; - throw new Exception(message, response.code || 500); - } - - return response.data; -} - -/** - * Get Weaviate setting - * @returns {Map} - */ -function setting() { - let vars = Process("utils.env.GetMany", "WEAVIATE_HOST", "OPENAI_KEY"); - return { - key: vars["OPENAI_KEY"], - host: vars["WEAVIATE_HOST"] - ? vars["WEAVIATE_HOST"] - : "http://127.0.0.1:5080", - }; + return newMessages.length > 1 ? newMessages : []; }