-
Notifications
You must be signed in to change notification settings - Fork 64
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(EAI-581): Chatbot verified answers eval/improvements + export `mongod…
…b-rag-core/dataSources` (#550) * Start on VA evals * prep evals for run * standard eval format * VA eval * separate data sources exports * latest changes * Remove evals from build * Fix broken tests * Update packages/chatbot-server-mongodb-public/src/verifiedAnswers.eval.ts Co-authored-by: Nick Larew <nick.larew@mongodb.com> * Update packages/chatbot-server-mongodb-public/src/verifiedAnswers.eval.ts Co-authored-by: Nick Larew <nick.larew@mongodb.com> * Update packages/mongodb-rag-core/src/index.ts Co-authored-by: Nick Larew <nick.larew@mongodb.com> --------- Co-authored-by: Nick Larew <nick.larew@mongodb.com>
- Loading branch information
Showing
28 changed files
with
363 additions
and
164 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
16 changes: 16 additions & 0 deletions
16
packages/chatbot-server-mongodb-public/src/test/cosineSimilarity.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
/**
  Computes the cosine similarity between two numeric vectors.

  @param a - First vector.
  @param b - Second vector. Must have the same number of elements as `a`.
  @returns The dot product of `a` and `b` divided by the product of their
  magnitudes, or `0` when either magnitude is falsy (e.g. an all-zero or
  empty vector).
  @throws Error if `a` and `b` have different lengths.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  // https://towardsdatascience.com/how-to-build-a-textual-similarity-analysis-web-app-aa3139d4fb71

  // Mismatched lengths would otherwise produce NaN (when `b` is shorter) or a
  // silently truncated dot product (when `a` is shorter), so fail loudly.
  if (a.length !== b.length) {
    throw new Error(
      `Cannot compute cosine similarity of vectors with different lengths: ${a.length} and ${b.length}`
    );
  }

  const magnitudeA = Math.sqrt(dotProduct(a, a));
  const magnitudeB = Math.sqrt(dotProduct(b, b));
  if (magnitudeA && magnitudeB) {
    // https://towardsdatascience.com/how-to-measure-distances-in-machine-learning-13a396aa34ce
    return dotProduct(a, b) / (magnitudeA * magnitudeB);
  } else {
    // A zero (or NaN) magnitude would make the division meaningless.
    return 0;
  }
}

/**
  Sums the element-wise products of `a` and `b`, iterating over the indices
  of `a`. Callers are expected to pass vectors of equal length.
 */
function dotProduct(a: number[], b: number[]): number {
  return a.reduce((acc, cur, i) => acc + cur * b[i], 0);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
279 changes: 279 additions & 0 deletions
279
packages/chatbot-server-mongodb-public/src/verifiedAnswers.eval.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,279 @@ | ||
import { Eval, EvalCase, EvalScorer } from "braintrust"; | ||
import { MongoDbTag } from "./mongoDbMetadata"; | ||
import { | ||
findVerifiedAnswer, | ||
verifiedAnswerConfig, | ||
verifiedAnswerStore, | ||
} from "./config"; | ||
import { FindVerifiedAnswerResult } from "mongodb-chatbot-server"; | ||
import { | ||
parseVerifiedAnswerYaml, | ||
VerifiedAnswerSpec, | ||
} from "mongodb-chatbot-verified-answers"; | ||
import path from "path"; | ||
import fs from "fs"; | ||
import "dotenv/config"; | ||
import { cosineSimilarity } from "./test/cosineSimilarity"; | ||
import { strict as assert } from "assert"; | ||
|
||
/**
  Input of a single verified answers eval case.
 */
interface VerifiedAnswersEvalCaseInput {
  /**
    The query handed to the task under evaluation.
   */
  query: string;
}

/**
  Expected result of a single verified answers eval case.
 */
interface VerifiedAnswersEvalCaseExpected {
  /**
    The expected verified answer to the query.
    If `undefined`, expects no verified answer.
   */
  answer?: string;
}

/**
  Extra information attached to an eval case.
 */
interface VerifiedAnswersEvalCaseMetadata extends Record<string, unknown> {
  /**
    The verified answer question that the input query is a variation of,
    when there is one.
   */
  similarVerifiedAnswerQuery?: string;
  /**
    Free-form description of the eval case.
   */
  description?: string;
}

/**
  Tags specific to the verified answers eval.
 */
type VerifiedAnswerTag = "perturbation" | "should_match" | "should_not_match";

/**
  Any tag that may be applied to a verified answers eval case.
 */
type MongoDbVerifiedAnswerTag = MongoDbTag | VerifiedAnswerTag;

/**
  A verified answers eval case whose tags are restricted to the known tag set.
 */
interface VerifiedAnswersEvalCase
  extends EvalCase<
    VerifiedAnswersEvalCaseInput,
    VerifiedAnswersEvalCaseExpected,
    VerifiedAnswersEvalCaseMetadata
  > {
  tags?: MongoDbVerifiedAnswerTag[];
}

/**
  Output of the eval task: the result of looking up a verified answer.
 */
type VerifiedAnswersTaskOutput = FindVerifiedAnswerResult;

/**
  Scorer signature shared by all metrics in this eval.
 */
type VerifiedAnswersEvalCaseScorer = EvalScorer<
  VerifiedAnswersEvalCaseInput,
  VerifiedAnswersTaskOutput,
  VerifiedAnswersEvalCaseExpected,
  VerifiedAnswersEvalCaseMetadata
>;
|
||
// The verified answers YAML lives three directories above this file.
const verifiedAnswersPath = path.resolve(
  __dirname,
  "../../../verified-answers.yaml"
);
// Parse the YAML once at module load and index it by question text.
const verifiedAnswersYaml = fs.readFileSync(verifiedAnswersPath, "utf-8");
const verifiedAnswerSpecs = parseVerifiedAnswerYaml(verifiedAnswersYaml);
const verifiedAnswerIndex = makeVerifiedAnswerIndex(verifiedAnswerSpecs);
|
||
/**
  Eval cases for the verified answers lookup.

  Cases with a `similarVerifiedAnswerQuery` are variations of an existing
  verified answer question and are tagged `should_match`. Cases without one
  expect no verified answer and are tagged `should_not_match`.
 */
const verifiedAnswerEvalCases: VerifiedAnswersEvalCase[] = [
  makeVerifiedAnswerEvalCase({
    inputQuery: "what is the aggregation framework",
    similarVerifiedAnswerQuery: "What is aggregation in MongoDB?",
    // Tagged `should_match` like every other case that references an
    // existing verified answer question.
    tags: ["aggregation", "perturbation", "should_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "agg framework",
    similarVerifiedAnswerQuery: "What is aggregation in MongoDB?",
    tags: ["aggregation", "perturbation", "should_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "what's the process to insert data into MongoDB",
    similarVerifiedAnswerQuery: "How do I insert data into MongoDB?",
    tags: ["perturbation", "should_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "How can I insert data into MongoDB?",
    similarVerifiedAnswerQuery: "How do I insert data into MongoDB?",
    tags: ["perturbation", "should_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "insert data into mongodb",
    similarVerifiedAnswerQuery: "How do I insert data into MongoDB?",
    tags: ["perturbation", "should_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "password reset",
    similarVerifiedAnswerQuery: "Can i reset my password",
    tags: ["perturbation", "iam", "should_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "reset my password",
    similarVerifiedAnswerQuery: "Can i reset my password",
    tags: ["perturbation", "iam", "should_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "reset database password",
    similarVerifiedAnswerQuery: "Can i reset my password",
    tags: ["perturbation", "iam", "should_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "connect to stream process",
    tags: ["atlas_stream_processing", "should_not_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "connect to database kotlin",
    tags: ["driver", "kotlin", "should_not_match"],
    verifiedAnswerIndex,
  }),
  makeVerifiedAnswerEvalCase({
    inputQuery: "connect to database with Kotlin coroutine driver",
    tags: ["driver", "kotlin", "kotlin_coroutine_driver", "should_not_match"],
    verifiedAnswerIndex,
  }),
  // 👇 From EAI-580 👇
  makeVerifiedAnswerEvalCase({
    inputQuery: "how do I set up billing alerts in Atlas",
    // No similar verified answer
    tags: ["billing", "should_not_match"],
    verifiedAnswerIndex,
  }),
];
|
||
/**
  Helper function to create a verified answer eval case.

  @param args.inputQuery - Query used as the eval case input.
  @param args.similarVerifiedAnswerQuery - Verified answer question whose
  answer is expected for `inputQuery`. When omitted (or empty), the case
  expects no verified answer.
  @param args.description - Optional description stored in the case metadata.
  @param args.tags - Optional tags for the case.
  @param args.verifiedAnswerIndex - Index used to look up the expected answer.
  @returns The eval case with its expected answer resolved from the index.
  @throws AssertionError if `similarVerifiedAnswerQuery` is given but has no
  entry in `verifiedAnswerIndex`.
 */
function makeVerifiedAnswerEvalCase(args: {
  inputQuery: string;
  similarVerifiedAnswerQuery?: string;
  description?: string;
  tags?: MongoDbVerifiedAnswerTag[];
  verifiedAnswerIndex: VerifiedAnswerIndex;
}): VerifiedAnswersEvalCase {
  const { inputQuery, similarVerifiedAnswerQuery, description, tags } = args;

  let expectedAnswer: string | undefined;
  if (similarVerifiedAnswerQuery) {
    expectedAnswer = findExactVerifiedAnswer(
      similarVerifiedAnswerQuery,
      args.verifiedAnswerIndex
    );
    // Without this check a mistyped or removed question would silently turn a
    // positive case into one that expects no verified answer.
    assert(
      expectedAnswer !== undefined,
      `No verified answer found for query: ${similarVerifiedAnswerQuery}`
    );
  }

  return {
    input: {
      query: inputQuery,
    },
    expected: {
      answer: expectedAnswer,
    },
    tags,
    metadata: {
      similarVerifiedAnswerQuery,
      description,
    },
  };
}
|
||
// -- Evaluation metrics -- | ||
/**
  Scores 1 when the task output contains any verified answer, otherwise 0.
 */
const MatchesSomeVerifiedAnswer: VerifiedAnswersEvalCaseScorer = ({
  output,
}) => ({
  name: "MatchesSomeVerifiedAnswer",
  score: output.answer ? 1 : 0,
});
|
||
/**
  Scores 1 when the answer text in the output equals the expected answer
  (including both being `undefined`), otherwise 0.

  The score metadata `type` combines whether the output matched ("true" or
  "false") with whether an answer was expected ("positive") or not
  ("negative"), e.g. "true_positive".
 */
const MatchesExpectedOutput: VerifiedAnswersEvalCaseScorer = ({
  output,
  expected,
}) => {
  const isMatch = output.answer?.answer === expected.answer;
  const outcome = isMatch ? "true" : "false";
  const polarity = expected.answer === undefined ? "negative" : "positive";

  return {
    name: "MatchesExpectedOutput",
    score: isMatch ? 1 : 0,
    metadata: {
      type: `${outcome}_${polarity}`,
    },
  };
};
|
||
/**
  Reports the `score` of the verified answer in the output, or `null` when
  the output has no answer (or no score).
 */
const SearchScore: VerifiedAnswersEvalCaseScorer = ({ output }) => {
  const score = output.answer?.score ?? null;
  return { name: "SearchScore", score };
};
|
||
/**
  Scores the cosine similarity between the query embedding in the task output
  and the stored embedding of the reference verified answer question.

  Returns a `null` score when the case has no `similarVerifiedAnswerQuery`.
  Asserts that the store contains a verified answer for that question.
 */
// BUG: Getting Mongo connection closed errors on this scorer with the clean up.
const ReferenceAnswerCosineSimilarity: VerifiedAnswersEvalCaseScorer = async ({
  metadata,
  output,
}) => {
  const name = "ReferenceAnswerCosineSimilarity";
  const { similarVerifiedAnswerQuery } = metadata;

  // Nothing to compare against for cases without a reference question.
  if (!similarVerifiedAnswerQuery) {
    return { name, score: null };
  }

  const matches = await verifiedAnswerStore.find({
    "question.text": similarVerifiedAnswerQuery,
  });
  const verifiedAnswer = matches[0];
  assert(
    verifiedAnswer,
    `No verified answer found for query: ${similarVerifiedAnswerQuery}`
  );

  const score = cosineSimilarity(
    output.queryEmbedding,
    verifiedAnswer.question.embedding
  );
  return { name, score };
};
|
||
/**
  Maps each verified answer question to its answer text.
 */
type VerifiedAnswerIndex = Record<string, string>;

/**
  Construct index of all verified answers for faster look up.

  @param verifiedAnswerSpecs - Verified answer specs to index.
  @returns An index with one entry per question, pointing at the answer of
  the spec the question belongs to. If a question appears more than once,
  the last occurrence wins.
 */
function makeVerifiedAnswerIndex(
  verifiedAnswerSpecs: VerifiedAnswerSpec[]
): VerifiedAnswerIndex {
  // Use a prototype-less object so that questions such as "constructor" or
  // "__proto__" are stored as ordinary keys, and look-ups of unknown
  // questions never resolve to members inherited from `Object.prototype`.
  const verifiedAnswerIndex: VerifiedAnswerIndex = Object.create(null);
  for (const { questions, answer } of verifiedAnswerSpecs) {
    for (const question of questions) {
      verifiedAnswerIndex[question] = answer;
    }
  }
  return verifiedAnswerIndex;
}
|
||
/**
  Looks up the answer stored for exactly the given question text.

  @param query - Question text to look up.
  @param verifiedAnswerIndex - Index of question text to answer text.
  @returns The stored answer, or `undefined` if the index has no entry of its
  own for `query`.
 */
function findExactVerifiedAnswer(
  query: string,
  verifiedAnswerIndex: VerifiedAnswerIndex
): string | undefined {
  // Only consider the index's own entries so that queries such as
  // "constructor" do not resolve to members inherited from the prototype.
  return Object.prototype.hasOwnProperty.call(verifiedAnswerIndex, query)
    ? verifiedAnswerIndex[query]
    : undefined;
}
|
||
// Register the verified answers eval: every eval case is run through
// `findVerifiedAnswer` and the output is scored by the metrics listed below.
Eval<
  VerifiedAnswersEvalCaseInput,
  VerifiedAnswersTaskOutput,
  VerifiedAnswersEvalCaseExpected,
  VerifiedAnswersEvalCaseMetadata
>("mongodb-chatbot-verified-answers", {
  // The experiment name records the embedding model and minimum score in use.
  experimentName: `mongodb-chatbot-latest-${verifiedAnswerConfig.embeddingModel}-minScore-${verifiedAnswerConfig.findNearestNeighborsOptions.minScore}`,
  metadata: {
    description:
      "Evaluates if gets the correct verified answers for a given query",
    verifiedAnswerConfig,
  },
  data: async () => verifiedAnswerEvalCases,
  maxConcurrency: 5,
  task: async (input) => findVerifiedAnswer(input),
  scores: [
    MatchesSomeVerifiedAnswer,
    MatchesExpectedOutput,
    ReferenceAnswerCosineSimilarity,
    SearchScore,
  ],
});
Oops, something went wrong.