diff --git a/Makefile b/Makefile index d02d7bd61..19e244e47 100644 --- a/Makefile +++ b/Makefile @@ -41,10 +41,10 @@ install: --with=dev --with=docs --with=lint --with=test install-editable: - @python3 -m pip install -e ".[contrib]" --upgrade + @python3 -m pip install -e ".[contrib, langchain]" --upgrade install-editable-mac-sys: - @python3 -m pip install -e ".[contrib]" --upgrade --user --break-system-packages + @python3 -m pip install -e ".[contrib, langchain]" --upgrade --user --break-system-packages # LINTING diff --git a/examples/FinanceBench/Makefile b/examples/FinanceBench/Makefile index 339e0e715..66a41df49 100644 --- a/examples/FinanceBench/Makefile +++ b/examples/FinanceBench/Makefile @@ -33,6 +33,9 @@ agent-solve-all-combos: @poetry run python htp_oodar_agent.py ${id} --knowledge --prog-store --llama3 +langchain-react-solve: + @poetry run python langchain_react.py ${id} + openai-assist: @poetry run python openai_assist.py ${id} diff --git a/examples/FinanceBench/eval.py b/examples/FinanceBench/eval.py index 575943f4b..77f491f4f 100644 --- a/examples/FinanceBench/eval.py +++ b/examples/FinanceBench/eval.py @@ -17,7 +17,7 @@ # pylint: disable=wrong-import-order from data_and_knowledge import (FbId, Question, Answer, Category, GroundTruth, FB_ID_COL_NAME, GROUND_TRUTHS, N_CASES, CAT_DISTRIB, - OUTPUT_FILE_PATH, get_or_create_output_df) + LOCAL_CACHE_DIR_PATH, OUTPUT_FILE_PATH, get_or_create_output_df) from log import switch_log_file if TYPE_CHECKING: @@ -191,6 +191,86 @@ def compare_eval(output_name: str, baseline_output_name: str = 'RAG-Default'): ['doc_name', 'category', baseline_output_name, output_name]] +def eval_accuracy_and_consistency_wrt_ground_truths(output_name: str, output_file_names: list[str]): + # pylint: disable=too-many-locals + + n_output_files: int = len(output_file_names) + correctness_col_name: str = f'{output_name}---CORRECTNESS' + + n_yes_scores_by_fb_id: defaultdict = defaultdict(int) + incorrect_answer_fb_ids: dict[FbId, str] = {} + 
+ for output_df in (read_csv(LOCAL_CACHE_DIR_PATH / output_file_name, index_col=FB_ID_COL_NAME) + for output_file_name in output_file_names): + + for fb_id, correctness in output_df[correctness_col_name].items(): + ground_truth: GroundTruth = GROUND_TRUTHS[fb_id] + + if notna(correctness) and correctness: + n_yes_scores_by_fb_id[fb_id] += 1 + + else: + incorrect_answer_fb_ids[fb_id]: str = ('expert answer inadequate' + if ground_truth.get('answer-inadequate') + else ('evaluator unreliable' + if ground_truth.get('evaluator-unreliable') + else '')) + + cumu_avg_accuracy_scores_by_category: defaultdict = defaultdict(int) + cumu_consistency_scores_by_category: defaultdict = defaultdict(float) + + for fb_id, ground_truth in GROUND_TRUTHS.items(): + cumu_avg_accuracy_scores_by_category[cat := ground_truth['category']] += (a := n_yes_scores_by_fb_id[fb_id] / n_output_files) + cumu_consistency_scores_by_category[cat] += 2 * abs(a - 0.5) + + print(f'TOTAL CORRECT: {(n := sum(cumu_avg_accuracy_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}') + + pprint({category: (f'{(n := cumu_avg_accuracy_scores_by_category[category])} / {n_for_category} ' + f'= {n / n_for_category:.1%}') + for category, n_for_category in CAT_DISTRIB.items()}) + + pprint({ + 'EASY': (f'{(e := sum(cumu_avg_accuracy_scores_by_category[easy_cat] + for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / ' + f'{(se := sum(CAT_DISTRIB[easy_cat] + for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} ' + f'= {e / se:.1%}'), + + 'HARD': (f'{(h := sum(cumu_avg_accuracy_scores_by_category[hard_cat] + for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE, + Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / ' + f'{(sh := sum(CAT_DISTRIB[hard_cat] + for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE, + Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} ' + f'= {h / sh:.1%}') + }) + + print(f'\nTOTAL CONSISTENT: {(n := 
sum(cumu_consistency_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}') + + pprint({category: (f'{(n := cumu_consistency_scores_by_category[category])} / {n_for_category} ' + f'= {n / n_for_category:.1%}') + for category, n_for_category in CAT_DISTRIB.items()}) + + pprint({ + 'EASY': (f'{(e := sum(cumu_consistency_scores_by_category[easy_cat] + for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / ' + f'{(se := sum(CAT_DISTRIB[easy_cat] + for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} ' + f'= {e / se:.1%}'), + + 'HARD': (f'{(h := sum(cumu_consistency_scores_by_category[hard_cat] + for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE, + Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / ' + f'{(sh := sum(CAT_DISTRIB[hard_cat] + for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE, + Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} ' + f'= {h / sh:.1%}') + }) + + print('\nINCORRECT:') + pprint(incorrect_answer_fb_ids) + + if __name__ == '__main__': arg_parser = argparse.ArgumentParser() diff --git a/examples/FinanceBench/export-multi-runs.py b/examples/FinanceBench/export-multi-runs.py new file mode 100644 index 000000000..d48aa7a1b --- /dev/null +++ b/examples/FinanceBench/export-multi-runs.py @@ -0,0 +1,25 @@ +from argparse import ArgumentParser + +from pandas import DataFrame, read_csv + +from data_and_knowledge import FB_ID_COL_NAME, LOCAL_CACHE_DIR_PATH + + +EXPORT_FILE_NAME: str = 'export-multi-runs.csv' + + +arg_parser = ArgumentParser() +arg_parser.add_argument('output_name') +arg_parser.add_argument('output_file_names', nargs='+') +args = arg_parser.parse_args() + + +for i, df in enumerate(read_csv(LOCAL_CACHE_DIR_PATH / output_file_name, index_col=FB_ID_COL_NAME) + for output_file_name in args.output_file_names): + if not i: + export_df: DataFrame = df[['question']].copy() + + export_df.loc[:, f'answer {i + 1}'] = df[args.output_name] # pylint: 
disable=possibly-used-before-assignment + + +export_df.to_csv(LOCAL_CACHE_DIR_PATH / EXPORT_FILE_NAME, index=True) diff --git a/examples/FinanceBench/ground-truths.yml b/examples/FinanceBench/ground-truths.yml index a9b417862..17ae0078d 100644 --- a/examples/FinanceBench/ground-truths.yml +++ b/examples/FinanceBench/ground-truths.yml @@ -545,7 +545,7 @@ financebench_id_01319: evaluator-unreliable: true -financebench_id_00540: # TODO: retrieve COGS +financebench_id_00540: sector: Utilities company: AES Corporation @@ -769,7 +769,9 @@ financebench_id_01935: category: 0-RETRIEVE correctness: >- - the answer mentions supplemental indentures related to debt + the answer mentions indenture(s) + + evaluator-unreliable: true financebench_id_00799: @@ -945,6 +947,8 @@ financebench_id_01928: 2018 million, 2.018 billion, 2000 million or 2 billion + evaluator-unreliable: true + financebench_id_01930: sector: Materials @@ -1094,9 +1098,9 @@ financebench_id_01198: category: 0-RETRIEVE correctness: |- the answer mentions at least 2 of the following: - - EPYC server processors; - - Gaming; and - - inclusion of Xilinx + - "Data Center" and/or "EPYC"; + - "Gaming" and/or "semi-custom"; and + - "Embedded" and/or "Xilinx" evaluator-unreliable: true @@ -1228,6 +1232,8 @@ financebench_id_00476: the answer concludes that there are no debt securities traded, or, alternatively, that no such debt securities are explicitly reported + evaluator-unreliable: true + financebench_id_01028: sector: Financials @@ -1338,7 +1344,10 @@ financebench_id_01351: category: 2-CALC-CHANGE correctness: >- - the answer says Effective Tax Rate changed from 24.6% to 21.6%, and/or that it decreased by 3 pencentage points + the answer says Effective Tax Rate changed from 24.6% to 21.6%, + and/or that it decreased by 3 percentage points or 3% + + evaluator-unreliable: true financebench_id_01964: @@ -1472,8 +1481,7 @@ financebench_id_00070: data? 
If working capital is not a useful or relevant metric for this company, then please state that and explain why. - answer: Yes. American Water Works had postivie working capital of $ 124Mn by FY - 2022. + answer: No, American Water Works had negative working capital of -$1561M in FY 2022. justification: 'Accounts receivable+Income tax receivable+Unbilled revenues+Materials and supplies+other-Accounts payable-Accrued liabilities-Accrued taxes @@ -1484,7 +1492,9 @@ financebench_id_00070: category: 3-CALC-COMPLEX correctness: >- the answer contains a calculated (Net) Working Capital metric value in dollars - answer-inadequate: true + that is NEGATIVE and equivalent to or approximately equal to + minus/negative 1561, minus/negative 1561 million, minus/negative 1.561 billion, + minus/negative 1600, minus/negative 1600 million or minus/negative 1.6 billion evaluator-unreliable: true @@ -1583,7 +1593,7 @@ financebench_id_00685: category: 4-CALC-AND-JUDGE correctness: >- the answer contains calculated Gross Margin - percentage values for 2022 and 2023 that are within 2 percentage points of each other, + percentage values for 2022 and 2023 that are within 2 percentage points (or 2%) of each other, or, alternatively, calculated decimal values that are within 0.02 of each other answer-inadequate: true @@ -1909,7 +1919,7 @@ financebench_id_01091: evaluator-unreliable: true -financebench_id_00678: # tricky: Gross Income is implicit, with missing label +financebench_id_00678: # note: Gross Income is implicit, with missing label sector: Industrials company: Boeing @@ -2266,7 +2276,7 @@ financebench_id_01346: category: 2-CALC-CHANGE correctness: >- the answer says that Effective Tax Rate changed from approximately 20% to approximately 23%, - and/or that it increased by approximately 3 percentage points + and/or that it increased by approximately 3 percentage points or 3% evaluator-unreliable: true @@ -2777,7 +2787,7 @@ financebench_id_00711: evaluator-unreliable: true 
-financebench_id_00651: +financebench_id_00651: # TODO: retrieve growth rates sector: Health Care company: Johnson & Johnson @@ -2826,6 +2836,8 @@ financebench_id_01484: correctness: >- the answer mentions US sales increased and international sales decreased + evaluator-unreliable: true + financebench_id_01488: sector: Health Care @@ -3191,7 +3203,7 @@ financebench_id_03718: evaluator-unreliable: true -financebench_id_04171: # TODO: retrieve Accounts Payable +financebench_id_04171: sector: Consumer Discretionary company: MGM Resorts @@ -3218,6 +3230,8 @@ financebench_id_04171: # TODO: retrieve Accounts Payable 303, 303 million, 0.303 billion, 300, 300 million or 0.3 billion + evaluator-unreliable: true + financebench_id_03849: sector: Consumer Discretionary @@ -3481,7 +3495,7 @@ financebench_id_04458: (if the answer is a single number, assume that it is that calculated EBITDA Margin metric value) -financebench_id_03282: # TODO: retrieve Total Current Liabilities +financebench_id_03282: sector: Communication Services company: Netflix @@ -3508,6 +3522,8 @@ financebench_id_03282: # TODO: retrieve Total Current Liabilities 5466, 5466 million, 5.466 billion, 5500, 5500 million or 5.5 billion + evaluator-unreliable: true + financebench_id_04302: sector: Consumer Discretionary @@ -4005,7 +4021,9 @@ financebench_id_01476: category: 2-CALC-CHANGE correctness: >- - the answer mentions growth guidance raised from 8% to 9%, and/or growth guidance raised by 1 percentage point + the answer mentions growth guidance raised from 8% to 9%, and/or growth guidance raised by 1 percentage point or 1% + + evaluator-unreliable: true financebench_id_00302: @@ -4080,7 +4098,7 @@ financebench_id_02416: # note: Therachon is mentioned on separate following pag category: 0-RETRIEVE correctness: >- - the answer mentions Arena and Trillium + the answer mentions Trillium and Array financebench_id_00283: diff --git a/examples/FinanceBench/langchain_react.py 
b/examples/FinanceBench/langchain_react.py new file mode 100644 index 000000000..4d8ccee1b --- /dev/null +++ b/examples/FinanceBench/langchain_react.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from argparse import ArgumentParser +from functools import cache +from typing import TYPE_CHECKING + +from langchain import hub +from langchain.agents.agent import AgentExecutor +from langchain.agents.react.agent import create_react_agent +from langchain_community.document_loaders import PyPDFLoader +from langchain_community.tools.vectorstore.tool import VectorStoreQATool +from langchain_community.vectorstores.faiss import FAISS +from langchain_openai.embeddings.base import OpenAIEmbeddings +from langchain_openai.chat_models.base import ChatOpenAI +from langchain_text_splitters.character import RecursiveCharacterTextSplitter + +from data_and_knowledge import DocName, FbId, Answer, Doc, FB_ID_COL_NAME, DOC_NAMES_BY_FB_ID, QS_BY_FB_ID +from util import enable_batch_qa_and_eval, log_qa_and_update_output_file + +from openssa.core.util.lm.config import LMConfig + +if TYPE_CHECKING: + from langchain_core.documents.base import Document + from langchain_core.embeddings.embeddings import Embeddings + from langchain_core.language_models.llms import BaseLLM + from langchain_core.tools import BaseTool + from langchain_core.vectorstores.base import VectorStore + + +EMBED_MODEL: Embeddings = OpenAIEmbeddings(model='text-embedding-3-large', dimensions=3072, chunk_size=2048) +LLM: BaseLLM = ChatOpenAI(model_name='gpt-4o', temperature=0, seed=LMConfig.DEFAULT_SEED, n=1, max_tokens=2048) + +REACT_PROMPT_TEMPLATE: str = hub.pull('hwchase17/react') + + +@cache +def get_or_create_react_agent_executor(doc_name: DocName): + doc: Doc = Doc(name=doc_name) + + tools: list[BaseTool] = [ + VectorStoreQATool( + name=doc_name, + description=f'{doc.type} SEC Filing by {doc.company} for financial period {doc.period}', + vectorstore=FAISS.from_documents( + 
documents=(PyPDFLoader(file_path=doc.file_path) + .load_and_split(text_splitter=RecursiveCharacterTextSplitter())), + embedding=EMBED_MODEL), + llm=LLM) + ] + + return AgentExecutor(agent=create_react_agent(llm=LLM, tools=tools, prompt=REACT_PROMPT_TEMPLATE), + tools=tools, + return_intermediate_steps=True, + max_iterations=15, + max_execution_time=None, + early_stopping_method='force', # TODO: 'generate' + handle_parsing_errors=True, + trim_intermediate_steps=-1) + + +@enable_batch_qa_and_eval(output_name='LangChain-ReAct') +@log_qa_and_update_output_file(output_name='LangChain-ReAct') +def solve(fb_id: FbId) -> Answer: + return (get_or_create_react_agent_executor(doc_name=DOC_NAMES_BY_FB_ID[fb_id]) + .invoke({'input': QS_BY_FB_ID[fb_id]})['output']) + + +if __name__ == '__main__': + arg_parser = ArgumentParser() + arg_parser.add_argument('fb_id') + args = arg_parser.parse_args() + + solve(fb_id + if (fb_id := args.fb_id).startswith(FB_ID_COL_NAME) + else f'{FB_ID_COL_NAME}_{fb_id}') diff --git a/examples/FinanceBench/make.bat b/examples/FinanceBench/make.bat index 309bc8304..67f51a9bb 100644 --- a/examples/FinanceBench/make.bat +++ b/examples/FinanceBench/make.bat @@ -15,6 +15,7 @@ IF "%TARGET%"=="agent-solve-w-knowledge-w-llama3" GOTO agent-solve-w-knowledge-w IF "%TARGET%"=="agent-solve-w-knowledge-and-prog-store-w-llama3" GOTO agent-solve-w-knowledge-and-prog-store-w-llama3 IF "%TARGET%"=="agent-solve-all-combos" GOTO agent-solve-all-combos +IF "%TARGET%"=="langchain-react-solve" GOTO langchain-react-solve IF "%TARGET%"=="openai-assist" GOTO openai-assist IF "%TARGET%"=="rag-default-answer" GOTO rag-default-answer @@ -74,6 +75,10 @@ IF "%TARGET%"=="streamlit-run" GOTO streamlit-run GOTO end +:langchain-react-solve + poetry run python langchain_react.py %2 + GOTO end + :openai-assist poetry run python openai_assist.py %2 GOTO end diff --git a/make.bat b/make.bat index 9bc169a38..28555adb2 100644 --- a/make.bat +++ b/make.bat @@ -56,7 +56,7 @@ IF 
"%TARGET%"=="launch-solver" GOTO launch-solver GOTO end :install-editable - python3 -m pip install -e ".[contrib]" --upgrade --user + python3 -m pip install -e ".[contrib, langchain]" --upgrade --user GOTO end diff --git a/pyproject.toml b/pyproject.toml index fccca245e..d5d92a4a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,6 +99,9 @@ streamlit = {version = ">=1.38", optional = true} streamlit-extras = {version = ">=0.4", optional = true} streamlit-mic-recorder = {version = ">=0.0.8", optional = true} +langchainhub = ">=0.1" +faiss-cpu = ">=1.8" + [tool.poetry.extras] contrib = [ "streamlit", @@ -106,6 +109,11 @@ contrib = [ "streamlit-mic-recorder", ] +langchain = [ + "langchainhub", + "faiss-cpu", +] + [build-system] build-backend = "poetry.core.masonry.api" @@ -128,6 +136,7 @@ disable = [ "missing-class-docstring", "missing-function-docstring", "missing-module-docstring", + "no-name-in-module", "raw-checker-failed", "redefined-outer-name", "relative-beyond-top-level",