
Commit

Merge pull request #347 from aitomatic/example-langchain-react-agent
add LangChain ReAct impl to examples/FinanceBench
TheVinhLuong102 authored Sep 15, 2024
2 parents e890687 + e797c0e commit 0e73028
Showing 9 changed files with 236 additions and 21 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -41,10 +41,10 @@ install:
--with=dev --with=docs --with=lint --with=test

install-editable:
- @python3 -m pip install -e ".[contrib]" --upgrade
+ @python3 -m pip install -e ".[contrib, langchain]" --upgrade

install-editable-mac-sys:
- @python3 -m pip install -e ".[contrib]" --upgrade --user --break-system-packages
+ @python3 -m pip install -e ".[contrib, langchain]" --upgrade --user --break-system-packages


# LINTING
3 changes: 3 additions & 0 deletions examples/FinanceBench/Makefile
@@ -33,6 +33,9 @@ agent-solve-all-combos:
@poetry run python htp_oodar_agent.py ${id} --knowledge --prog-store --llama3


+ langchain-react-solve:
+ @poetry run python langchain_react.py ${id}

openai-assist:
@poetry run python openai_assist.py ${id}

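The new langchain-react-solve target runs langchain_react.py, one of the added files not shown in this view. As rough orientation only, a minimal LangChain ReAct agent for answering a single FinanceBench question might be sketched as below; the retriever tool, model choice, prompt, and function names are illustrative assumptions, not the repository's actual implementation.

# Hypothetical sketch -- not the repository's langchain_react.py.
# Assumes some retrieve(query) callable returning passages from the relevant filing.
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.tools import Tool
from langchain_openai import ChatOpenAI


def build_react_agent(retrieve) -> AgentExecutor:
    # Expose retrieval as the agent's single tool, then wire up a standard ReAct loop.
    tools = [Tool(name='retrieve_filing_passages', func=retrieve,
                  description='Look up passages from the relevant SEC filing.')]
    llm = ChatOpenAI(model='gpt-4o', temperature=0)  # assumed model choice
    prompt = hub.pull('hwchase17/react')  # stock ReAct prompt from LangChain Hub
    return AgentExecutor(agent=create_react_agent(llm=llm, tools=tools, prompt=prompt),
                         tools=tools, verbose=True, handle_parsing_errors=True)


# Illustrative invocation with a stub retriever and a made-up question:
agent = build_react_agent(retrieve=lambda query: 'no passages found')
print(agent.invoke({'input': 'What was the FY2022 effective tax rate?'})['output'])

The Makefile target would then be invoked per case, e.g. make langchain-react-solve id=<FinanceBench case ID>.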
82 changes: 81 additions & 1 deletion examples/FinanceBench/eval.py
@@ -17,7 +17,7 @@
# pylint: disable=wrong-import-order
from data_and_knowledge import (FbId, Question, Answer, Category, GroundTruth,
FB_ID_COL_NAME, GROUND_TRUTHS, N_CASES, CAT_DISTRIB,
-                                 OUTPUT_FILE_PATH, get_or_create_output_df)
+                                 LOCAL_CACHE_DIR_PATH, OUTPUT_FILE_PATH, get_or_create_output_df)
from log import switch_log_file

if TYPE_CHECKING:
@@ -191,6 +191,86 @@ def compare_eval(output_name: str, baseline_output_name: str = 'RAG-Default'):
['doc_name', 'category', baseline_output_name, output_name]]


def eval_accuracy_and_consistency_wrt_ground_truths(output_name: str, output_file_names: list[str]):
# pylint: disable=too-many-locals

n_output_files: int = len(output_file_names)
correctness_col_name: str = f'{output_name}---CORRECTNESS'

n_yes_scores_by_fb_id: defaultdict = defaultdict(int)
incorrect_answer_fb_ids: dict[FbId, str] = {}

for output_df in (read_csv(LOCAL_CACHE_DIR_PATH / output_file_name, index_col=FB_ID_COL_NAME)
for output_file_name in output_file_names):

for fb_id, correctness in output_df[correctness_col_name].items():
ground_truth: GroundTruth = GROUND_TRUTHS[fb_id]

if notna(correctness) and correctness:
n_yes_scores_by_fb_id[fb_id] += 1

else:
incorrect_answer_fb_ids[fb_id]: str = ('expert answer inadequate'
if ground_truth.get('answer-inadequate')
else ('evaluator unreliable'
if ground_truth.get('evaluator-unreliable')
else ''))

cumu_avg_accuracy_scores_by_category: defaultdict = defaultdict(int)
cumu_consistency_scores_by_category: defaultdict = defaultdict(float)

for fb_id, ground_truth in GROUND_TRUTHS.items():
cumu_avg_accuracy_scores_by_category[cat := ground_truth['category']] += (a := n_yes_scores_by_fb_id[fb_id] / n_output_files)
cumu_consistency_scores_by_category[cat] += 2 * abs(a - 0.5)

print(f'TOTAL CORRECT: {(n := sum(cumu_avg_accuracy_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}')

pprint({category: (f'{(n := cumu_avg_accuracy_scores_by_category[category])} / {n_for_category} '
f'= {n / n_for_category:.1%}')
for category, n_for_category in CAT_DISTRIB.items()})

pprint({
'EASY': (f'{(e := sum(cumu_avg_accuracy_scores_by_category[easy_cat]
for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / '
f'{(se := sum(CAT_DISTRIB[easy_cat]
for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} '
f'= {e / se:.1%}'),

'HARD': (f'{(h := sum(cumu_avg_accuracy_scores_by_category[hard_cat]
for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / '
f'{(sh := sum(CAT_DISTRIB[hard_cat]
for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} '
f'= {h / sh:.1%}')
})

print(f'\nTOTAL CONSISTENT: {(n := sum(cumu_consistency_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}')

pprint({category: (f'{(n := cumu_consistency_scores_by_category[category])} / {n_for_category} '
f'= {n / n_for_category:.1%}')
for category, n_for_category in CAT_DISTRIB.items()})

pprint({
'EASY': (f'{(e := sum(cumu_consistency_scores_by_category[easy_cat]
for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / '
f'{(se := sum(CAT_DISTRIB[easy_cat]
for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} '
f'= {e / se:.1%}'),

'HARD': (f'{(h := sum(cumu_consistency_scores_by_category[hard_cat]
for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / '
f'{(sh := sum(CAT_DISTRIB[hard_cat]
for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} '
f'= {h / sh:.1%}')
})

print('\nINCORRECT:')
pprint(incorrect_answer_fb_ids)


if __name__ == '__main__':
arg_parser = argparse.ArgumentParser()

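The eval_accuracy_and_consistency_wrt_ground_truths function added above scores each FinanceBench case across several output files: per-case accuracy is the fraction of runs judged correct, and per-case consistency is 2 * abs(accuracy - 0.5), which is 1 when all runs agree (all correct or all incorrect) and 0 when they split evenly; both are then summed per category. A small self-contained illustration of that arithmetic, using made-up run results:

# Worked example of the per-case scores used in eval.py (illustrative data only).
runs = [True, True, False, True]          # correctness of one case across 4 runs

accuracy = sum(runs) / len(runs)          # 3 / 4 = 0.75
consistency = 2 * abs(accuracy - 0.5)     # 2 * 0.25 = 0.50

print(f'accuracy={accuracy:.2f}, consistency={consistency:.2f}')
# accuracy=0.75, consistency=0.50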
25 changes: 25 additions & 0 deletions examples/FinanceBench/export-multi-runs.py
@@ -0,0 +1,25 @@
from argparse import ArgumentParser

from pandas import DataFrame, read_csv

from data_and_knowledge import FB_ID_COL_NAME, LOCAL_CACHE_DIR_PATH


EXPORT_FILE_NAME: str = 'export-multi-runs.csv'


arg_parser = ArgumentParser()
arg_parser.add_argument('output_name')
arg_parser.add_argument('output_file_names', nargs='+')
args = arg_parser.parse_args()


for i, df in enumerate(read_csv(LOCAL_CACHE_DIR_PATH / output_file_name, index_col=FB_ID_COL_NAME)
for output_file_name in args.output_file_names):
if not i:
export_df: DataFrame = df[['question']]

export_df.loc[:, f'answer {i + 1}'] = df[args.output_name] # pylint: disable=possibly-used-before-assignment


export_df.to_csv(LOCAL_CACHE_DIR_PATH / EXPORT_FILE_NAME, index=True)
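export-multi-runs.py gathers the same output column from several run files into one wide CSV: the first file seeds the question column, and each file contributes an answer N column keyed by FinanceBench case ID. Below is an illustration of the frame it writes, with made-up answers; the index-column name is an assumed stand-in for the FB_ID_COL_NAME constant defined in data_and_knowledge.py, which is not visible in this diff.

# Illustration of the export-multi-runs.csv layout (made-up values).
from pandas import DataFrame

export_df = DataFrame({'question': ['What was working capital in FY2022?'],
                       'answer 1': ['negative, roughly -$1,561 million'],
                       'answer 2': ['approximately -$1.6 billion']},
                      index=['financebench_id_00070'])
export_df.index.name = 'financebench_id'   # assumed stand-in for FB_ID_COL_NAME

print(export_df)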
52 changes: 35 additions & 17 deletions examples/FinanceBench/ground-truths.yml
@@ -545,7 +545,7 @@ financebench_id_01319:
evaluator-unreliable: true


- financebench_id_00540: # TODO: retrieve COGS
+ financebench_id_00540:
sector: Utilities

company: AES Corporation
@@ -769,7 +769,9 @@ financebench_id_01935:

category: 0-RETRIEVE
correctness: >-
- the answer mentions supplemental indentures related to debt
+ the answer mentions indenture(s)
+ evaluator-unreliable: true


financebench_id_00799:
@@ -945,6 +947,8 @@ financebench_id_01928:
2018 million, 2.018 billion,
2000 million or 2 billion
+ evaluator-unreliable: true


financebench_id_01930:
sector: Materials
@@ -1094,9 +1098,9 @@ financebench_id_01198:
category: 0-RETRIEVE
correctness: |-
the answer mentions at least 2 of the following:
- - EPYC server processors;
- - Gaming; and
- - inclusion of Xilinx
+ - "Data Center" and/or "EPYC";
+ - "Gaming" and/or "semi-custom"; and
+ - "Embedded" and/or "Xilinx"
evaluator-unreliable: true

@@ -1228,6 +1232,8 @@ financebench_id_00476:
the answer concludes that there are no debt securities traded,
or, alternatively, that no such debt securities are explicitly reported
+ evaluator-unreliable: true


financebench_id_01028:
sector: Financials
@@ -1338,7 +1344,10 @@ financebench_id_01351:

category: 2-CALC-CHANGE
correctness: >-
- the answer says Effective Tax Rate changed from 24.6% to 21.6%, and/or that it decreased by 3 pencentage points
+ the answer says Effective Tax Rate changed from 24.6% to 21.6%,
+ and/or that it decreased by 3 pencentage points or 3%
+ evaluator-unreliable: true


financebench_id_01964:
Expand Down Expand Up @@ -1472,8 +1481,7 @@ financebench_id_00070:
data? If working capital is not a useful or relevant metric for this company,
then please state that and explain why.

- answer: Yes. American Water Works had postivie working capital of $ 124Mn by FY
- 2022.
+ answer: No, American Water Works had negative working capital of -$1561M in FY 2022.
justification: 'Accounts receivable+Income tax receivable+Unbilled revenues+Materials
and supplies+other-Accounts payable-Accrued liabilities-Accrued taxes
@@ -1484,7 +1492,9 @@ financebench_id_00070:
category: 3-CALC-COMPLEX
correctness: >-
the answer contains a calculated (Net) Working Capital metric value in dollars
- answer-inadequate: true
+ that is NEGATIVE and equivalent to or approximately equal to
+ minus/negative 1561, minus/negative 1561 million, minus/negative 1.561 billion,
+ minus/negative 1600, minus/negative 1600 million or minus/negative 1.6 billion
evaluator-unreliable: true

@@ -1583,7 +1593,7 @@ financebench_id_00685:
category: 4-CALC-AND-JUDGE
correctness: >-
the answer contains calculated Gross Margin
- percentage values for 2022 and 2023 that are within 2 percentage points of each other,
+ percentage values for 2022 and 2023 that are within 2 percentage points (or 2%) of each other,
or, alternatively, calculated decimal values that are within 0.02 of each other
answer-inadequate: true

@@ -1909,7 +1919,7 @@ financebench_id_01091:
evaluator-unreliable: true


- financebench_id_00678: # tricky: Gross Income is implicit, with missing label
+ financebench_id_00678: # note: Gross Income is implicit, with missing label
sector: Industrials

company: Boeing
@@ -2266,7 +2276,7 @@ financebench_id_01346:
category: 2-CALC-CHANGE
correctness: >-
the answer says that Effective Tax Rate changed from approximately 20% to approximately 23%,
- and/or that it increased by approximately 3 percentage points
+ and/or that it increased by approximately 3 percentage points or 3%
evaluator-unreliable: true

@@ -2777,7 +2787,7 @@ financebench_id_00711:
evaluator-unreliable: true


- financebench_id_00651:
+ financebench_id_00651: # TODO: retrieve growth rates
sector: Health Care

company: Johnson & Johnson
@@ -2826,6 +2836,8 @@ financebench_id_01484:
correctness: >-
the answer mentions US sales increased and international sales decreased
+ evaluator-unreliable: true


financebench_id_01488:
sector: Health Care
@@ -3191,7 +3203,7 @@ financebench_id_03718:
evaluator-unreliable: true


- financebench_id_04171: # TODO: retrieve Accounts Payable
+ financebench_id_04171:
sector: Consumer Discretionary

company: MGM Resorts
@@ -3218,6 +3230,8 @@ financebench_id_04171: # TODO: retrieve Accounts Payable
303, 303 million, 0.303 billion,
300, 300 million or 0.3 billion
+ evaluator-unreliable: true


financebench_id_03849:
sector: Consumer Discretionary
@@ -3481,7 +3495,7 @@ financebench_id_04458:
(if the answer is a single number, assume that it is that calculated EBITDA Margin metric value)
- financebench_id_03282: # TODO: retrieve Total Current Liabilities
+ financebench_id_03282:
sector: Communication Services

company: Netflix
@@ -3508,6 +3522,8 @@ financebench_id_03282: # TODO: retrieve Total Current Liabilities
5466, 5466 million, 5.466 billion,
5500, 5500 million or 5.5 billion
+ evaluator-unreliable: true


financebench_id_04302:
sector: Consumer Discretionary
@@ -4005,7 +4021,9 @@ financebench_id_01476:

category: 2-CALC-CHANGE
correctness: >-
- the answer mentions growth guidance raised from 8% to 9%, and/or growth guidance raised by 1 percentage point
+ the answer mentions growth guidance raised from 8% to 9%, and/or growth guidance raised by 1 percentage point or 1%
+ evaluator-unreliable: true


financebench_id_00302:
@@ -4080,7 +4098,7 @@ financebench_id_02416: # note: Therachon is mentioned on separate following page

category: 0-RETRIEVE
correctness: >-
- the answer mentions Arena and Trillium
+ the answer mentions Trillium and Array
financebench_id_00283:
