
Commit

Merge pull request #347 from aitomatic/example-langchain-react-agent
add LangChain ReAct impl to examples/FinanceBench
TheVinhLuong102 authored Sep 15, 2024
2 parents e890687 + e797c0e commit 0e73028
Showing 9 changed files with 236 additions and 21 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -41,10 +41,10 @@ install:
--with=dev --with=docs --with=lint --with=test

install-editable:
- @python3 -m pip install -e ".[contrib]" --upgrade
+ @python3 -m pip install -e ".[contrib, langchain]" --upgrade

install-editable-mac-sys:
- @python3 -m pip install -e ".[contrib]" --upgrade --user --break-system-packages
+ @python3 -m pip install -e ".[contrib, langchain]" --upgrade --user --break-system-packages


# LINTING
3 changes: 3 additions & 0 deletions examples/FinanceBench/Makefile
@@ -33,6 +33,9 @@ agent-solve-all-combos:
@poetry run python htp_oodar_agent.py ${id} --knowledge --prog-store --llama3


+ langchain-react-solve:
+ @poetry run python langchain_react.py ${id}

openai-assist:
@poetry run python openai_assist.py ${id}

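The new langchain-react-solve target runs langchain_react.py, one of the added files not shown in this view. As rough orientation only, a minimal LangChain ReAct agent for answering a single FinanceBench question might be sketched as below; the retriever tool, model choice, prompt, and function names are illustrative assumptions, not the repository's actual implementation.

# Hypothetical sketch -- not the repository's langchain_react.py.
# Assumes some retrieve(query) callable returning passages from the relevant filing.
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.tools import Tool
from langchain_openai import ChatOpenAI


def build_react_agent(retrieve) -> AgentExecutor:
    # Expose retrieval as the agent's single tool, then wire up a standard ReAct loop.
    tools = [Tool(name='retrieve_filing_passages', func=retrieve,
                  description='Look up passages from the relevant SEC filing.')]
    llm = ChatOpenAI(model='gpt-4o', temperature=0)  # assumed model choice
    prompt = hub.pull('hwchase17/react')  # stock ReAct prompt from LangChain Hub
    return AgentExecutor(agent=create_react_agent(llm=llm, tools=tools, prompt=prompt),
                         tools=tools, verbose=True, handle_parsing_errors=True)


# Illustrative invocation with a stub retriever and a made-up question:
agent = build_react_agent(retrieve=lambda query: 'no passages found')
print(agent.invoke({'input': 'What was the FY2022 effective tax rate?'})['output'])

The Makefile target would then be invoked per case, e.g. make langchain-react-solve id=<FinanceBench case ID>.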
82 changes: 81 additions & 1 deletion examples/FinanceBench/eval.py
@@ -17,7 +17,7 @@
# pylint: disable=wrong-import-order
from data_and_knowledge import (FbId, Question, Answer, Category, GroundTruth,
FB_ID_COL_NAME, GROUND_TRUTHS, N_CASES, CAT_DISTRIB,
-                                 OUTPUT_FILE_PATH, get_or_create_output_df)
+                                 LOCAL_CACHE_DIR_PATH, OUTPUT_FILE_PATH, get_or_create_output_df)
from log import switch_log_file

if TYPE_CHECKING:
@@ -191,6 +191,86 @@ def compare_eval(output_name: str, baseline_output_name: str = 'RAG-Default'):
['doc_name', 'category', baseline_output_name, output_name]]


def eval_accuracy_and_consistency_wrt_ground_truths(output_name: str, output_file_names: list[str]):
# pylint: disable=too-many-locals

n_output_files: int = len(output_file_names)
correctness_col_name: str = f'{output_name}---CORRECTNESS'

n_yes_scores_by_fb_id: defaultdict = defaultdict(int)
incorrect_answer_fb_ids: dict[FbId, str] = {}

for output_df in (read_csv(LOCAL_CACHE_DIR_PATH / output_file_name, index_col=FB_ID_COL_NAME)
for output_file_name in output_file_names):

for fb_id, correctness in output_df[correctness_col_name].items():
ground_truth: GroundTruth = GROUND_TRUTHS[fb_id]

if notna(correctness) and correctness:
n_yes_scores_by_fb_id[fb_id] += 1

else:
incorrect_answer_fb_ids[fb_id]: str = ('expert answer inadequate'
if ground_truth.get('answer-inadequate')
else ('evaluator unreliable'
if ground_truth.get('evaluator-unreliable')
else ''))

cumu_avg_accuracy_scores_by_category: defaultdict = defaultdict(int)
cumu_consistency_scores_by_category: defaultdict = defaultdict(float)

for fb_id, ground_truth in GROUND_TRUTHS.items():
cumu_avg_accuracy_scores_by_category[cat := ground_truth['category']] += (a := n_yes_scores_by_fb_id[fb_id] / n_output_files)
cumu_consistency_scores_by_category[cat] += 2 * abs(a - 0.5)

print(f'TOTAL CORRECT: {(n := sum(cumu_avg_accuracy_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}')

pprint({category: (f'{(n := cumu_avg_accuracy_scores_by_category[category])} / {n_for_category} '
f'= {n / n_for_category:.1%}')
for category, n_for_category in CAT_DISTRIB.items()})

pprint({
'EASY': (f'{(e := sum(cumu_avg_accuracy_scores_by_category[easy_cat]
for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / '
f'{(se := sum(CAT_DISTRIB[easy_cat]
for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} '
f'= {e / se:.1%}'),

'HARD': (f'{(h := sum(cumu_avg_accuracy_scores_by_category[hard_cat]
for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / '
f'{(sh := sum(CAT_DISTRIB[hard_cat]
for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} '
f'= {h / sh:.1%}')
})

print(f'\nTOTAL CONSISTENT: {(n := sum(cumu_consistency_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}')

pprint({category: (f'{(n := cumu_consistency_scores_by_category[category])} / {n_for_category} '
f'= {n / n_for_category:.1%}')
for category, n_for_category in CAT_DISTRIB.items()})

pprint({
'EASY': (f'{(e := sum(cumu_consistency_scores_by_category[easy_cat]
for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / '
f'{(se := sum(CAT_DISTRIB[easy_cat]
for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} '
f'= {e / se:.1%}'),

'HARD': (f'{(h := sum(cumu_consistency_scores_by_category[hard_cat]
for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / '
f'{(sh := sum(CAT_DISTRIB[hard_cat]
for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} '
f'= {h / sh:.1%}')
})

print('\nINCORRECT:')
pprint(incorrect_answer_fb_ids)


if __name__ == '__main__':
arg_parser = argparse.ArgumentParser()

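The eval_accuracy_and_consistency_wrt_ground_truths function added above scores each FinanceBench case across several output files: per-case accuracy is the fraction of runs judged correct, and per-case consistency is 2 * abs(accuracy - 0.5), which is 1 when all runs agree (all correct or all incorrect) and 0 when they split evenly; both are then summed per category. A small self-contained illustration of that arithmetic, using made-up run results:

# Worked example of the per-case scores used in eval.py (illustrative data only).
runs = [True, True, False, True]          # correctness of one case across 4 runs

accuracy = sum(runs) / len(runs)          # 3 / 4 = 0.75
consistency = 2 * abs(accuracy - 0.5)     # 2 * 0.25 = 0.50

print(f'accuracy={accuracy:.2f}, consistency={consistency:.2f}')
# accuracy=0.75, consistency=0.50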
25 changes: 25 additions & 0 deletions examples/FinanceBench/export-multi-runs.py
@@ -0,0 +1,25 @@
from argparse import ArgumentParser

from pandas import DataFrame, read_csv

from data_and_knowledge import FB_ID_COL_NAME, LOCAL_CACHE_DIR_PATH


EXPORT_FILE_NAME: str = 'export-multi-runs.csv'


arg_parser = ArgumentParser()
arg_parser.add_argument('output_name')
arg_parser.add_argument('output_file_names', nargs='+')
args = arg_parser.parse_args()


for i, df in enumerate(read_csv(LOCAL_CACHE_DIR_PATH / output_file_name, index_col=FB_ID_COL_NAME)
for output_file_name in args.output_file_names):
if not i:
export_df: DataFrame = df[['question']]

export_df.loc[:, f'answer {i + 1}'] = df[args.output_name] # pylint: disable=possibly-used-before-assignment


export_df.to_csv(LOCAL_CACHE_DIR_PATH / EXPORT_FILE_NAME, index=True)
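export-multi-runs.py gathers the same output column from several run files into one wide CSV: the first file seeds the question column, and each file contributes an answer N column keyed by FinanceBench case ID. Below is an illustration of the frame it writes, with made-up answers; the index-column name is an assumed stand-in for the FB_ID_COL_NAME constant defined in data_and_knowledge.py, which is not visible in this diff.

# Illustration of the export-multi-runs.csv layout (made-up values).
from pandas import DataFrame

export_df = DataFrame({'question': ['What was working capital in FY2022?'],
                       'answer 1': ['negative, roughly -$1,561 million'],
                       'answer 2': ['approximately -$1.6 billion']},
                      index=['financebench_id_00070'])
export_df.index.name = 'financebench_id'   # assumed stand-in for FB_ID_COL_NAME

print(export_df)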
52 changes: 35 additions & 17 deletions examples/FinanceBench/ground-truths.yml
@@ -545,7 +545,7 @@ financebench_id_01319:
evaluator-unreliable: true


- financebench_id_00540: # TODO: retrieve COGS
+ financebench_id_00540:
sector: Utilities

company: AES Corporation
@@ -769,7 +769,9 @@ financebench_id_01935:

category: 0-RETRIEVE
correctness: >-
- the answer mentions supplemental indentures related to debt
+ the answer mentions indenture(s)
+ evaluator-unreliable: true


financebench_id_00799:
@@ -945,6 +947,8 @@ financebench_id_01928:
2018 million, 2.018 billion,
2000 million or 2 billion
+ evaluator-unreliable: true


financebench_id_01930:
sector: Materials
@@ -1094,9 +1098,9 @@ financebench_id_01198:
category: 0-RETRIEVE
correctness: |-
the answer mentions at least 2 of the following:
- - EPYC server processors;
- - Gaming; and
- - inclusion of Xilinx
+ - "Data Center" and/or "EPYC";
+ - "Gaming" and/or "semi-custom"; and
+ - "Embedded" and/or "Xilinx"
evaluator-unreliable: true

@@ -1228,6 +1232,8 @@ financebench_id_00476:
the answer concludes that there are no debt securities traded,
or, alternatively, that no such debt securities are explicitly reported
+ evaluator-unreliable: true


financebench_id_01028:
sector: Financials
@@ -1338,7 +1344,10 @@ financebench_id_01351:

category: 2-CALC-CHANGE
correctness: >-
- the answer says Effective Tax Rate changed from 24.6% to 21.6%, and/or that it decreased by 3 pencentage points
+ the answer says Effective Tax Rate changed from 24.6% to 21.6%,
+ and/or that it decreased by 3 pencentage points or 3%
+ evaluator-unreliable: true


financebench_id_01964:
Expand Down Expand Up @@ -1472,8 +1481,7 @@ financebench_id_00070:
data? If working capital is not a useful or relevant metric for this company,
then please state that and explain why.

- answer: Yes. American Water Works had postivie working capital of $ 124Mn by FY
- 2022.
+ answer: No, American Water Works had negative working capital of -$1561M in FY 2022.
justification: 'Accounts receivable+Income tax receivable+Unbilled revenues+Materials
and supplies+other-Accounts payable-Accrued liabilities-Accrued taxes
@@ -1484,7 +1492,9 @@ financebench_id_00070:
category: 3-CALC-COMPLEX
correctness: >-
the answer contains a calculated (Net) Working Capital metric value in dollars
- answer-inadequate: true
+ that is NEGATIVE and equivalent to or approximately equal to
+ minus/negative 1561, minus/negative 1561 million, minus/negative 1.561 billion,
+ minus/negative 1600, minus/negative 1600 million or minus/negative 1.6 billion
evaluator-unreliable: true

@@ -1583,7 +1593,7 @@ financebench_id_00685:
category: 4-CALC-AND-JUDGE
correctness: >-
the answer contains calculated Gross Margin
- percentage values for 2022 and 2023 that are within 2 percentage points of each other,
+ percentage values for 2022 and 2023 that are within 2 percentage points (or 2%) of each other,
or, alternatively, calculated decimal values that are within 0.02 of each other
answer-inadequate: true

@@ -1909,7 +1919,7 @@ financebench_id_01091:
evaluator-unreliable: true


- financebench_id_00678: # tricky: Gross Income is implicit, with missing label
+ financebench_id_00678: # note: Gross Income is implicit, with missing label
sector: Industrials

company: Boeing
@@ -2266,7 +2276,7 @@ financebench_id_01346:
category: 2-CALC-CHANGE
correctness: >-
the answer says that Effective Tax Rate changed from approximately 20% to approximately 23%,
- and/or that it increased by approximately 3 percentage points
+ and/or that it increased by approximately 3 percentage points or 3%
evaluator-unreliable: true

@@ -2777,7 +2787,7 @@ financebench_id_00711:
evaluator-unreliable: true


- financebench_id_00651:
+ financebench_id_00651: # TODO: retrieve growth rates
sector: Health Care

company: Johnson & Johnson
@@ -2826,6 +2836,8 @@ financebench_id_01484:
correctness: >-
the answer mentions US sales increased and international sales decreased
+ evaluator-unreliable: true


financebench_id_01488:
sector: Health Care
@@ -3191,7 +3203,7 @@ financebench_id_03718:
evaluator-unreliable: true


- financebench_id_04171: # TODO: retrieve Accounts Payable
+ financebench_id_04171:
sector: Consumer Discretionary

company: MGM Resorts
@@ -3218,6 +3230,8 @@ financebench_id_04171: # TODO: retrieve Accounts Payable
303, 303 million, 0.303 billion,
300, 300 million or 0.3 billion
+ evaluator-unreliable: true


financebench_id_03849:
sector: Consumer Discretionary
@@ -3481,7 +3495,7 @@ financebench_id_04458:
(if the answer is a single number, assume that it is that calculated EBITDA Margin metric value)
- financebench_id_03282: # TODO: retrieve Total Current Liabilities
+ financebench_id_03282:
sector: Communication Services

company: Netflix
@@ -3508,6 +3522,8 @@ financebench_id_03282: # TODO: retrieve Total Current Liabilities
5466, 5466 million, 5.466 billion,
5500, 5500 million or 5.5 billion
+ evaluator-unreliable: true


financebench_id_04302:
sector: Consumer Discretionary
@@ -4005,7 +4021,9 @@ financebench_id_01476:

category: 2-CALC-CHANGE
correctness: >-
- the answer mentions growth guidance raised from 8% to 9%, and/or growth guidance raised by 1 percentage point
+ the answer mentions growth guidance raised from 8% to 9%, and/or growth guidance raised by 1 percentage point or 1%
+ evaluator-unreliable: true


financebench_id_00302:
@@ -4080,7 +4098,7 @@ financebench_id_02416: # note: Therachon is mentioned on separate following page

category: 0-RETRIEVE
correctness: >-
- the answer mentions Arena and Trillium
+ the answer mentions Trillium and Array
financebench_id_00283:
