From 7f97413de702e86bc099040597cdef13a9e5c685 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Mon, 15 Jan 2024 17:44:07 +0100 Subject: [PATCH 01/29] rm rxn with no solv set to False --- Makefile | 4 ++-- README.md | 2 +- orderly/clean/cleaner.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 038bd1dd..9de70f56 100644 --- a/Makefile +++ b/Makefile @@ -161,10 +161,10 @@ paper_3: paper_plot_uspto_no_trust_unfiltered_num_rxn_components paper_plot_uspt # 4. clean (filtered) paper_clean_uspto_no_trust_filtered: #requires: paper_extract_uspto_no_trust - python -m orderly.clean --output_path="data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --remove_reactions_with_no_solvents=True --remove_reactions_with_no_agents=True --train_size=0.0 + python -m orderly.clean --output_path="data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=True --train_size=0.0 paper_clean_uspto_with_trust_filtered: #requires: paper_extract_uspto_with_trust - python -m orderly.clean --output_path="data/orderly/uspto_with_trust/filtered/filtered_orderly_ord.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=False --remove_reactions_with_no_solvents=True --remove_reactions_with_no_agents=True --train_size=0.0 + python -m orderly.clean --output_path="data/orderly/uspto_with_trust/filtered/filtered_orderly_ord.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=False --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=True --train_size=0.0 paper_4: paper_clean_uspto_no_trust_filtered paper_clean_uspto_with_trust_filtered diff --git a/README.md b/README.md index 781dea34..a1e9558d 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ If you would like to extract all data in ORD (instead of just USPTO data) simply ### ORDerly-condition -```python -m orderly.clean --output_path="../orderly_generated_datasets/orderly_condition.parquet" --ord_extraction_path="data/orderly/extracted_ords" --molecules_to_remove_path="data/orderly/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=False --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=True --remove_reactions_with_no_agents=True``` +```python -m orderly.clean --output_path="../orderly_generated_datasets/orderly_condition.parquet" --ord_extraction_path="data/orderly/extracted_ords" --molecules_to_remove_path="data/orderly/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=False --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=True``` ### ORDerly-forward diff --git a/orderly/clean/cleaner.py b/orderly/clean/cleaner.py index 4e0468af..15bb7245 100644 --- a/orderly/clean/cleaner.py +++ b/orderly/clean/cleaner.py @@ -938,7 +938,7 @@ def get_matching_indices( @click.option( "--remove_reactions_with_no_solvents", type=bool, - default=True, + default=False, show_default=True, help="Remove reactions with no solvents", ) From 6c577e2c526c9da8e2d71cd12be7f536ee4434be Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Mon, 15 Jan 2024 17:46:17 +0100 Subject: [PATCH 02/29] rm rxn with no agents set to F --- Makefile | 4 ++-- README.md | 2 +- orderly/clean/cleaner.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 9de70f56..d5e3755c 100644 --- a/Makefile +++ b/Makefile @@ -161,10 +161,10 @@ paper_3: paper_plot_uspto_no_trust_unfiltered_num_rxn_components paper_plot_uspt # 4. clean (filtered) paper_clean_uspto_no_trust_filtered: #requires: paper_extract_uspto_no_trust - python -m orderly.clean --output_path="data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=True --train_size=0.0 + python -m orderly.clean --output_path="data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False --train_size=0.0 paper_clean_uspto_with_trust_filtered: #requires: paper_extract_uspto_with_trust - python -m orderly.clean --output_path="data/orderly/uspto_with_trust/filtered/filtered_orderly_ord.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=False --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=True --train_size=0.0 + python -m orderly.clean --output_path="data/orderly/uspto_with_trust/filtered/filtered_orderly_ord.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=True --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=False --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False --train_size=0.0 paper_4: paper_clean_uspto_no_trust_filtered paper_clean_uspto_with_trust_filtered diff --git a/README.md b/README.md index a1e9558d..4ac2ae0f 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ If you would like to extract all data in ORD (instead of just USPTO data) simply ### ORDerly-condition -```python -m orderly.clean --output_path="../orderly_generated_datasets/orderly_condition.parquet" --ord_extraction_path="data/orderly/extracted_ords" --molecules_to_remove_path="data/orderly/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=False --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=True``` +```python -m orderly.clean --output_path="../orderly_generated_datasets/orderly_condition.parquet" --ord_extraction_path="data/orderly/extracted_ords" --molecules_to_remove_path="data/orderly/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=False --train_test_split_fraction=0.9 --remove_reactions_with_no_reactants=True --remove_reactions_with_no_products=True --remove_reactions_with_no_solvents=False --remove_reactions_with_no_agents=False``` ### ORDerly-forward diff --git a/orderly/clean/cleaner.py b/orderly/clean/cleaner.py index 15bb7245..99e584d8 100644 --- a/orderly/clean/cleaner.py +++ b/orderly/clean/cleaner.py @@ -945,7 +945,7 @@ def get_matching_indices( @click.option( "--remove_reactions_with_no_agents", type=bool, - default=True, + default=False, show_default=True, help="Remove reactions with no agents (ie no reagents AND no catalysts). Does not consider solvents", ) From b11d0dceb39dbf21a1a34cbb71903a3c05080a1e Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:46:08 +0100 Subject: [PATCH 03/29] added orderly-cond-prelim --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index d5e3755c..f8708ad4 100644 --- a/Makefile +++ b/Makefile @@ -186,6 +186,10 @@ paper_5 : paper_plot_uspto_no_trust_filtered_min_frequency_of_occurrence_10_100 # 6. clean (final) +# NB: I changed this one, min_freq=0, train_size=1 +paper_gen_orderly_cond_prelim: #requires: paper_extract_uspto_no_trust + python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_cond_prelim.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=1 + paper_gen_uspto_no_trust_no_map: #requires: paper_extract_uspto_no_trust python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_no_trust_no_map.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 From b22e6d515d00ca7d843744cc0d85d67e556c11f8 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:46:26 +0100 Subject: [PATCH 04/29] added test case for removing C when Pd present --- tests/test_extract.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_extract.py b/tests/test_extract.py index c69987ab..e369b28b 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -338,6 +338,21 @@ def test_rxn_string_and_is_mapped( [], False, ], + [ # Test that [C] is removed correctly when [Pd] is present + "ord_dataset-0b70410902ae4139bd5d334881938f69", + 262, + None, + [ + "O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)OCc2ccccc2)N1", + ], + [ + "[Pd]", + ], + ["O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)O)N1"], + 'C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]', + [], + False, + ], [ "ord_dataset-0bb2e99daa66408fb8dbd6a0781d241c", 0, From d92c33e2c5eac5d8b3909120b0c2fd5d8263ef8a Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:46:59 +0100 Subject: [PATCH 05/29] added [Pd]/[C] -> [Pd] replacement --- orderly/extract/defaults.py | 1 + 1 file changed, 1 insertion(+) diff --git a/orderly/extract/defaults.py b/orderly/extract/defaults.py index b619b742..2e310281 100644 --- a/orderly/extract/defaults.py +++ b/orderly/extract/defaults.py @@ -79,6 +79,7 @@ def get_molecule_replacements() -> Dict[MOLECULE_IDENTIFIER, SMILES]: molecule_replacements["[Pd on-carbon]"] = "[Pd]" molecule_replacements["[Pd].C"] = "[Pd]" molecule_replacements["[Pd]/C"] = "[Pd]" + molecule_replacements["[Pd]/[C]]"] = "[Pd]" molecule_replacements["[TEA]"] = "OCCN(CCO)CCO" molecule_replacements["[Ti-superoxide]"] = "O=[O-].[Ti]" molecule_replacements[ From 9f39b6e4417e5a9fd91a99955a2d414d03953262 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:47:18 +0100 Subject: [PATCH 06/29] delete [C] when transition metal present --- orderly/extract/extractor.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index ab9d4b45..ad3bd310 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -1007,6 +1007,20 @@ def move_unresolvable_names_to_end_of_list( catalysts, rxn_non_smiles_names_set ) + # Add paladium on carbon exception: Delete carbon if Pd exists. Expand exception to other transition metals + def contains_transition_metal(agents): + for agent in agents: + if orderly.extract.defaults.has_transition_metal(agent): + return True + return False + + # Check if any agent contains a transition metal + if contains_transition_metal(agents): + # Remove "[C]" and "C" from agents + agents = [a for a in agents if a not in ["[C]", "C"]] + + + if _yields == []: _yields = [None] * len(products) yields = _yields From 88613465efd9e0ace508639752c4800666ab5e53 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:47:31 +0100 Subject: [PATCH 07/29] change back to df.applymap --- orderly/clean/cleaner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orderly/clean/cleaner.py b/orderly/clean/cleaner.py index 99e584d8..a61c5383 100644 --- a/orderly/clean/cleaner.py +++ b/orderly/clean/cleaner.py @@ -518,7 +518,7 @@ def _replace_None_with_NA( all_component_cols += component_columns sub_df = df[all_component_cols] - sub_df = sub_df.map(lambda x: pd.NA if x is None else x) + sub_df = sub_df.applymap(lambda x: pd.NA if x is None else x) df = df.drop(all_component_cols, axis=1) df = pd.concat([df, sub_df], axis=1) From a880d9ce5bdcd843b2ac6a42e9394f0c6b538120 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:47:48 +0100 Subject: [PATCH 08/29] find [H][H] regex fix --- notebooks/orderly_benchmark_stats.ipynb | 216 ++++++++++++++++++++++-- 1 file changed, 201 insertions(+), 15 deletions(-) diff --git a/notebooks/orderly_benchmark_stats.ipynb b/notebooks/orderly_benchmark_stats.ipynb index bb64af05..0f1099b0 100644 --- a/notebooks/orderly_benchmark_stats.ipynb +++ b/notebooks/orderly_benchmark_stats.ipynb @@ -272,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -281,15 +281,14 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "833112\n", - "1\n" + "Number of strings containing '[H][H]': 9738, that's 0.011688704519920491 % of the data. Len(df)=833112.\n" ] } ], @@ -297,8 +296,48 @@ "# path = \"../data/orderly/datasets/orderly_no_trust_no_map_train.parquet\"\n", "path = \"/Users/danielwigh/projects_local/ORDerly_project/orderly-benchmarks/orderly_forward_train.parquet\"\n", "df = pd.read_parquet(path)\n", - "print(len(df))\n", - "print(df['rxn_str'].str.contains('[H][H]').sum())\n" + "# contains_hh = df['rxn_str'].str.contains(r'\\[H\\]\\[H\\]', regex=True, na=False)\n", + "contains_hh = df['rxn_str'].str.contains('[H][H]', regex=False, na=False)\n", + "count = contains_hh.sum()\n", + "print(f\"Number of strings containing '[H][H]': {count}, that's {count/len(df)} % of the data. Len(df)={len(df)}.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1862\n" + ] + } + ], + "source": [ + "contains_hh = df['agent_002'].str.contains('[H][H]', regex=False, na=False)\n", + "count = contains_hh.sum()\n", + "print(count)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "contains_ = df['agent_002'].str.contains('.', regex=False, na=False)\n", + "count = contains_.sum()\n", + "print(count)" ] }, { @@ -345,24 +384,171 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "833112\n", + "1\n" + ] + } + ], + "source": [ + "# path = \"../data/orderly/datasets/orderly_no_trust_no_map_train.parquet\"\n", + "path = \"/Users/danielwigh/projects_local/ORDerly_project/orderly-benchmarks/orderly_forward_train.parquet\"\n", + "df = pd.read_parquet(path)\n" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "833112\n", + "0\n" + ] + } + ], + "source": [ + "print(len(df))\n", + "print(df['rxn_str'].str.contains('procedure_details').sum())" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agent_000agent_001agent_002date_of_experimentextracted_from_filegrant_dateis_mappedprocedure_detailsproduct_000product_001...reactant_001reactant_002rxn_strrxn_timesolvent_000solvent_001solvent_002temperatureyield_000yield_001
index
\n", + "

0 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [agent_000, agent_001, agent_002, date_of_experiment, extracted_from_file, grant_date, is_mapped, procedure_details, product_000, product_001, reactant_000, reactant_001, reactant_002, rxn_str, rxn_time, solvent_000, solvent_001, solvent_002, temperature, yield_000, yield_001]\n", + "Index: []\n", + "\n", + "[0 rows x 21 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hydrogenation_rxn = '[CH3:1][O:2][c:3]1[cH:4][cH:5][c:6]2[c:7]([cH:12]1)[CH:8]=[CH:9][N:10]2[CH3:11]>>[CH3:1][O:2][c:3]1[cH:4][cH:5][c:6]2[c:7]([cH:12]1)[CH2:8][CH2:9][N:10]2[CH3:11]'\n", + "df[df['rxn_str']==hydrogenation_rxn]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "index\n", + "127288 [C:1]1([C:7]2[CH:12]=[CH:11][C:10]([OH:13])=[C...\n", + "865425 N[C:2]1[N:7]=[C:6]([S:8][CH2:9][C:10]2[CH:11]=...\n", + "49540 [NH2:1][CH:2]1[CH2:11][CH:10]2[N:5]([CH2:6][CH...\n", + "327337 F[C:2]1([O:9][C:10]#[C:11][CH3:12])[CH:7]=[C:6...\n", + "703910 [ClH:1].O1CCOCC1.[OH:8][C:9]1[CH:10]=[C:11]([C...\n", + " ... \n", + "137770 [N+]([C:4]1[CH:5]=[C:6]([CH3:11])[N+:7]([O-:10...\n", + "377939 [CH3:1][CH2:2][O:3][C:4]1[N:12]([CH2:13][C:14]...\n", + "321854 C[O:2][C:3]1[C:17]2[C:12](=[CH:13][CH:14]=[CH:...\n", + "492935 [CH3:1][O:2][C:3]1[CH:8]=[CH:7][CH:6]=[CH:5][C...\n", + "109743 [OH-].[C:2]1([C:10]([O:12]CC)=[O:11])([C:5]([O...\n", + "Name: rxn_str, Length: 833112, dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['rxn_str']" + ] }, { "cell_type": "code", From 7b4b0f545cb6adf64afe0ff157d0de052abd517d Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:48:45 +0100 Subject: [PATCH 09/29] changed paths --- notebooks/inspect_orderly_data.ipynb | 140 +++++++++++++++++++++++---- 1 file changed, 121 insertions(+), 19 deletions(-) diff --git a/notebooks/inspect_orderly_data.ipynb b/notebooks/inspect_orderly_data.ipynb index 06fa5bda..75de1c6d 100644 --- a/notebooks/inspect_orderly_data.ipynb +++ b/notebooks/inspect_orderly_data.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -32,7 +32,7 @@ "'[S:1](=[O:4])(=[O:3])=[O:2].[S:5](=[O:9])(=[O:8])([OH:7])[OH:6]>>[OH:8][S:5]([OH:9])(=[O:7])=[O:6].[O:2]=[S:1](=[O:4])=[O:3] |f:2.3|'" ] }, - "execution_count": 10, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -62,7 +62,7 @@ "'uspto-grants-1993_09'" ] }, - "execution_count": 12, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -73,13 +73,13 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Can also find ORD file just given the hash:\n", "def open_ord(ord_hash):\n", - " folder_path = '/Users/dsw46/Projects_local/ORDerly/data/ord'\n", + " folder_path = '/Users/danielwigh/projects_local/ORDerly_project/ORDerly/data/ord'\n", " # look for files within that folder path or deeper for a file that contains the ord_hash\n", " for root, dirs, files in os.walk(folder_path):\n", " for file in files:\n", @@ -93,25 +93,45 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'uspto-grants-2001_07'" + "'uspto-grants-1986_09'" ] }, - "execution_count": 39, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data = open_ord('ord_dataset-85c00026681b46f89ef8634d2b8618c3')\n", + "data = open_ord('ord_dataset-0b70410902ae4139bd5d334881938f69')\n", "data.name" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/danielwigh/projects_local/ORDerly_project/ORDerly/notebooks'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pwd" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -122,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -131,34 +151,116 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "path = \"../data/orderly/uspto_no_trust/extracted_ords/uspto-grants-1993_09.parquet\"\n", + "path = \"../data/orderly/uspto_no_trust/extracted_ords/uspto-grants-1986_09.parquet\"\n", "df = pd.read_parquet(path)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'[S:1](=[O:4])(=[O:3])=[O:2].[S:5](=[O:9])(=[O:8])([OH:7])[OH:6]>>[OH:8][S:5]([OH:9])(=[O:7])=[O:6].[O:2]=[S:1](=[O:4])=[O:3]'" + "262 C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13...\n", + "265 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", + "273 C([O:8][C:9](=[O:35])[C@@H:10]1[CH2:14][CH2:13...\n", + "275 C([O:8][C:9](=[O:25])[C@H:10]([CH2:19][C:20]1[...\n", + "276 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", + "277 C(OC([N:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[...\n", + "279 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", + "281 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", + "285 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", + "368 [ClH:1].C(OC([NH:12][CH:13]([CH2:23][CH2:24][C...\n", + "377 [C:1]([O:4][CH2:5][CH2:6][CH2:7][NH:8]C(OCC1C=...\n", + "379 [ClH:1].[C:2]([O:5][CH2:6][CH2:7][CH2:8][NH:9]...\n", + "582 [CH3:1][C:2]1[O:3][C:4](=[O:14])[C:5](=[CH:7][...\n", + "Name: rxn_str, dtype: object" ] }, - "execution_count": 31, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df['rxn_str'][0]" + "# find df['rxn_str'] which contains [Pd] and [C]\n", + "\n", + "filtered_df = df[df['rxn_str'].str.contains('\\[Pd\\]') & df['rxn_str'].str.contains('\\[C\\]')]\n", + "filtered_df['rxn_str']\n" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[262]['rxn_str']" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "rxn = 'C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]'\n", + "react,ag,prod=rxn.split('>')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)O)N1'" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[262]['product_000']\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)O)N1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "attachments": {}, "cell_type": "markdown", @@ -1248,7 +1350,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" }, "orig_nbformat": 4 }, From 1899235f81b5254fe705f5f286442f411c4c29e0 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:53:43 +0100 Subject: [PATCH 10/29] make black --- orderly/extract/extractor.py | 2 -- tests/test_extract.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index ad3bd310..37768638 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -1018,8 +1018,6 @@ def contains_transition_metal(agents): if contains_transition_metal(agents): # Remove "[C]" and "C" from agents agents = [a for a in agents if a not in ["[C]", "C"]] - - if _yields == []: _yields = [None] * len(products) diff --git a/tests/test_extract.py b/tests/test_extract.py index e369b28b..de8f6e65 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -338,7 +338,7 @@ def test_rxn_string_and_is_mapped( [], False, ], - [ # Test that [C] is removed correctly when [Pd] is present + [ # Test that [C] is removed correctly when [Pd] is present "ord_dataset-0b70410902ae4139bd5d334881938f69", 262, None, @@ -349,7 +349,7 @@ def test_rxn_string_and_is_mapped( "[Pd]", ], ["O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)O)N1"], - 'C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]', + "C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]", [], False, ], From 957060aeb02610d179113059e72e3d37d5fce033 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 13:53:50 +0100 Subject: [PATCH 11/29] regen test data --- .../extract_config.json | 1 + .../uspto-grants-1976_01.parquet | Bin 835375 -> 835352 bytes .../uspto-grants-1979_09.parquet | Bin 319641 -> 319631 bytes .../uspto-grants-1980_08.parquet | Bin 403081 -> 403061 bytes .../uspto-grants-1982_08.parquet | Bin 580051 -> 580054 bytes .../uspto-grants-1986_09.parquet | Bin 453402 -> 453362 bytes .../uspto-grants-1995_11.parquet | Bin 678317 -> 678280 bytes .../uspto-grants-2001_07.parquet | Bin 2067902 -> 2067818 bytes .../extract_config.json | 1 + 9 files changed, 2 insertions(+) diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extract_config.json b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extract_config.json index a01d232b..a9b26011 100644 --- a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extract_config.json +++ b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extract_config.json @@ -340,6 +340,7 @@ "[Pd on-carbon]": "[Pd]", "[Pd].C": "[Pd]", "[Pd]/C": "[Pd]", + "[Pd]/[C]]": "[Pd]", "[TEA]": "OCCN(CCO)CCO", "[Ti-superoxide]": "O=[O-].[Ti]", "[[Pd].c1ccc(P(c2ccccc2)c2ccccc2)cc1]": "[Pd]c1ccc(P(c2ccccc2)c2ccccc2)cc1", diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1976_01.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1976_01.parquet index 20d01210209976cb7b5e63a5a5cd273619a21e52..90e2f09e9cc325aaaecde7a5ffc3cda563dace30 100644 GIT binary patch delta 2723 zcmZXUdvFuS8Nk1_@9>>5(n_tkMyBUf~*<*hQp1v!|Y13 z8Nf09Szz2jd@92f4ikS?&KMKp4zXs%mKV0)HjX>U0T zyc<&(5^M;w&N=q_@RkuR`K!C@*CQH=fs+&A_RB& z#e;(Qquj{dvoFqiEUqyLVS4_yMM}q5$IG0Z!w-1~;VP%Y;bLDEnPefL20}HN0IBj5 z0RpnYuC2_qqYt&Wa%}Zp-c{wNcWUy&xJus0!B1(CT>c>ty zD=?m*ya8JWE&O><9T&8RxpqUJ+tuf;&V^uEk(Q~rxufY^kf_#fEu>7~o)@1VOo9 zTjG{bySC0CH_uGeN&m7zi`rHhG~T=Q*vSU<%r+Z7TRMA*oh-VN73bes5Zo^;klVA`cL*c`_#Yt}&ed)RWPb_-|Jx(W1_+n!RNIawD3&TSga0d_ zNNOW^T#vOm-g-1y>-FZK?b;{a@he+r!67fS)s2WYL~D=yBjs)&ac?6os~kNhMX{3wm9Ybiz+IzGkbK%M+; zwly&iWiu^f@m3z5!dkj~r=Dvmc8iTUE1^#G<>0}hlEZwrh?LMQR^n-`MDw78R?n;1 z{Tp~7uK=E9y^~M7#6z2BflP&-D|m?^ zl_PeLDw||j(7=|x8{o+^&}syJ2FZe^4e+@W@WtYl_)nbo0feojUJOaJ&Kw)+*z3z? zno@J{{2-3-_={3nkzbS|6X}Vaevs~9oa|-%YIJe^KNr_WS){Pmv*~5c>D5n!#yJI~ z{u8Eg4$tM>c3Br2z%ToFarF#vi$^MYQgEG&5L2}*x)a!)QO8)9$uTzC1;bL_MH5-u zQIu0Ox#4!*K+{cGto4BV_QX5IPo6_L_}olfks{u8mV9Y%dI`v;Smv{*9BisKRFr}C zYstQ>tf?jw2^WX`)?J*Z2`@F#oR}R8gi+*F;*qy%4L%!Mq0RG=89#NR`d;0Jd^Ib8 ztc%^~AKrW;sX^xo6-8RZ@;Q+dKmH5Oudpaf%1b>*-+g_M$-sDU8w_otHQ)i|0iz0F_4&WPFQ73VBO3g>PHMszd zIK2t4Ef`!d&bnY6-kO%bMY}TIVf)I+mR9XTtX#DRvoe~4UpLXJ9Uio_z<5-!rZMn( zV*pW2pqfaPp~_MnMKy^kN0q0VOx5r~V?bEC&~~{iQ68LX0r|j^@g)Ej8x09a2{b;g zxAo6{BUtUb^+b6sp%!od4#k`7)XlpEr+Mf75#)gWOpF}ZoDlDGOkW)%$9hMQ_x0m3 z@_ye4(y8CwE#%9cS17rZ`Gdc_aYgq<-x3jA3D@@Pry@d@ z`P%-(c*8z@C?a^xeFsL6Z93T_xTI|-%j11+e626uBjidqzOIZ%{hRf7W2k>~RXn;J z(f>|2lrKj{pl$ai>NoZXX1!>ykSfzvyzjM7YlaOL?iEZDUBw$b?$EpT3X1%=qc$Er z+EF{~dp+h$SMdh7j_8T+3EAdbM@FFadMPFJ8mGS}hDk=v{CQdNB`;aJ^ju; zAv-pkdTFSyZ`4bB>6*^XbX|IK(_EXBY>~i{!LL{__s4Ty^!bRCAYaOqPcw28&24HC QpIP$yp%0Gt3hk!<0d#rNp#T5? delta 2758 zcmZXVdsGzH8Nj~_GdKgoWf^t`c3BqoG5dfluop!@7i9-FtTDT23~G>ibd{JFu_U!= zZ19mlEk{9_*zu4iCulH&VyF*%B(cXNrM4z%qIr5+Pc$`&rg~Cyq9>`Yy_@()3p3w0 z_k8E~-I+W0d)#et@=BbHYe4Qji=W z9+Z|dT)nRuugC@U-}CTOB1d8#acwI7JbGP-kuq;pi&<>SMMdXc!uMf|>ZP|o5bbCd z+l+uOyo2}WA{(-Vd&u*^X9e%cB~Iu|2eBp#f1RDvq|U5w_C(B6wa@H|6-jNIxMMNs zD}_Ecut)J0%<}8veU(r*_d>qraFkDPG0rUkByp-ky{2TLx^?q2IkGdcX!i49A!6mm zMucSV23(ZJsF8IYd`io5AZB3u+eB@vqsX^DgCVhlr zR6rDoNE)q6#k|0%fsoxr+7dg24hv0DACVvDL@kHSI_`y9{-8L?&8U7X<*@?ejmT@U z`7;xL5>)Fk?TRxmZPeo$_0%~bQf1R*#l!8&<|0HjZ`4A{3~sK-c|u;XOxvVXWVe|= zYBLiW=XA)tMx?|jUB*kbzbmVgZdkR2uF7dIxw|GIHcDKGor1YkA}5@%wSZBhcr*d+R&l)_cS+_P?H$()^iM6_UCP%e)k_iYs{Td`xmC17>vA`v zUu$2vm5kns;wLKbo!o^Eai$ADsaZT_#c$ZG>+>K|{>L2lOwCx!0>G9Mh{z`h1k7J& zGqIRlk`*M#*3Lu&;tK64PZ4@W8}yW=EuLodl&FD73D&ePJsNsfTkYL6DLW9)4n(rG zjKZVby3%u5&}3G%_X}sEe`waC92C~37R^D=Yrie}Q_}Y#t>KQh@=i2xKGKXoNWIq~ z1|#5E_<+2sVAYEP83g>H=J!d`vuf*Tl41%P8GK7XgP9HBbzRY#d|@<08}QlDChdf8 z`WmeAG>OVzM6=uw-vH$sKy2gi9w+CCsQ5%ZsK<9#wO7$d^S#PQ&3s~kczQxw(*c%? zL$o0AX&2{LRm4LxcIPpEuYBDGew4-4HJtKIfdHxFbr;3o5BC+#n72k(`d z$jn63kY=@ZPHI^Q(v0%OsgQ!l=82j4_t}8Vh5pm{!8BTRc_7s^%J9G}wrb~8p4^ov zLVgh`fW~fEbKU{?uVwfzobMXM%+|0NmT1XYpR8xkwkM5ua8O2w3AV+zgUoJ^=S*?sOo{ivPg30j*rjbLvDa75x>$dq@tiEyc)@eA z_HxHcJ_7aEyMA@5$lge5i|P%(^t~N65oFpgdcoP z`%g(h!EA$p#L8lR^8n{<#19!DU|X7qO}fab#4B&srj=UIW7>jJlJcknh4*gJHkTTM z`HJ*3%iDKk`0v{8$vj#bQK3 z#Y`aDddomsQC&*S0j+yl#JRQP3WF3zX^JmN{z)EL5`P=0{O~pmd~S`~tKaIKo@?14 z8dOir{}{fQ4nF)HkN6zg_9Lnq!EM^q>BU@lHb)CmtkoWwZd$W4FTOG_v@-AP^M-hR z(kUuIbITC;wT_&A7bya9QAX&vGvowlz?qG>z357;(j2SAo3jdg3L5)>JPl(6$c#iB zewYotGk`t4s)JRwRbNPFD}n>G?+0JPtc;f7bq3mqZ$E^nH`;(#rfF=T3@J}n27Xf+ zKva{cCR1govQ$&3PNK?D<*BApO*>Q>5Ptq(%1BSLyzw4=w7K&C#^%K=Ol8aR-WXcS%V={H{%9LD#)n?TO$lXp>a z_SvNIbH30YN{}!1P9W#??Fn-JtqJ6?{^>5EP(J)AC8zTLZwcLJbkm@4hq3$2gjq-R zg@b|<9o5?h1y@qnh`w`BC@^-7B#&RxRsGDM;4@yGNXGQsA;Be$y;ePbPWNTKaY%4V z-Ir^|qi442Jqh&8*5G*5Ii#PUFEn-zO+aIM(r&>mja{uBKdI{jy?VExNL?R<#-pK5 z{fPt`>b!G2+SRY`OQ2o-6HxD{{;%DF&Dc9S0UgzCdjuQZGj)&PPTsgVrq3A`vh=lk z1cOYi@gI5N!fl5m)Ip-w1cz;U!?0jfZYR?xTC%*sLX+u*ZOg{H?mY0Lt6nrL7>qj) zOyCFf=&+y|51gDpj_DhQX`qkk!^46*sqVCPuJcN+qNG zFPL=DdQjUt@W5k1Hngc35u2c^#RrS5$=Tz-6h-{1M&%N<$d2N(I! zl{6vVhe+5_NMZ&8)VOHLdrQ!u>H;H8xaPEspYHo%~CD%ipZ=mO;xS zi|p_@!Dj|<4FuAmeuP&pK)n&UFP45ckOKX*TeMW1X>`?cHj~rH>B|)}CrbUkuYj^F zb^Z!}or$DYN2E*K>~ov0f6U#zF)yMM`##{Kdh|dpiDGVKjxNmw33>b^0$zy^@9agFQ_P_%~K!tOw(Iuxb8q(o=4)G|aLu3?Wb zw5*YB8cfReN1HbD^EfkZS4$6-)>)=!N+VmefeG1~*GOWCzFKGRnpo0MjIJ_4F}kyk zb9M7Uy10P_n!kZ9WIpItUp^W+cK11=6JrMPunUQMnd~1Jjc8i?;wPituu(!^T;m;d zVFSBKM?xc0M<+>Bge9krG#;bg2tLhLz7@f_C{EiW*vTFf$tpxwl2wTAB&)PJbKKK8 z?!}pO{#zdumTZMWlo5cCDx?80c^0?6Mdqp2^hc>phD|f7FK07sWhZp54RMsacup<_ zEw@cBizgR&j6kje?HQnMi1QbNIvP5POTREe0X)nxc#HvTVGvZePgx%2Kkf`d28696}Z@LtA_*b1f-t9D!Bs~nk8}rOS3N^o@Nb) zvZ<@G3`n=SM$l0N>?i#a@LNICjhqV-{7l_#{|PAx-lAy3JxQP)ZTi~A0-5IS^I|tI zLb7yZrOfgo>YTRqJ%(c3W&vWCtHcd}#YWx`2s!|Oa~n@CA&YHz5Q^I@$=I$?XKdo= z63XQ^FR5V$XNL&PG^t@8*Ct`UMGZ0hkN_WRS3_?AzY3vzzBf?fjxq-ZjNqLRD&!(W z4o)7#?l6*I5MK!+CtSsyVPxl5N1!@y8qb7Lp)ftFk*t4;MRV~9fD?)09Uut9vpC=-0z1B$20`7~DyY?(- zD1qmQzyw~8pgf&2j$Nl?Ze>cc0LeZQRa2}R>08+-1M1)UgW#I+&vix~1p zMm995gYo@VA<~K;jB9)me_tW*x%gKj37oH>qh#bN1?9m%xJ^O1WP6f~g7{rB3gY!- Mv^L9pKo5-n02q}2H2?qr diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1980_08.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1980_08.parquet index c363934b6cfe0188dd9900baf2220a00311df796..da8960319c18bdc0bb8ed5c72f404cd36eb805a8 100644 GIT binary patch delta 3088 zcmZWr4{#LK8UNm8-*S7o+q>JlT`p(H<#Jwf$0a8BkTV35k}MY!qIUsGEG9QLY;(cL zoK1vCu_dH57^xG5WmrrBAvXLQR3IrR>J%+nh!oR^w8jooQCjWTn1W)hdiEVK9h;kZ zJGi2pbBB@mg1=Dgs=G!yo z&bbfd(2p|X7h%E)*y-K&dLF5RT31=?G+gL?4TP2o&IQqTj)#XTirK{?kd;78;C+74 zBdXEn?0S*hpRz?lByeMp-NzgepDfEO{U|fEg2APgF_9ycDIfld38H0J5WYiy*Zl!* zegg+2oC0u#yRhn3rn&&SSI{u2Hw%#mah8ZIS9K54RwauYxBNm$|n$)I+rdRoehBuV_-gwdK{n#Ck<{)Jn*mAm(7M`5T%10*CJw>*&tRB+c_#k2hjfiF^D5R1Be_6Y-E19=c?5x>wz z4``ClT_Tfm%7`orCmo~^z%&?h#<|L$g56~0KQUPuV|mmcMN1Qxc+vp`x7o#Fkqh4? ztX8?{g8vZYk=O*u?6|;>bFO@nSWhS_HX~x)l8RR{v)KN$N z9!uG2E6GoV^_F+EWA8Pd5lCsu-WUd)$K2i8wgAA~yh`5~3Gj-Yf7s4Mi`t5StV?Ni z(@UTZPxtG$+Vzw`X_rSr#k9z2J!}^8~IZem`p7#-G8Jd z_gefNsy#l$Nv+vsUr#6Vzm`1fVtH(Angv7m$?>4u~TKiX68WNo^ zqtavtggPJ@flvhS$?7|d;d0s@{3qRG$xaA$rjWJ>GfHMo%<@RoR|bW>3wAN9zqMT2}B>QLAvhXJr#Ci|2*g~0K7Xnjkz?9X;=rM1hZOd zjr1^M(sq$=w}20u^)w&hplDR+sueZ}wPc8Q$E^sgxb;8*Za_F*taf{)u47IR_8Tpg zD+9A)Sq7rhwGVbLW1`Vb59&z8jT)s?z19x#qp<0hfIWL|Bh?U;i3p)BS$Iz8e#X1I zdNGg7sve@k?yLvWm)iR-%F^q`npk5^$b=@k0C#znRHJT4l`u<7leQzJG0{cYsIo$` z2R8MD{%cw@}#$g$aXeU4h{ z4<0hbdm-tBG5S6G?L{*KW`2o5(DX)`R{nDpdi-$J4uSHb!ldyLt)I+WZ?5Bp2c$ojLW(V#~eUKeP za;{gW_IkxG(HY*DylT8LoL7zfNNU$fGydOr&Wwc;r@pG@_JLeiat%XAVht01Gr0&4 zF7i&lv?{m%R#a-Fs#1a}YHI>9MM<784V*CLm-1+xBP9hz2_~Bi=_8B2%4;(8zoMI< z+A6s6Y=-cqf!2o9pt?LWn1#6q7pn6;B2v@_Pa|5r;i%`)qaH*r9lbK>#T-BC!7Dz- z|2~YQOPb5XN}*G&*`hH^ge(Ww+}e^@bzeyd5p_~8FMT$Iq)|r}NT*uog#fa$>3<4{ zI+}X^R_y^3>&%%LM$$M>J+`h#Q(Y$dv}a6go;cB$28}q#_ z;hKt^8r4cHtX=z=g|*mEji%v;v^{jDbZ9&c+47-FTV$nEC+VUhwWfxCe7#n0WkoGu zrJ)~R@1BaE)1p>3UpjX+4LScr-i^>t)2R}*rcV9Ej++kereafTszaaljg_8J-`+H2 zSeqrVZZxbd71#pZ@BwX7VC~+L8yY;c6-gsAsltX26i*GAeEYhq_NG9WOun6l_i3LB zl;ggCrXg>9bz^3^jkSnxe3gb@?9rCnSdWh0AGNXhy04$mp0}~?)P0QF=%n_gjm@LC lY&%F(mWswg;bicwfrS4l`c}M)@*}C3{qT{ zgKk4{LHq-CRt6(hTLhI;{shayB8tbKMcG{h^sI`y;Ln1`tp&B7b=k0QKs;`<=iE6j zFWso}EAu-6Hv*c&S`twu zDBm8r4Q5&lXw&15>1N4r-8!# z$pZKCADBVedF(UM6mM2~jzE}|Aind=!5ULT3uZ6Sp?AlYEiBI=1V&6E|b<6?}v(0b^UY|wk;Hj5R zjt~!LrvT?w`^3tOOBgT%WyT9(GU6sUNb*l|#f=J%lG`c@ztf2;0N72l+&3__cY(m# zxer;p(GpFB5@=TX7>ApHR2$0B0{4dw0VjjWaEN=xW{cxcP?_$5ke$IZp{wf9o#^?` zqBXNvTm?B6=CdFlT{hV`e&$mUTZ-`0(DN#Z;B)SDA^J4^6)k5=OK0zEMGIi7V+tdqEMe#!;jf%4JB%kFWEOqf@6Z#JMf5`fg zno4zQkw;9}s`uG(sjeT0b}XU-L)?&Ey{pJ=yiVY5QL{zkO{)YGNr5y){h(;BwaURr z(6IA%?`((o3HOj%?<@)XeWa6#0sAPJ(7KgHT#FOtvKFb_#!cj}{L~I%%3gW}Zwx1% z`-(*2PO6VNE3R2nOhO@*t1NX9Lh8_7`K^I=pxSe_&|M2nUPG=EBApQM8mE?rys`UKgGb3^ z3b{-n-h`+dm#EaaDj=ND$I3GFo=)%==@oNme2so3stYGkmb7;nVGOs}>WMZ6Nq+?s z)qk~7H^)8!e6`*(M0#1c?=S!#D4e7O6QO2h(xir#Fd236uz1p4kx;@4nE~X#pdzcU zdB%BiEaPUGz@5MD!Gv7XkQgMqA}%-mvW8W?Bfa=xk(+OWAZ0ZXkNif_k|cI(91v+M z5T|21;#L(|nw1ti)%u)F%|H|msc#ocnJ-x2do8xw1?8h#TxM)FEPAr5(U?dywHt6) zk2FH!T?DTzg49cJ-~B-OdxE$g!U`54veHG3U(#y~JXJr9qpD+zNpmil0eFqB*UiF% zYHlG!7DC`AeSS9S>?D9Ms3fVv(ORJXmK(CfhAgSK zpsoa_&OeM&6Z9bljyfXlZW9 zJzy9n(;vwC40U(s1}&GwJ!s+H@EV(^=9U|Nlx@omZ7-N&4P$yo`m#-iTp1iKgSxkJ zacWaMP;_charQqj&O-Q*3}384c}p^x+r#$mVN1ty<4~tlZ<=H>oV(uqf2HJlbM{5U zKp9YF=e8E`LuNt~sZlNTj|^U0yye>BbG#v!K}32pi2D4a+Ziglv*7yW4+Hw;z$mK%N5f825Dhsawuu-rs?w!H^52Q_+bL0GEe+Y=Efq zxb)~ek7#j;;Bqpp^V-sP+*nnGMT4}OlaB8~(z{NgbadWurDvC;Jgw1}v`+6v(f~)a z$eGJ(=F863t<~1Dbcwib?T^@HXhl|goj}5hY##KzcAlmE!uK2VkT?BK}DiMf3FLG5QYf(-8S4M)DzuG5h(kK`f8 zwFe0z9e+O$+4reW+i9l-?SzfC3uHIk=xz3IRUT4J z0t`QUeyz5{PK(<8cG@PKUt2O9@6)yuFGAmjJfvScV<)xg-<*e>(0n{CqZ3*!PnVIU zohP4oC!QNM-1g?<+D4wfLfCvf59-%GAWB02Kl6|QO>hu%1OLuLR;|+F4%%-Z>tY98 zYIyn~?QsV!>+1osN2j&V9CQg;Oack(yjCTU>Yv|SmDfNoYbhOi`I$UO`u&~_0{x!- Ezd20&U;qFB diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1982_08.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1982_08.parquet index ce0dc1f702b6993f14177e511bd5823c328906a6..21677838771165edba7d8a30ef5a54fa1c2149f1 100644 GIT binary patch delta 1414 zcmZXTZ%i9y9LIl8>9tS6RyOa>Qn3TtLjPQ8dm0u9jR_ruXhO}17^!BmB8iP5!51W3 zVrbBf$w(Yt;-f?>5aw7V8(6~BZAkxsX8K}*gb>pg@%*cLO?j z5a+-#^zJKG(FbKC#Ej6Fh4Ki*M!?jJ2Y!Dd7a0W8rGBR!wMt`7-dU{QwBl$qVCku1p42+K zMn5(JGTF^yRNgmXgh>uDHdCK;*ZKOfq8oZF^L4B%>I~BAE{ww4PS4Wp>3w4mr4AH>~^TszaejulIuR;>(iYc@ABZ-t;tk&iXrD zruDX#&(5@! zi_G~VV;BT)+t~YpFd+ztN<&parKMu1bX59Vf`E0>vom)z{Fc22_`NCA-U3ilOAi2W z8UScaknj)K#wJ!&q(Cmpr0|o99Fe;+Im)ZZBsr4FWKl&HiD!e7#kfW}!3H@ilZ|N= znI)geWOiOfO60ywO5do+F43p4o7>G&QZWB-)kV^k#$I+Yrz$Oxi8QvOC9;&pjcAqZ zq;WmFdZ|J=>I}gdEU+^ls>l@S&frFND&fEUxCMQw9=Kfk%Pwc>tszP>uV~KC&f+r>GOo$qVqf??OV8k-pWvHhp;pwKzf4Z6&}D6B8G zA+7^gH`~qBH|3;Ha5KzUTj^$J4ODTmHfd6aD?=1RH^#P97bk=iN8Rk-~=cqq4y*hZMgfFje)RB96ur`BhWhnN<kF3 zg67L1CM0(En|JedQNJtdH|QCu@1h8jDS#0G5%kA;%qnn^-A1DHp1*jFAM}m{G z9<^}cz~MmHW-M+fbTEN%Ly(I~k-mLzS^u)(kgLzQaSQkpZ`UXo^r>?h+lyE1we0;-iL~6-T3fA*g3xV#N02 zLk2)bi$#ipVt|AJVmxB)#sl)K*!A+GX6O(`>p53c4AcGBZ7Gkd8(B>!>fxa2V}WT8 z8vhU!0nw2Z5QwHLt=1X&+vaHf9SJ&&Cz`;Oxhs`<_)HCN=WC_EkyBDG?FwF!Z@VLw z4_}K_q01}nJ_p;Ttg!tnZ0)Y1~+lpe>5^iqtr%7j)-p}*2h8~k)D(Eq}n#D~hMjmFdlZ#F4(Qa$Z(K{&? zG?E`Cr*fDhT{&FG<%jobD+m6o9Mj0nng1()Rx5ALXgp6=bMznQ$)*L`+Zx%ufNk{i kjyg+_m^w?4tLki$JY2vH{A8J~@!}IQFW=DL!9SV)1qgCxv;Y7A diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1986_09.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1986_09.parquet index 820357f6e63c8a0fe8ff0419f4857f8e407b5d77..3e30ed0f9136b13ea14f9c7b5d5740d5c49e984d 100644 GIT binary patch delta 2599 zcmZvddvFuy5y1D8@6I|~I!R~UVac*A*?JmeTW1i2Al25@itV%#1I4tp#iPPXa2!o| zISqu^Y0ayp=5cY4DjBF9n^F(BOd#M8hcOHg+GYq38EiTMC&B$EjOZku(j-On6xtcE zIJhZ};}wyZH5jFGmZGKCRJqokL8@HH#TmF~AUv9AMkej)gQ2WbLU*+2Ev% zYX`6^&ttXW6XyV(#P}dicnWj5We`^aWs|EzF?TGsC*D$l;)@|^2i#s`!GbG2Ps#dM zDHg7sS?QSppf=aMJ`W*v%pHlu^xu&Fjk20%#Ck~Ru0sX!_kr=pD*zvs`)g<0bc*`Z z2yg$V?-T}kXBI+eO9UKSR>=8Q=h~+UO~8L$=GOpT+HzRoGl9Pv>d!%3jV{^Y^m-Iu z1r1A~fdwg>xE-u3jGVf9bHfIxSwJ_*d$LRsccd{^8;->G{~-%svg8iM<;D*HX9r-a zx7BTB=gJk{kTX=>u%iwm3FZY-#Z?1tK$GxPogvQniyZQ|z$3eXck-2z4{tS4BU)<|p0yoPY1Fvc?Dd>M9twa)Q|_cfxXoJ8 zlz`6|^65($Wj0rkE^|B>oC)Lul9*qv<^m9vUe3R6HDpDX0;dOj)vY2N_fi8IZgCnh z71sImYr?lP+RS1;<_}Xn3e4d${#gx9>ODE|ls{KTV{R9+P=jyp%}3srx8&SU(#J4g zw~#d1Uac z*QIC}qydZ+r&E;hJa3zC#s3ISW+2c7R7pDRbwXUff$Fd+m(s)9vYj?vS74V=SWt#` z3O^|LKuuZHjSFjMh8t)LUhgY33pztLdPVq?p$KgiJ~kNhj-N-&m-@%Y`Q|%?0huv1 zrJ$!2sRZZ}@oynzhyBu|j;DsQ2ZyrV31Ll9&++VjE+7Fm!yuofAFDjt*A_OM=OnPR0>z0`U3C&LQtEGs*P+kOEyrZ z{Pf z5qE&@%T0e!=KsB*xR86fZ##Q-^d{ugeO8bS<;!DJ! z0eJd`kSez2AF$*1&OB)I!fB1GC)wYijzqY15o)HephSm6p`pZzYJ^uyp3(075HY># z$J>1&B3$vz=P9N+etM|j6pioC)nwg~mQ!xVCb8~N6mm0?yP1nowwh25_cUb4H< zykGX(xAoeQq%ujfB*i4fPxjggasVGXhuA~n(JKTyci%b#qtG#+%O_Vp*Bo2FyrP0* zWbCkty>%Y3Zx!f(-M4Om2S8_#{!_rnq=ws$cZmA0qZ6wPbR<3r)A_5G$_| zB}~VG3^H+8eBmll%uF24KzE14UI}UM4rQRd;+PaqyZ2lMIVh^GN#x+i8Kh4Pq)Fe+ z|3fxkBTCu6Nr?=ro+vIkDSmK`n59jo*^_m5iVviCs8cj22%GHUZm}*wn6wvn%cdLl ziJKF|ET(Ty1{q0;rxS#g8A+O^JB^Kt?omQ3Dn);96-n-Qb{jYm~4t2ai~%ld~7a+oJ>{mR*<9I(spLcZ@ya z&!>6ESOy>YT>Q;-!t5FOJOg!g&M`1K-`zg>+MmakuV4LQ^P|gmcrJTEBVDX6@y>O^ zE=zQa${U1*Np#Pdp5E|TF(4uB@YxKsTYT*XVb$*bEQ9P5Po~MfuQEud_>VN{?C?xK zC@wn32n&jf!7(YLzx+;od5kcppYbumX1)8a=YOS;udHrvT_p{Y`@gl|Y;TrS%iVXy eNy!v-iHdQ;%yzve9r5JdW0P;3J$F7;XzpXbMIEVP(6&3J3S{`_cR|8yGaWbfXtZHq(rMgKG zA<2Ip#7fv=2eAmi&738w>rb5l^eaNDB`IjEZ34#NPX|_|)&--!XmoEg_!yfgL#+Ws z?8U&TB5+O+wJZGVptDG`sA)!te;RK8(;i&06T`^x(ITHzf8un!&v)KAu zh%mrcL>wTRfb3CMRL^q#Ez|ps=mhe9;0*$@`b-eqS_q!4`D&@r+eA|Os*3kj6?!1^ z8=5r%8&s4-4bJ`wBpC2B(7PVRHbLD|s1qQz7-DX4zGD4KTmE|82B@6Nxzuf+*uuVW zC|X?~jvd_7rB7PD>Us`Ima*&g;^HR~D0Y@|)$-`HX8Ve4qN1Md&=8(N z>uDW$*|Tbp*osUeNB9a+9{b~i>#|X)f1O%f_{8&30}X7yng~_Y_hom^!8NH^@zg%(xMsIU98gRBYMw#ol7??aNj*OG%4$tNsU z!>$X6-(r4tN-wUIPdNCK4=u&2R(4Xu#`Snq+xtLlhu+IjyY|WStxgKR9as00#NqVg zhtqw3(5rjfAW;mPib4Dd(szpd|7L^iXD?-lOMw@F;^ydx>RG0kKA`8wjsOp04UoCS z!dLyv0ZR_ItMngqsTM%n^+$Ek%SzTS^^q6gG9J?p4}UHn$tlubHN=V`;s&xNF+z5Z zL{=$n%Lsu#WrE6h;Cn(OO?A}5;A@3klkbo@TS03-{bE1uymxe%Ol(9?pV5tI;sKBk z*ezOO!Gg&1u;VSX)s@)5G;d%c??b`_?Iv)l6BFsTCh+f!s*aDU$aNLvwD>JVI?y|K zRB}Kg008@Ui`rA=k@am$-8}zo6Dd#QP45bS*Ht!#n-}YwpCck z^vwh@aXW4Fc3NZ%oMiJPsw`;QFBdzUNR;aw&u4A9i1@?0XGPR^vfHs)<%w<2OsqlA zJ~3>zr!};vZEYi@#^{Hp#3pFiqJFyDLRdQQiwmjgfWH)+j!NU5G)HtK0UPzZ+ZY!}`XgA2!ZXYW7X^VSX ziyPsk!Yd6g0x$A(i<_d4T0TC9gz+LH2>+Zv*AHN{;J+#0Rk*3Hxbof=%DDK<1tc6Z z;~nwWrta9h8(Dabkq2Woy<*Mk==$Zw#U;E-=+O$tE+U~{hrRHgsc!vB=2gnU_rI4y zy1NwVDwWH3ccq}7vq}p_hMu!2XiVwG@eE_tOJ-#j{YCAWPr*S zCU0TVqy0ujdOub64NwJ!-cM7!;>sY7hvLek0m`L%cdufPQ#Ql9dsR~pKdCHmaQ_+EYNn}u_4^leAfqNK}gUI`0~G?H^9z zd&ZU3*C@MT&v**DuEa1w*Oi`Ylv{OZzp`ftAM8;w(4qaar{=Kp+!OBfJ(bbXp6Yz(&;}2)d+WVf| z^L=~goW1wiCp~@c#eMGH9mbSnT{8aTp*j4q6drg5uLS5K=(;G?uTf~p1><`~@{!rz zN1*u`Xf^{sVay5$lt5BHpEpK)WT(l6#**_UXX@^FJlg++wOKEUF4650Ib#BEfI(aG z*gTS94x*#vNwYiQMj{TB{v8K~Ahv{bnLE)bQe}A#X~|zKHl!d|ET78O8OdMM>(OxH zwieBe9+0;jsND)!XSHBe^9f*|H~D|hjx&KTqt6gI3I0L;4DMh;LW9uY;Rnle1;GpK zw~5z$^1v*?0PG8Df02epvnu>ZnKfHCS+_AN7d;*#_`vp9_B znAaqW?uO5IgRljnWd6`3;}tbbJzjq%dI)(Bq39YJm9N&w?8(jNi8@1{vQ2@FsHW$N zt}DU2PWj%y62#oFt7)@AD;WQbSNS60O$Q|CuYw>bT>zDR?= z;l5O=x`Y>icL+KV)LDC?1iTK?tbK-^R7hXrL!sC$S#!s?6@$<4`b@r7S zuo_a>6KahpiXIm`*KBMxCseq_P}5#D?(yFkavUe41Cf=0bGZX-Boj^;fh{!SLm;p( zvFv7!z0GLcG|zD%?gFW^8B7fZt}G#)Ccehw7Hb4PE3z8crX1MXj6lGwuY98FUZ~?r)-0#jnk?o;B?e3yb}umwxJDq9(k;Sb$Mqv>_Btyn+d_k zvxV1taVn}6rpH*O4FNZRhD+0MEv)sX{X3ddI_FmB1%s?-JDbM5A1{>O->=<+8ib{Ag05hu`Ige8Ja|~OMJWaa zc2kzVDqKi|739`tB@~K+dsd5+Q<%8fxwy>ut$xHJxqXFoJ6>!&orH>~EW!5{@C)eWXcIIx!B=&F>+r_B@yI@T*8aT6Q^1c#HVEnV88!rX z31>jh}FhT(=t>g)B7T zg~GEK7xAUFDgXmFS;cm#(MWDK%SstY4j%8PB;S$#XkTJckp0@Mo0fZd1iLE@g2qVv zm`NavjHN*JEH08)KhsV>;)y)U@(54BA9@^7|+$HCM|!+{s;4m#=tnaFe*kt&0LY*xYmO-Qx<|y z9GhD5-JE0qBP%eD)1C`L#*hi_LYbzJEO)yn{h-v>$gxaHt{+H`Gk*v97sb^x|C2cY5( zbXThGA2obDt0fV)wp4gk5LgGiMF>Lq_KLU3*?fxz*}IWlmXmmhh z{RmvfvreSW6m9o9BW(lhhqNIWymyJB?G-5BQ?WR%trSi;Ve0tk7IpU)wY|G# zv(N*LI%qzF@DK=^wPZy>KGTt+>qt2_4_fb%qXly#Vg1U7ZMyd}p@!X;X?!^iLMpY~ z7rLBPey-Wb>Sk!LJ6bJYP_3#KhRHzBUSd?Ig?0dUX?iBTI_`Bw)G7}K@!94$6lnzD zjpt4BkQ@D3>8fyJxxB&C=p$P_CaovcY`3tJtsB4YnvxW01=aB49CF^XO-~+#5ba%U zV-UADKKcp&_9uMQmEPteTZA&rVUw07Tb%A*;Ex`0b{!#ig)zuTstY41nd~gIsnaSr zkDA@jMYra(ZYBRH9G1G;*;WDgwF}{B2vniAW{5hW(HW~dZC_xpMWH>`E8{=oew@{= z)ikSyzGcO`kalWobhhTc1nHMajCT?vb*@t6O1-NTJjVz3_bT=mK;#<6Bck^Bx9p^^ zWE6USrH^OUBu4LvaSgHfW-Cvblq0jl+&uDz&y9M?abHz=bVlm`DJN#6l5lCdCQ%tp zR2I|cayYph?5(7;G?Vm}X392Ta;9`5I?`7-bUh--pG7qORH}*Dhp($Z!Db{S#Qp$Ahr*X%zhwXfMpYq>G+30d@Kq?XRt z{vchoGEi*XB_|)2TPKc)i{6ti`urSzMQQsS$io$S5~;|X>xqy4deTN+ZBqOadF&Ul z{}=L&O^W$5D;LeIL{w!|OQxGQ%mBJ8jU`@sd8N^Y z2fU2t*>SZSrSS;{9t?U_{j{k~Pd!>tAn-EXu@v2jH)XmV4r)a?{mt5?A%4cDZU6AU zAT#}p(R8auitrsh1CXnGrI!PYTX%IYg^s2CBnrFsNrwVV4!>*P0JBp!CArAhWOQ_k zj2(4IA(63~I-&!Z^S+n;5K0LRWs;>1kVXWE`knnp4SS(4n)EamqfvDIKk3 htg-RS7`j=~RxwuH=3CUyPYX@A;e(wctC%TC{{edCZ$leY4Zw^KTT=!+E6GYg(@grWu>!@9#$q} z8Jl1Q2Lg`2_dsUGkb?BsY6Yo^ZVD99xruC3$(B}(mU8}jEg0%XY`W6}In0!jmpHLzX;~Y*_TFW@4 z3FRVg+RXr&kxj;U{iM*F6p560D=Q70%M$t8b2%wrskO_7zrS z>_|RCFBCe_B>HJ#o#rDgjoBKCxQ5b>a=`QZt^=+>e@?Fhu>wB;v$kwO5z*Q{n?7mt zp#;5P^A-$bIj?o%JoZO>sJ; zo27$SIVBjpUk-!+vYZF10rnbz__XM+_8o$}3f3*rsL#Uiv-Az8wd-fCzfxa7p%;+) zpoYoFgBo{U@;o>DBIxpPRWyMXNFc93*oD5m_K18i6I)SJ|9L5`4{g`J_?})VYjJf1 zGGogD-J(0H%an_m-_}f!Z-TS|4&JYAT-idl31%g%G&e16ksJ~vEZJH%C7&3Ot-JPh ztzTJoSwGo`uM0z3U5|V>h#r}J(*Twl%nPO31vddM!X5aTz{+B5ocO*T=W_LBx+YvH zHVu9ViNNjjiDF0Bn!y%Q;uIQ<(gqvx7@=*}t&AW~M-KjJK*$+dHnaZ>>pooL?)xJ-DGw3omc zD-f?8zW_2>Nyr|7jPOQ3D~Y0n@P>=k_q10s@MGw>^G!~1p(++OC@085i!g;QcbU*;UctGeiGJRx6jq1JrhegELXFd zH%>eyU=Lb_59;GfOXXdCxBx}uZ;euA+Y|Kz7zN%aEGNzp_g`4@+24_Z?s>R0)RpBt z+x_+VFJ{YiH@56{RIjlv(4ySZ1$G$O40yA9oBrmdrIR}7>XH(aMY~H(9Ty_AmL&Z@ zX$1KPaE1}}P5TwCO*?o(*Q-_HGT9X{*SAzM+Ty~dfWA_Z{VSGxg=>Yv*UE6M^|E>P zTF$-0`4rx0eIXZ3yJJ1V@*9B5ohPu4Z1T#E(Yov~z#WEz5$bT~7xzqTUkTz|5JTMIkh%EI4sW`cpN(vzojo1TWw-mTT4 zjER5H$}0fZN3d|wfNQ}gf6v@KIr=q-=E7i!P?GNp`q+|L+_)kOIeOM*$l2^gmrUXs ztV3s*Zo@iJK)UXIoU#n>7s?29;tldDj3EdlxsPeH+K58LK}qM>7u zaQLFq80z2N=|J0&4|2DwC+d3~i1g^bOO^`$N37fysa4uo?5 zinZ16Xz+oEd94`|p~W04a#RM2)kTkcZ$V$DqEEhQX%Q}pvI@I*bPZ%kE3{awALh$8 zAeXX*Cf)68rn?`qf5#eK$jNg?At))7o2{GZJ-&SOOS;>)-~6Zr+n3y?mDh}#Ew9S? zfRoN2NHN;zuhghAUGMj!1@w^L5?xU%aK_@hYC8*o)oD8;t;enucM_0m!F?`ZEh6`1 zn-bw>Wkm~#+RL65b091|P$x%$R2tN}(r6v!13Sx;F%p<0O`IT{&6^MpMpwY+KX!-x z@KKGp+w;W>?ewnHU5+eiU?{Mkn@e8PlE+-E9}nLRP`0-Ubi1r;_`x;!`$<~{c8L?v z;n+)%j6$pe`Xag;M-0CjxH|{;uB{2_Ai4)cha88xL$Sx`E9DO3M`ZnLP+QgTsUz#Q1@2SN~AmKqM&txwL}ixXa;CA7mwrf$qdxq0U%n>+qi@u9oPC| ze$258xZ@_eFK9EB7T6sQvdDSxgPtY1?VCY2Ynq4tI{1Wn%M?hkD>*$M3D0G#zY~xA zPE?DF(#13_*BVdQOks9qF7vOCs3*NWC+UbhADL*RvK<-dBb6?L6cd65a!gQrCv5Je zZ&ub6?DVE%fVU5|s0q*|(n(Og(BTEJ5;9OZ$&s1&@?nj5MfhSGA5P^K^VWf`Y0sb; zM7FaOst)b{B}sOW<0(7l=NsJ7F6)j3S{^iXwRi##D{U^D-GHYOJ#H_*Z9npx_j+Q={ zxgB*y4N!#HVQEgVr z-1%h%`fqV&&*ScEn0bH0OvF^fG=nL}6f@0an#ELmv|*+$`C;KpJsN3ujRmB)JE}qe zmO|D}LDaBBc|J~EsNt=)laFeo6CQ^6xDSnmJeM*>c$ew9U;i6RLxi_>T|KOk(s|7J zxp&7w&V7suMZHA?i&@ut=N+9Z7gtnF7B$l8eCgoNw=-)MACjIQ{83%Z5Lt_Ay^+!=Hj< z85sV>q*S-U=cPsz-YBuzSittnp98$LywSvFV*%$<4Yj;iJa=uJ<&>Jbzn1rir_|6` zr&GsLCu@1Pc>v)ez{bZcw&D8QbzF2&7>(sH~_WUu` zQ^yx+*!VJY*6jW?b)}9kk#>KY7z^Dy*qHjDj&Tu=v8}S%SirSZYmCu&?fr3<6C3B< za6QCaB{m!D`uKyN1Kwc(kecJ`(WGg+S5D9_|Q=$+JUGx=gRte(kxHT$op4$kDA dqvOk?)~l)a*^~{r^?aV@{F$SkdVWdnKLDKdi6Q_1 diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-2001_07.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-2001_07.parquet index d263b9333338a202d2fc89e30b4107f14c0e15a5..1a9f4dc4dedba9e45420020010220a424803b7fc 100644 GIT binary patch delta 6727 zcmZWt30zah);|+-Avc5}xo|HKATfbJ0+NyoL_~>_l*l3pVnt<9FcK@asVw4FrMOnD zOC4<#6-BGI*4p~eT3f4CTeZGcUw!Ufp7zyNt)-jwbxpsC_Iv~ChalEstEWO6)H1T~J7Rtn0I2{g#@}`@Tay_)#LYT~^9f;A}rh%ca&IbvlDw)Sv zhWC+iMKL*Nk+1cAVAn{~Mbo+Vc!txRunz#nbHxzmJpiH_nA!HP<<3Mvo0)!yUECxZ z03GbAB(D@jqaEZ#kujzjx3P8=@L8%_q(@)#cg211UL!Y|gciH=cZccZs#xW;tT4TX8K`g-W~KFHB5@gX zghTESa;&7JO&ko)F?dcSa*8l>I4ns($Y1n>I1E~IK}-vNu;M46=5<~L#;$w&Xf~ir zVm2%(ff*K&Lc*cG9*Q2Xw=H6A36MG~Oe_c*`ni9Hnv=o0Kq9t)AzYVeO#-tG#0&s| zdzX!JSR8o{hyEaLlR8bdu4sOgmg^Vb4hk3Jt1@p>+P7qUr!!Sr&OL)4N^mT4^II|)!6jPVf zx{Fnw=+;sw{vFWg3U{6(FSm`%bU#cT^{{&)+D=l+^og@M(F$EXQjZ01(n5`GODNYJ z#;p&N8SPEj9|dk)OCBrhr_}J=07dSjDdK#n z;tc?edA+3k3YeDo9tJ#HGxA^cDnn$v$z^dkJYqBciR>6T58Wkc6*}|=DXmCGhe%6> zIpRW&T?G4DXc`O!aXGh>JdA?M}{^oLKZ7$SK=$#@h*?W=b?+nG8~i673ImQModz- zIqhdu;6g9skCnQo1pBbc-H8eZ4Uk-cC`?J>3>jvUh>s`Q2a94X3WP&vIOxKdb4P#b zAD|Y`g)d6sv>PaGq!aC zqGqqi0AAqP#aZ~#6x0c>?^bo~0Jrc|oWkR=m^3%C=;r3CF~nVa2k+&=s zAP;X(1`!V#IN%6BktXYHK}TShh>^j#jo*ynV){uPj)wijUdi>HZ+!Plu;A6X)Q~ua zp7RKs;ovodN7z`P+0?f zzGfWxqtb`UiLXi*wZ;vOT$`3Tl&_{km<9zaZRuodRT}z~oT-YjeVZlAbeOm-JI(X6 z`)$9$e^Cd_Vg62g2*VY#0FR|Q+|DqB9EPdevqDTO?3Z0QIvVd|xvQ+bmKdrV!uz`d z|0P?0vY~oF{EZ@T=eC6K#~3y{&fZVOXm!KA69I1=h|iGksteE>(!WL@dbB)nwA_A_ z)YfE&UMdedj1CjtMWSl;vMW_IIEY&&5?gIGnnBjr7WHi$hh5{Oo-FQ}5m644 zC)<;SZ%0ULMEs@--;NjFCepgW?2m;s*i0_BpC-fV79t1vXI+c*L<8PPO2!|d^+vd7 zFZvfb>bam+*D|rGi_-Zrnquk%hC?k()*hr9q~XcmCOi@`WCETY7c+voezVV`-r3F9 z>k#^?yT3OC$>ZAbN6%4rsTfLzPMm`FlGi8NQNQkACmN9IwHFw6Us?Vvs*uIoS-hd! zGHEcPmTGKB)qJ}y^@^Rk*4taDQ(d$31Gt+@)T8pit^dxf8B7i|Oh@k$adNVJ!d^xM zSKS`&2{L~2eBt*s@DP48532EseY)*9;Wc66P$7Z*HhC6WMW#QtK-zSW+dJ)(r1M!E(P-4TfdhJe{`*>b}CWSgwV|JIzeCqOhi_(3DyC zqnDPHYUtV9q~1&VHtOQm%JHo*?wc^N-#r^WWw%O#fvTX;)|XT__H7H=x7>acbUDeB z)cIea$MG(CFrMWFO&Oejx8bT79URIVUqZ!7@O}VHhOUWWbsfk7ih0$Ig5`Pxo1@#Zl=d-DF$}~p!Srv5qN~L}rb@$)YIWa>7d^Ix2q&iGPFjg+I@Gzi>+#@L zfAvak#t3EtJkql?OuaPB`;R{tI6EiM2n~(!^Ge~SiTf-qznfc(v0oSE@z9D!#LuMQ zU*+nRDd58&ZPstTz&QrBdA9*}9g?|RdF((Q8%lm^DGoJ9=D)6w7fI37L}Vk6)BpA4*{K;h z^)WsVnD1io$)H{M$9*M$*N5TRG5Ep(B*X7SfI~PGty#me@lumJR^XCkSkB@T|dV@%`Be!0Y1^ZPs;GTM#s@Kl0tuo z6T3n6{6ZeM#xSl+TTu+LeHpp)a0K7Kk_EJ_d=XSOJrAk`>wZajA?SvkF70EwD2@nP z7}Y!XAp7sNs{L$rHfN{NXA)RH2bF_5hT+K!dtp|34gC2>Z#OQT2<3N8`|DN5?e1CA zX2rDy&V%(lbhv;&sk%&WEPgi9s6!{RJPAY#@XbpEz~7`|)4SZQRH(vlpAgFP@XI+Y zlYDf4XFg({syi+|1%VGD)gMHPj|9)#OlSTwylslKs>Hetyu*R-Pjhj5g*W@&WyJuL zL1S3o;gO6?b3GA}xg=JCk_pY^-jA);ghcNm*xe84frtj(9yPG)Q_h&cK4{nn!m0yY zsZwTi+l44SUZcd*3TVZIAK%Fuaj_E|)*QV(Caqu;ULL+v4dN5Qi<;IHpvO@<9eX(x z49CfD3;!gp)zQ+cs^#O!q}3^HgC|p`YRX4DUQ~Je2M=NVCx%htt}Z_A^k@iVL1z{) z+v>8!MlgK{Ws^GaB^6Z|4Bjr?u1$yv*9ZPWuZ8PHg@m;4){u&| zLijRTkJD<@6Id7FTSsQBeF+UEhIM{g&%L#7aJVcj@Lv*Sgzd1k;VZV_Le7>4&VcpI zpP7Q@3!4kLA-GA*rZ-sIc?eigj|G`51!;k~66ah=;4+-OOz($qj!ccew-iNT6lz#^F&LMg;ys+VrJnq=jPS<5w}5?%eE2d?u0LO!@c9c(4+G!XM9)tj{+R^u zCB5e7dGuhhzI*((yGYu7mg}BZy<<`J4n$c(xesM2WrlJH1*GUFh z0b=8690XoMgJSZHkNq``+SrkEPlgH?rcum|kNZ4`Ido1M&nseih>c)Unb3X_M9@bZiVAWIa{#KeJ!i=}&0Zrt>R1sqB9Z zbLa2=an0I934P9Q)*2*pZ~9+q*6R4VH)Rh(m*4VVqgsUJw|a@b{c4TBwnZD|&uP)B z1lmFmcJ5i|AJ?Ka2t5lsd&#z(3(9!^D=k`O^p)g}tZ1HEOd7y7$fwR#DCUToE-NX~xx+%#=+ z^x04E^ip$b%UwU3rj0UM?tAZ{8zI8crNNy+CB4wk?EcR^w13$&I&j+RjriqH#L0Vz zUwR`BzWUQW9VuD&zxdAiP~r6q`avq_rS8tv`(VFqI{hGR^+LR{^5^?Qnn4i)ZS_KY zeAvHfI(1k2JU*RTDRLf=_-1ZGXGlZ<|4_MW5q??NxjIOJ`_x(X00UN_9Q3 aJ)Yj@KNb`|y|1@Wc=eP0zs}Gei~K)=aD6lY delta 6835 zcmZX24_s4K`}lL_F6;t3Y!~hY25b)4*gz(`z(f>en=JmwCa9nwDh)@XEd@m>mGEy` zsddyLDoXjkH?7D&Y5uo-v-j03wX$AQ%l>F#nZ2*2t?!xE?^m(U^W1yReV%)F&h!5{ z*E{AOT08fZdIx{|L@Wu8iT{g_i1=iCQg9Ua?q7Gej_PQfaYvsEsAd4#UdmXjb%H**q9*k zNDkp%&~nH;z$Ivk?Dun zC5@sE(3@Q&71j1BsWpgY(zM}XGKg|Oncy;Y$wYNv!Z^_rJgbY#i@f zTUeVC_)O&llz_hD|10i>cP4WaiKN(N-4?11QO1})1ZAXBW{ZQoIw^!4{k7~wkrtnk zi4ka0e_b`2brB;7!qzZ_J*_6uDb|2T4xnJxVk>#RI1TB@k>VC~lH`;ONZX}gShS4F zX;MHQO!3j5#9Y`sRc0&99C!+N{9HI56DD;Vg!M|CLOM${D2E&>DQQ(*40nvcbLGe( z!owwCQzCf^zp&_vcnWNpK({Biijjdm^3|dM&H>=~CoqGxFNlK!53903TP7Nyq6FlN zTjU~rLk<(IQ1qxZ&&}q=LrP_+SP)pz>chb%Jy>cb;&jjjYvV0RV9EorH61|Uy4Xm& z*`8~+C$wX!!R{EnjO4GkQwYo-e+nS zV&W>6BYaF|>|&95E96@Hf$JzyxJ7q{TcsptOCw886lH|fWMrDFj!G6TlsapjCm8V! zN%0qqauAhx!0HmHQOj$NMkX8#u|7d4m4-DsJ%bDf= zM2r&&>nc_biE1u|;$H!M5$4LZ=jK$CI@eGXOt!n~(LACYo{%_(6D`oyBXyhc6B?+= zdm)(X4&~N{$_%zf+!P5eJeX7r@2B`Bk{cM7lb0+m0K7AX+ee-sJ|SfbzeM}E22O8* z@_PWv(mJeQrAE6jTMG`B7w(2IF^DRf? z!^;fl6Vg)lQEIvl=>&*QIA*L>dTs`yM8RiZ!#`kdnnLw`ocd!n?@pX!5U9V_lG)|i zlG|})Pk9#FM!qe#<*WV~%VeNUwHHS1X?U6`p9e~wMvG6WvsO}Podc|(?r%h9-Dqi* z*ThK~9f~t9OgWw`+-4o+U_DEwjK~TftjK#=;wT02H(-jMyGdRd5g(L~q7&|+YL^mffHnSwoK-Ao4E@K_>%!BlxmELW#W(m-1uUs z%q!Y)oyPJ^gg6t*E{J@L61X zSoEM9ir;0$*3&dJYpXWDc&)K`03a=s(&n=C1<`rm{HAcvhcKhixQ-m4k(0^A!$YZI z2Fy3n_P9h$3=Eg1gRma=C=@D%LTm{sUNJrfA5*(kQJH`i%Kf90JS|t(0d$h2x2BRI>hJ zsQ5L2+O$?uei4jIy+Z-dRFC>>qEZ(TXLOqFcDLAsZ;|Dr=A-XPM1>ZuAXyc9w2zFf zFopjk+a`jo9U6x~L5L}stf|OBx5>GR4Ae$smCMmXq`h)5dXJp1)Uw4I+Z

^91r+ zrKWX}84Lo(dPO1)8p3C(Qn%Q73+Y<9a9KJgJv zE{E-m5}fE|{Ha3wq`(cU+Ljn*r?HY7A_@&joF?7$Y$86AXd5VsF~~26PB~~pnRCZZ z5Aaio=fIbxaN3B-`q5Z&PfK1Nt&6)ctYBi8!DTc|ti;o_+*<|Uc(tixB(yc+9i(Tp zA>=i+Ckcc?r6Z2?9b=XJHki1_6t?#0VeFED_(ucFx0Q2uve}r94}lriT!*NzE7E}% zcy>u9emEI*g7b&xyS9Q$crrH3?KT_LS2L+Eb7L{YUO9nZ=fTc}XVO5955z()-lzu= z=L{Nnl&_~Qu{ofAcBvQBPYASo|C6x4p~)nK{KIXMP4#_x;nM;qs(hF6xAd0^K0-4 zS(x3O;l-IrQlp!@o+-<)8#$AWE^cPOEno9r)&g^wzugwZaK$XZl$02kBNQRKZpN0( zAmeh|E+>wP!n;}SGHbIC`Pc@z-0A-V^5kT}*n#LOIW#s(>KlaLAs>$|K+B1yCLy@J z+}~bqYbS*@S;0HY{X5H3c9I1(rN~3xuhF5G$eo&T=oL~q&WP5LwsG-DBpb#ZKrfT( z@niaWMl(U44PbeaVe<3p1)5>=s$l6V5c2~WTG&L+jqfY#X$mAMJe%AbpCCI|O+$zH zLM_ox7>itF{)D2wlgH!2@ltmtw`fG9-RRD8XX0W=tBXeAq@!+RcupOj6B|8( zdO?%dt=iQ6Yn>LMYuy@85DMYj@T;q-4OLi4|N5zD0ohn@L;CK|>vc%EXdA=s9&UXi zkl*kY7EkGpnKT4Zd5vjEQD0w^a?wWZZS{-P(yrL}f!yayRh7BmO8CK~9!R~(*>de6D+XhZnWTFy$VBSBbI}3vrPqk& z5XECDg00hrfVZVNIOzsyVGX|YG783%$?(VW>e|}zghX06+eQOZ0woO%t@3Q~Nnl24 zbCh`%YH$e9)k!1ifcPb2X@N@pMOmYiiSdCEGXh+myBZNJc3wt+zDN$}YfUS`s40Ko4D( z{93&JZC3pe%WWQPJj=YK-V`rJ_66p2o0>_?OHwdZ*JhS$z_XupPSq40l%uMgpZkbu zfgqsWaqS(c)2L5L_w)~R3!N&^JdB6fn2u4))&zPCa>3R)t!6L3CL6TpNEPC z;5iCRx^@Ch^9PyMX{K>WmRt}wK|?Gs!?mkv@ykasEOxPvZwWeDs-7bowTbF;uVATx zft3$$LgHbdq1-}i>0*DsY6`dLRe>xD_G`xu)GT1b3jcI&U>h1KhDT?BTTWEpM)*SSDy7V!%n{C+a-ItZESF^BNN zhwmeIfbv%aGp5EMP8z-E~AQ_otL1nje&|`AdU&t5avj#cz%j96&Gu?x5PKxd5aKcl5xc$st?3@NeFr%46Gor6HH+c%&O|c2`X@z7 z>MUKU?FM33i%n1*0R@Y>))ehG-sNR9ffuafRGe}ZTIkr9B4+(=eaaTG0&?miwc3?n z6SVju+JEY1$>lYi7(0(fO8ZivP&y@6!-U^v)?y;16AEE^{^x?%(_NrwBd_y{>S zLqB4GgVVOgbDQ!;D&_Yx7gNin{)WVb%SqDWWXg?H-pqN%| zIr!h?nPzqC=gOoW`uMIGCSgj^8S2jUrd?8o=ha${rkiB?AXe-K<)*a5%5X6WxQU_M zY)!>bi0R9OIQ|jN52#`RZOCbZ@+GwLSIR7#CF1L#Ow{I|9@NM9t~eqfQ#8@B8QI>R zpj^q0%?h*8=rsu}Ux3nXq}HHoPNzM5S`GaEqem=npA6>Tp84nVme0G3XFd_zDxL@B zBM+R9nRiJmr6(>wmOlbNYeX85<*+OXL^JSBG&JBZQ*ht6xGO18jrX1uigNL$Y?jf# zvDcD~=s82c8lS8^NE3&p_*md5 zshJ+lr~`|)l|w-pjz1!tB8rt!(hrr(#*oyN$)aU4wX{iAv~|1EGa%6N@&D0PiZ6EY zv8P9aKNC7Lf!SP}DNY09Sr|U41D_!aR`!jWk|jn3HquEOlLdLZ`_R6P>2joz11pn| zlU!bDi@0<;m~Sunvr2Y#XRSJmV)5Q6nm#%w1C4@#ZhV84qfN1YF?ww=iS`*2Uy`1# z44if`b8WXJ9l797{{L_(8FVwc9wl|Y0g~TB$$DP`I!ccC;v<6-{C|Qy!3oqpiMCNq z!rKM;;&58yvf%cUa4#WZ`%B15F17p6dD6UQh+LBD{{sXGIkqORm5Wb?63d(r{~55H z0gn@y+5R=jjcbxUvjY<1a&V(Qi=M4*;olu<~b*QepXPg6HYPR0FN9qX!cHG%8q6sk7mzPHO1Rm| z$I$?4=2TtpQ+aRUx`}mdWvVK!XR>tRWc3^29Yuh{g-EyA-Q@QAEopy)c@}bf?dX{8 zxqepC!%B+SYtLMa8Pr3MyM6(k8|RbEb@M_ty8QP+*hr47D^si(;r|nqujtmU4??oz z!(}eo6ig!hJNqF$8QWQ+uzUS~f(kp?+o_YjS72kw<<8fF^UM7CWn4a4vtd%m_SydX zz-{ji+gOJ1*K@2!v>nD1rk55k*d5LhUMEL$>u<-rN7!WAWU<*J@9F3d6OK1?)??3O$pQedXKXoQp zI95V=_Meyjo#)<0-^^*61peMe=I^Ba7kxVer2Q9rk=|VHyA~k5xjg9aZ>>1ui)*B$ z6-RoJ_TTi42$1&Q>_s}(>08>UF$%{zg9lqqmHcHJ`!@LAY1E|g`!-P8Uw63f-@ec$ zO`?R})0;Fp$&Ty3$|jAL-*H{`ch)^Oe9tz~uk5+ei`4#|?@W_MAJzWdD)jf)RzKD0 zd#_0o>FYaPqllujUL2>+|ISfOIRrZE#c^+;uWhD51FP-(`SaEsB1CFOChd^h& zIPP`&cFxe~g?pVZ^`h;2FF+GkH`90bz1N%G^P5lKtVtAle(OzpbFpuHvnD}8@AI2A z$$ftL?6yxiOB4CW-p}m`67C#FbnleX>z{AE_}}~7zF(W^-s!9t$LqWP_kd&IOuBbE z>&5s0 zsIP?5B=p`mOOq^leUWeNEKO42el##|f5mrw7L^gb;}$yKzuPyUMWaRgeb2RMBKqiG d+2ebR&ZA!0bC-Vmubv|8`(*Fd7R~X9{{zK1b^ZVV diff --git a/orderly/data/test_data/extracted_ord_test_data_trust_labelling/extract_config.json b/orderly/data/test_data/extracted_ord_test_data_trust_labelling/extract_config.json index da2336a2..51c41860 100644 --- a/orderly/data/test_data/extracted_ord_test_data_trust_labelling/extract_config.json +++ b/orderly/data/test_data/extracted_ord_test_data_trust_labelling/extract_config.json @@ -340,6 +340,7 @@ "[Pd on-carbon]": "[Pd]", "[Pd].C": "[Pd]", "[Pd]/C": "[Pd]", + "[Pd]/[C]]": "[Pd]", "[TEA]": "OCCN(CCO)CCO", "[Ti-superoxide]": "O=[O-].[Ti]", "[[Pd].c1ccc(P(c2ccccc2)c2ccccc2)cc1]": "[Pd]c1ccc(P(c2ccccc2)c2ccccc2)cc1", From 79028d7415833f338e5c2350bc76989ac8e32a53 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 14:00:48 +0100 Subject: [PATCH 12/29] fix strict mypy --- orderly/extract/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index 37768638..f383c7f3 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -1008,7 +1008,7 @@ def move_unresolvable_names_to_end_of_list( ) # Add paladium on carbon exception: Delete carbon if Pd exists. Expand exception to other transition metals - def contains_transition_metal(agents): + def contains_transition_metal(agents: AGENTS) -> bool: for agent in agents: if orderly.extract.defaults.has_transition_metal(agent): return True From 0af60735cd80da5e7290c30d8d5f7e89eae2646c Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 14:20:05 +0100 Subject: [PATCH 13/29] test passes --- tests/test_extract.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/test_extract.py b/tests/test_extract.py index de8f6e65..54a8c4e7 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -345,7 +345,8 @@ def test_rxn_string_and_is_mapped( [ "O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)OCc2ccccc2)N1", ], - [ + [ "CO", + "[C]", "[Pd]", ], ["O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)O)N1"], @@ -1101,6 +1102,30 @@ def test_match_yield_with_product( [], True, ], + [ # Test that [C] is removed correctly when [Pd] is present + "ord_dataset-0b70410902ae4139bd5d334881938f69", + 262, + {}, + False, + [ + "O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)OCc2ccccc2)N1", + ], + [ + "[Pd]", + ], + [], + ["CO"], + [], + ["O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)O)N1"], + [99.7], + None, + None, + "C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]", + 'In 150 ml of methanol was dissolved 782 mg of Nα -[(S)-2-azetidinone-4-carbonyl]-L-histidyl-L-proline benzyl ester (12) and the compound (12) was hydrogenated for 2 hours at room temperature using 156 mg of 10% palladium-carbon as a catalyst. When the catalyst was filtered off and the filtrate was concentrated, 620 mg of Nα -[(S)-2-azetidinone-4-carbonyl]-L-histidyl-L-proline (13) was obtained.', + None, + [], + True, + ], [ "ord_dataset-85c00026681b46f89ef8634d2b8618c3", 3948, From e0d32c19b9b33ddf8f2f6d4cf44640c327661992 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 16:18:16 +0100 Subject: [PATCH 14/29] added prelim orderly-cond with 100 min freq --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index f8708ad4..7676e0d2 100644 --- a/Makefile +++ b/Makefile @@ -187,8 +187,8 @@ paper_5 : paper_plot_uspto_no_trust_filtered_min_frequency_of_occurrence_10_100 # 6. clean (final) # NB: I changed this one, min_freq=0, train_size=1 -paper_gen_orderly_cond_prelim: #requires: paper_extract_uspto_no_trust - python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_cond_prelim.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=1 +paper_gen_orderly_cond_prelim_100: #requires: paper_extract_uspto_no_trust + python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_cond_prelim_100.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=1 paper_gen_uspto_no_trust_no_map: #requires: paper_extract_uspto_no_trust python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_no_trust_no_map.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 From d0172d0a1460d439bc9852c251d6c91c7208597a Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 16:19:05 +0100 Subject: [PATCH 15/29] test case [H][H] reactant --- tests/test_extract.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_extract.py b/tests/test_extract.py index 54a8c4e7..87ac9f5b 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1126,6 +1126,32 @@ def test_match_yield_with_product( [], True, ], + [ # Test that [H][H] is correctly classified as a reactant + "ord_dataset-0b70410902ae4139bd5d334881938f69", + 277, + {}, + False, + [ + 'O=C([C@@H]1CCCN1C(=O)OCc1ccccc1)N1CCOCC1', + "[H][H]" + ], + [ + "[Pd]", + ], + [], + ['CCO',], + [], + ['O=C([C@@H]1CCCN1)N1CCOCC1'], + [98.7], + None, + None, + "C(OC([N:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[C:13]([N:15]1[CH2:20][CH2:19][O:18][CH2:17][CH2:16]1)=[O:14])=O)C1C=CC=CC=1.[H][H]>C(O)C.[C].[Pd]>[NH:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[C:13]([N:15]1[CH2:20][CH2:19][O:18][CH2:17][CH2:16]1)=[O:14]", + 'In 100 ml of ethanol was suspended 4.9 g of compound (83) and after adding thereto 250 mg of 10% palladium carbon, the mixture was stirred for 4 hours in a hydrogen stream. After filtering off 10% palladiumcarbon, ethanol was distilled off under reduced pressure from the filtrate to provide 2.8 g of crude N-(L-prolyl)morpholine (84).' +, + None, + [], + True, + ], [ "ord_dataset-85c00026681b46f89ef8634d2b8618c3", 3948, From 714eeb7b7bf435047769bc8a062cb49c4baf751f Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 16:19:35 +0100 Subject: [PATCH 16/29] inspect reactions to find test cases --- notebooks/inspect_orderly_data.ipynb | 87 ++++--- notebooks/orderly_benchmark_stats.ipynb | 332 +++++++++++++++++++++++- 2 files changed, 378 insertions(+), 41 deletions(-) diff --git a/notebooks/inspect_orderly_data.ipynb b/notebooks/inspect_orderly_data.ipynb index 75de1c6d..1722e945 100644 --- a/notebooks/inspect_orderly_data.ipynb +++ b/notebooks/inspect_orderly_data.ipynb @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -102,7 +102,7 @@ "'uspto-grants-1986_09'" ] }, - "execution_count": 15, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -123,7 +123,7 @@ "'/Users/danielwigh/projects_local/ORDerly_project/ORDerly/notebooks'" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -142,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -151,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -161,29 +161,19 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "262 C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13...\n", - "265 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", - "273 C([O:8][C:9](=[O:35])[C@@H:10]1[CH2:14][CH2:13...\n", - "275 C([O:8][C:9](=[O:25])[C@H:10]([CH2:19][C:20]1[...\n", - "276 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", "277 C(OC([N:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[...\n", "279 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", - "281 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", - "285 C(OC([N:11]1[CH2:15][CH2:14][CH2:13][C@H:12]1[...\n", - "368 [ClH:1].C(OC([NH:12][CH:13]([CH2:23][CH2:24][C...\n", - "377 [C:1]([O:4][CH2:5][CH2:6][CH2:7][NH:8]C(OCC1C=...\n", - "379 [ClH:1].[C:2]([O:5][CH2:6][CH2:7][CH2:8][NH:9]...\n", "582 [CH3:1][C:2]1[O:3][C:4](=[O:14])[C:5](=[CH:7][...\n", "Name: rxn_str, dtype: object" ] }, - "execution_count": 20, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -191,75 +181,98 @@ "source": [ "# find df['rxn_str'] which contains [Pd] and [C]\n", "\n", - "filtered_df = df[df['rxn_str'].str.contains('\\[Pd\\]') & df['rxn_str'].str.contains('\\[C\\]')]\n", + "filtered_df = df[df['rxn_str'].str.contains('[H][H]', regex=False) & df['rxn_str'].str.contains('\\[C\\]')]\n", "filtered_df['rxn_str']\n" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]'" + "'C(OC([N:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[C:13]([N:15]1[CH2:20][CH2:19][O:18][CH2:17][CH2:16]1)=[O:14])=O)C1C=CC=CC=1.[H][H]>C(O)C.[C].[Pd]>[NH:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[C:13]([N:15]1[CH2:20][CH2:19][O:18][CH2:17][CH2:16]1)=[O:14]'" ] }, - "execution_count": 22, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.loc[262]['rxn_str']" + "df.loc[277]['rxn_str']" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "rxn = 'C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]'\n", - "react,ag,prod=rxn.split('>')" + "'C(OC([N:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[C:13]([N:15]1[CH2:20][CH2:19][O:18][CH2:17][CH2:16]1)=[O:14])=O)C1C=CC=CC=1.[H][H]>C(O)C.[C].[Pd]>[NH:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[C:13]([N:15]1[CH2:20][CH2:19][O:18][CH2:17][CH2:16]1)=[O:14]'\n" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)O)N1'" + "98.7" ] }, - "execution_count": 28, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.loc[262]['product_000']\n" + "df.loc[277]['yield_000']" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "99.7" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)O)N1" + "df.loc[262]['yield_000']\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "'CO.[C].[Pd]'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ag" + ] }, { "attachments": {}, diff --git a/notebooks/orderly_benchmark_stats.ipynb b/notebooks/orderly_benchmark_stats.ipynb index 0f1099b0..64ea38e7 100644 --- a/notebooks/orderly_benchmark_stats.ipynb +++ b/notebooks/orderly_benchmark_stats.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 88, "metadata": {}, "outputs": [ { @@ -34,7 +34,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 40/40 [00:21<00:00, 1.84it/s]\n" + "100%|██████████| 40/40 [00:23<00:00, 1.69it/s]\n" ] } ], @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 90, "metadata": {}, "outputs": [ { @@ -66,7 +66,7 @@ "82850" ] }, - "execution_count": 38, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -76,6 +76,57 @@ "# So around 10% of reactions contain a transition metal" ] }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "46778 1007 5\n" + ] + } + ], + "source": [ + "molecule = \"Pd\"\n", + "count0=df['agent_000'].str.contains(molecule).sum()\n", + "count1=df['agent_001'].str.contains(molecule).sum()\n", + "count2=df['agent_002'].str.contains(molecule).sum()\n", + "print(count0, count1, count2)\n", + "count_total = count0+count1+count2" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 0 0\n" + ] + } + ], + "source": [ + "molecule = \"[Pd2+]\"\n", + "count0=df['agent_000'].str.contains(molecule, regex=False).sum()\n", + "count1=df['agent_001'].str.contains(molecule, regex=False).sum()\n", + "count2=df['agent_002'].str.contains(molecule, regex=False).sum()\n", + "print(count0, count1, count2)\n", + "count_total = count0+count1+count2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 39, @@ -194,6 +245,88 @@ "trans_metal" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Transition metal on Carbon" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['agent_000', 'agent_001', 'agent_002', 'date_of_experiment',\n", + " 'extracted_from_file', 'grant_date', 'is_mapped', 'procedure_details',\n", + " 'product_000', 'product_001', 'reactant_000', 'reactant_001',\n", + " 'reactant_002', 'rxn_str', 'rxn_time', 'solvent_000', 'solvent_001',\n", + " 'solvent_002', 'temperature', 'yield_000', 'yield_001'],\n", + " dtype='object')" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path = \"/Users/danielwigh/projects_local/ORDerly_project/orderly-benchmarks/orderly_forward_train.parquet\"\n", + "df = pd.read_parquet(path)\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "81 2445 223\n", + "There are 2749 reactions with a carbon agent. This represents 0.00329967639405026 of all rxn\n" + ] + } + ], + "source": [ + "# Check if there are any agents == \"[C]\"\n", + "count0=df['agent_000'].str.contains(\"\\[C\\]\").sum()\n", + "count1=df['agent_001'].str.contains(\"\\[C\\]\").sum()\n", + "count2=df['agent_002'].str.contains(\"\\[C\\]\").sum()\n", + "print(count0, count1, count2)\n", + "count_total = count0+count1+count2\n", + "print(f\"There are {count_total} reactions with a carbon agent. This represents {count_total/len(df)} of all rxn\")" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "632 7244 1862\n", + "There are 9738 reactions with a hydrogen reactant (mislabelled). This represents 0.011688704519920491 of all rxn\n" + ] + } + ], + "source": [ + "# Check if there are any agents == \"[H][H]\"\n", + "count0=df['agent_000'].str.contains(\"\\[H\\]\\[H\\]\").sum()\n", + "count1=df['agent_001'].str.contains(\"\\[H\\]\\[H\\]\").sum()\n", + "count2=df['agent_002'].str.contains(\"\\[H\\]\\[H\\]\").sum()\n", + "print(count0, count1, count2)\n", + "count_total = count0+count1+count2\n", + "print(f\"There are {count_total} reactions with a hydrogen reactant (mislabelled). This represents {count_total/len(df)} of all rxn\")" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -550,6 +683,188 @@ "df['rxn_str']" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ORDerly-cond-prelim " + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "path = '/Users/danielwigh/projects_local/ORDerly_project/ORDerly/data/orderly/datasets_v6/orderly_cond_trial.parquet'\n", + "df = pd.read_parquet(path)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "753196" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 40/40 [00:20<00:00, 2.00it/s]\n" + ] + } + ], + "source": [ + "# Assume reactions only have max 1 transition metal\n", + "transition_metals = [\n", + " 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',\n", + " 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',\n", + " 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',\n", + " 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn'\n", + "]\n", + "trans_metal = {}\n", + "for tm in tqdm(transition_metals):\n", + " count = df['rxn_str'].str.contains(tm).sum()\n", + " trans_metal[tm] = count" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "77310" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(trans_metal.values())\n", + "# So around 10% of reactions contain a transition metal" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "45689" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trans_metal[\"Pd\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['[CH2:1]([O:3][C:4](=[O:36])[C:5]1[CH:10]=[CH:9][C:8]([N:11]2[CH:15]=[C:14]([C:16]3[CH:21]=[CH:20][CH:19]=[CH:18][C:17]=3[O:22]CC3C=CC=CC=3)[C:13]([C:30]#[N:31])=[CH:12]2)=[CH:7][C:6]=1[O:32][CH2:33][O:34][CH3:35])[CH3:2].C(OCC)(=O)C>[C].[Pd].CO>[CH2:1]([O:3][C:4](=[O:36])[C:5]1[CH:10]=[CH:9][C:8]([N:11]2[CH:15]=[C:14]([C:16]3[CH:21]=[CH:20][CH:19]=[CH:18][C:17]=3[OH:22])[C:13]([C:30]#[N:31])=[CH:12]2)=[CH:7][C:6]=1[O:32][CH2:33][O:34][CH3:35])[CH3:2]']\n" + ] + } + ], + "source": [ + "df3 = df[df['reactant_000']=='CCOC(=O)c1ccc(-n2cc(C#N)c(-c3ccccc3OCc3ccccc3)c2)cc1OCOC']\n", + "print(df3['rxn_str'].values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "['[CH2:1]([O:3][C:4](=[O:36])[C:5]1[CH:10]=[CH:9][C:8]([N:11]2[CH:15]=[C:14]([C:16]3[CH:21]=[CH:20][CH:19]=[CH:18][C:17]=3[O:22]CC3C=CC=CC=3)[C:13]([C:30]#[N:31])=[CH:12]2)=[CH:7][C:6]=1[O:32][CH2:33][O:34][CH3:35])[CH3:2].C(OCC)(=O)C>[C].[Pd].CO>[CH2:1]([O:3][C:4](=[O:36])[C:5]1[CH:10]=[CH:9][C:8]([N:11]2[CH:15]=[C:14]([C:16]3[CH:21]=[CH:20][CH:19]=[CH:18][C:17]=3[OH:22])[C:13]([C:30]#[N:31])=[CH:12]2)=[CH:7][C:6]=1[O:32][CH2:33][O:34][CH3:35])[CH3:2]']" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "df2 = df[df['rxn_str'].str.contains(\"O=O\", regex=False, na=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "339 [Cl-:1].[K+].O=O.[C:5]1(=[O:12])[O:11][C:9](=[...\n", + "881 [CH2:1]=[CH:2][CH2:3][CH2:4][CH2:5][CH2:6][CH2...\n", + "937 [C:1]([C:5]1[CH:10]=[CH:9][CH:8]=[CH:7][CH:6]=...\n", + "1015 [C:1]1([OH:7])[CH:6]=[CH:5][CH:4]=[CH:3][CH:2]...\n", + "2850 [CH2:1]([N:3]([CH2:34][CH3:35])[CH2:4][CH2:5][...\n", + " ... \n", + "730862 [CH2:1]([C@@H:4]1[S:9](=[O:11])(=[O:10])[C:8](...\n", + "730882 [CH2:1]([CH:4]1[C@@:9]([CH2:20][F:21])([C:10]2...\n", + "732855 [O:1]=[O+][O-].[Si:4]([O:11][CH:12]([C:14]1[O:...\n", + "735590 O=O.[CH2:3]([N:10]1[CH2:14][C:13]([C:15]2[CH:2...\n", + "737280 O=O.[C:3]([C:7]1[CH:8]=[C:9](O)[C:10](=[CH:12]...\n", + "Name: rxn_str, Length: 348, dtype: object" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2['rxn_str']" + ] + }, { "cell_type": "code", "execution_count": null, @@ -563,6 +878,15 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'ClC1C=CC=C(C(OO)=[O:9])C=1.[CH3:12][S:13][CH2:14][C:15]1[C:24]([C:25]([O:27][CH3:28])=[O:26])=[N+:23]([O-:29])[C:22]2[C:17](=[CH:18][CH:19]=[CH:20][CH:21]=2)[N+:16]=1[O-:30].C(OCC)C>C(Cl)(Cl)Cl>[CH3:12][S:13]([CH2:14][C:15]1[C:24]([C:25]([O:27][CH3:28])=[O:26])=[N+:23]([O-:29])[C:22]2[C:17](=[CH:18][CH:19]=[CH:20][CH:21]=2)[N+:16]=1[O-:30])=[O:9]'" + ] } ], "metadata": { From a25b5af635a5cd2f71d4a5c8788559cbd9bed34b Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Wed, 17 Jan 2024 16:20:08 +0100 Subject: [PATCH 17/29] make black --- tests/test_extract.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/test_extract.py b/tests/test_extract.py index 87ac9f5b..fcd6cb21 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -345,7 +345,8 @@ def test_rxn_string_and_is_mapped( [ "O=C1C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N2CCC[C@H]2C(=O)OCc2ccccc2)N1", ], - [ "CO", + [ + "CO", "[C]", "[Pd]", ], @@ -1121,7 +1122,7 @@ def test_match_yield_with_product( None, None, "C([O:8][C:9](=[O:32])[C@@H:10]1[CH2:14][CH2:13][CH2:12][N:11]1[C:15](=[O:31])[C@H:16]([CH2:25][C:26]1[N:30]=[CH:29][NH:28][CH:27]=1)[NH:17][C:18]([C@H:20]1[NH:23][C:22](=[O:24])[CH2:21]1)=[O:19])C1C=CC=CC=1>CO.[C].[Pd]>[NH:23]1[C@H:20]([C:18]([NH:17][C@H:16]([C:15]([N:11]2[CH2:12][CH2:13][CH2:14][C@H:10]2[C:9]([OH:32])=[O:8])=[O:31])[CH2:25][C:26]2[N:30]=[CH:29][NH:28][CH:27]=2)=[O:19])[CH2:21][C:22]1=[O:24]", - 'In 150 ml of methanol was dissolved 782 mg of Nα -[(S)-2-azetidinone-4-carbonyl]-L-histidyl-L-proline benzyl ester (12) and the compound (12) was hydrogenated for 2 hours at room temperature using 156 mg of 10% palladium-carbon as a catalyst. When the catalyst was filtered off and the filtrate was concentrated, 620 mg of Nα -[(S)-2-azetidinone-4-carbonyl]-L-histidyl-L-proline (13) was obtained.', + "In 150 ml of methanol was dissolved 782 mg of Nα -[(S)-2-azetidinone-4-carbonyl]-L-histidyl-L-proline benzyl ester (12) and the compound (12) was hydrogenated for 2 hours at room temperature using 156 mg of 10% palladium-carbon as a catalyst. When the catalyst was filtered off and the filtrate was concentrated, 620 mg of Nα -[(S)-2-azetidinone-4-carbonyl]-L-histidyl-L-proline (13) was obtained.", None, [], True, @@ -1131,23 +1132,21 @@ def test_match_yield_with_product( 277, {}, False, - [ - 'O=C([C@@H]1CCCN1C(=O)OCc1ccccc1)N1CCOCC1', - "[H][H]" - ], + ["O=C([C@@H]1CCCN1C(=O)OCc1ccccc1)N1CCOCC1", "[H][H]"], [ "[Pd]", ], [], - ['CCO',], + [ + "CCO", + ], [], - ['O=C([C@@H]1CCCN1)N1CCOCC1'], + ["O=C([C@@H]1CCCN1)N1CCOCC1"], [98.7], None, None, "C(OC([N:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[C:13]([N:15]1[CH2:20][CH2:19][O:18][CH2:17][CH2:16]1)=[O:14])=O)C1C=CC=CC=1.[H][H]>C(O)C.[C].[Pd]>[NH:11]1[CH2:23][CH2:22][CH2:21][C@H:12]1[C:13]([N:15]1[CH2:20][CH2:19][O:18][CH2:17][CH2:16]1)=[O:14]", - 'In 100 ml of ethanol was suspended 4.9 g of compound (83) and after adding thereto 250 mg of 10% palladium carbon, the mixture was stirred for 4 hours in a hydrogen stream. After filtering off 10% palladiumcarbon, ethanol was distilled off under reduced pressure from the filtrate to provide 2.8 g of crude N-(L-prolyl)morpholine (84).' -, + "In 100 ml of ethanol was suspended 4.9 g of compound (83) and after adding thereto 250 mg of 10% palladium carbon, the mixture was stirred for 4 hours in a hydrogen stream. After filtering off 10% palladiumcarbon, ethanol was distilled off under reduced pressure from the filtrate to provide 2.8 g of crude N-(L-prolyl)morpholine (84).", None, [], True, From b80ccb6efe2d1192b088f37af39b34a6c54b5a4d Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 19:02:44 +0000 Subject: [PATCH 18/29] H2 always reactant, never an agent --- orderly/extract/extractor.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index f383c7f3..721f723b 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -253,7 +253,7 @@ def extract_info_from_rxn_str( and ( # any(generator) r_clean not in products_from_rxn_without_mapping ) - ) or (r_clean == "[H][H]"): + ): reactants.append(r_clean) else: cleaned_agents.append(r_clean) @@ -285,9 +285,17 @@ def extract_info_from_rxn_str( cleaned_agents = [ a for a in cleaned_agents if a not in reactants and a not in products ] + # We also add an exception for [H][H], this should always be a reactant + # Move [H][H] from agents to reactants + if "[H][H]" in cleaned_agents: + cleaned_agents.remove("[H][H]") + reactants.append("[H][H]") else: reactants = reactants_from_rxn_without_mapping products = products_from_rxn_without_mapping + if "[H][H]" in cleaned_agents: + cleaned_agents.remove("[H][H]") + reactants.append("[H][H]") return ( sorted(list(set(reactants))), From a326340c8ca8707cc2ff9ef548180f44aa76791e Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 19:11:23 +0000 Subject: [PATCH 19/29] del C if charcoal in procedure details --- orderly/extract/extractor.py | 45 +++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index 721f723b..1acd3761 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -473,11 +473,15 @@ def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[RXN_TIME]: else: return None # no time found - @staticmethod + @sta + #NB this needs to be before the removal of [C] and Cticmethod def procedure_details_extractor( rxn: ord_reaction_pb2.Reaction, - ) -> str: # TODO check does it return empty string or none - procedure_details = rxn.notes.procedure_details + ) -> + #NB this needs to be before the removal of [C] and C str: # TODO check does it re + #NB this needs to be before the removal of [C] and Cturn empty string or none + procedure_d + #NB this needs to be before the removal of [C] and Cetails = rxn.notes.procedure_details return str(procedure_details) @staticmethod @@ -1014,6 +1018,12 @@ def move_unresolvable_names_to_end_of_list( catalysts, _ = move_unresolvable_names_to_end_of_list( catalysts, rxn_non_smiles_names_set ) + + #NB this needs to be before the removal of [C] and C + procedure_details = OrdExtractor.procedure_details_extractor(rxn) + date_of_experiment = OrdExtractor.date_of_experiment_extractor(rxn) + rxn_time = OrdExtractor.rxn_time_extractor(rxn) + temperature = OrdExtractor.temperature_extractor(rxn) # Add paladium on carbon exception: Delete carbon if Pd exists. Expand exception to other transition metals def contains_transition_metal(agents: AGENTS) -> bool: @@ -1021,9 +1031,14 @@ def contains_transition_metal(agents: AGENTS) -> bool: if orderly.extract.defaults.has_transition_metal(agent): return True return False + + def contains_charcoal(procedure_details) -> bool: + if "charcoal" in procedure_details.lower(): + return True + return False # Check if any agent contains a transition metal - if contains_transition_metal(agents): + if (contains_transition_metal(agents) or contains_charcoal(procedure_details)): # Remove "[C]" and "C" from agents agents = [a for a in agents if a not in ["[C]", "C"]] @@ -1031,10 +1046,7 @@ def contains_transition_metal(agents: AGENTS) -> bool: _yields = [None] * len(products) yields = _yields - procedure_details = OrdExtractor.procedure_details_extractor(rxn) - date_of_experiment = OrdExtractor.date_of_experiment_extractor(rxn) - rxn_time = OrdExtractor.rxn_time_extractor(rxn) - temperature = OrdExtractor.temperature_extractor(rxn) + if ice_present and ( temperature is None ): # We trust the labelled temperature more, but if there is no labelled temperature, and they added ice, we should set the temperature to 0C @@ -1058,7 +1070,8 @@ def contains_transition_metal(agents: AGENTS) -> bool: yields, temperature, rxn_time, - rxn_str, + + #NB this needs to be before the removal of [C] and Crxn_str, procedure_details, date_of_experiment, is_mapped, @@ -1101,7 +1114,8 @@ def build_rxn_lists( "temperature": [], "rxn_time": [], "product": [], - "yield": [], + " + #NB this needs to be before the removal of [C] and Cyield": [], "procedure_details": [], "date_of_experiment": [], "is_mapped": [], @@ -1129,7 +1143,8 @@ def build_rxn_lists( yields, temperature, rxn_time, - rxn_str, + + #NB this needs to be before the removal of [C] and Crxn_str, procedure_details, date_of_experiment, is_mapped, @@ -1147,7 +1162,9 @@ def build_rxn_lists( rxn_lists["temperature"].append(temperature) rxn_lists["rxn_time"].append(rxn_time) rxn_lists["product"].append(products) - rxn_lists["yield"].append(yields) + rxn_lists[" + #NB this needs to be before the removal of [C] and Cyield"].append(yields) + #NB this needs to be before the removal of [C] and C rxn_lists["procedure_details"].append(procedure_details) rxn_lists["date_of_experiment"].append(date_of_experiment) rxn_lists["is_mapped"].append(is_mapped) @@ -1239,7 +1256,9 @@ def build_full_df( ) ) dfs.append( - OrdExtractor._to_dataframe( + OrdExtractor._to + #NB this needs to be before the removal of [C] and C_dataframe( + #NB this needs to be before the removal of [C] and C data_lists["procedure_details"], base_string=["procedure_details"] ) .fillna("") From f73e2e071d685f02e1435b2201ee90b6cb56f196 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 21:19:28 +0000 Subject: [PATCH 20/29] scramble reactant, prod, solv, but not agent --- orderly/clean/cleaner.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/orderly/clean/cleaner.py b/orderly/clean/cleaner.py index a61c5383..a53dc09f 100644 --- a/orderly/clean/cleaner.py +++ b/orderly/clean/cleaner.py @@ -475,10 +475,12 @@ def _move_none_to_after_data( @staticmethod def _scramble( df: pd.DataFrame, - components: Tuple[str, ...] = ("agent", "solvent", "catalyst", "reagent"), + components: Tuple[str, ...] = ("reactant", "product", "solvent", "catalyst", "reagent"), seed: int = 42, ) -> pd.DataFrame: - """Scrambles the order of the reactants (ie between reactant_001, reactant_002, etc). Ordering of prodcuts, agents, solvents, reagents, and catalysts will also be scrambled. This is done to prevent the model from learning the order of the molecules, which is not important for the reaction prediction task. It only done at the very end because scrambling can be non-deterministic between versions/operating systems, so it would be difficult to debug if done earlier in the pipeline.""" + """Scrambles the order of the reactants (ie between reactant_001, reactant_002, etc). Ordering of products, solvents, reagents, and catalysts will also be scrambled. This is done to prevent the model from learning the order of the molecules, which is not important for the reaction prediction task. It only done at the very end because scrambling can be non-deterministic between versions/operating systems, so it would be difficult to debug if done earlier in the pipeline. + + NB: agents not scrambled by default, since we need their ordering to let transition metals be first.""" list_of_dfs = [] all_component_cols = [] np.random.seed(seed) @@ -823,8 +825,8 @@ def _get_dataframe(self) -> pd.DataFrame: df.reset_index(inplace=True, drop=True) if self.scramble: - LOG.info(f"Scrambling the order of the components") - components = ("agent", "solvent", "reagent", "catalyst") + components = ("reactant", "product", "solvent", "catalyst", "reagent") + LOG.info(f"Scrambling the order of the components: {components=}") df = Cleaner._scramble(df, components) df = Cleaner._move_none_to_after_data(df, components) df = Cleaner._replace_None_with_NA(df, components) @@ -1035,7 +1037,7 @@ def get_matching_indices( "--scramble", type=bool, default=True, - help="If True, the order of the reactants be scrambled (ie between reactant_001, reactant_002, etc). Ordering of prodcuts, agents, solvents, reagents, and catalysts will also be scrambled. Will also scramble the reaction indices. This is done to prevent the model from learning the order of the molecules, which is not important for the reaction prediction task. It only done at the very end because scrambling can be non-deterministic between versions/operating systems, so it would be difficult to debug if done earlier in the pipeline.", + help="If True, the order of the reactants be scrambled (ie between reactant_001, reactant_002, etc). Ordering of prodcuts, agents, solvents, reagents, and catalysts will also be scrambled. Will also scramble the reaction indices. This is done to prevent the model from learning the order of the molecules, which is not important for the reaction prediction task. It only done at the very end because scrambling can be non-deterministic between versions/operating systems, so it would be difficult to debug if done earlier in the pipeline.NB: agents not scrambled by default, since we need their ordering to let transition metals be first.", ) @click.option( "--train_size", From eda76fd3898b13522de38fe2681afd3de8f8bf3b Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 21:19:45 +0000 Subject: [PATCH 21/29] added comment on sorting --- orderly/extract/extractor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index 1acd3761..6d4a6115 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -297,6 +297,7 @@ def extract_info_from_rxn_str( cleaned_agents.remove("[H][H]") reactants.append("[H][H]") + # NB: We need to sort the lists, otherwise the order is non-deterministic, and the tests won't pass. return ( sorted(list(set(reactants))), sorted(list(set(cleaned_agents))), From 0075db0f64d9082bb03b6d93fb9dc9cb2e81ed59 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 21:20:25 +0000 Subject: [PATCH 22/29] added no_min_freq as orderly-cond dataset --- Makefile | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 7676e0d2..946b42de 100644 --- a/Makefile +++ b/Makefile @@ -123,10 +123,11 @@ run_python_310: # 3. Plot histograms of the number of non-empty columns of each type (reactants, products, solvents, agents) # 4. Run a cleaning with decided upon number of columns to keep # 5. Plot histogram showing dataset size as a function of min_frequency_of_occurrence (can probably use the min_frequency code from the cleaner within the plotter) -# 6. Generate the four datasets we need for the paper (split into train and test set) +# 6. Generate the six condition prediction datasets we need for the paper (split into train and test set) # 7. Plot histograms with the occurrence of the most common reactants, products, solvents, agents -# 8. Generate fingerprints for each dataset -# 9. Train & evaluate a model on each dataset +# 8. Generate the final benchmark datasets (split into train and test set) +# 9. Generate fingerprints for each dataset +# 10. Train & evaluate a model on each dataset # 1. Extract @@ -185,11 +186,7 @@ paper_plot_uspto_with_trust_filtered_min_frequency_of_occurrence_100_1000: paper_5 : paper_plot_uspto_no_trust_filtered_min_frequency_of_occurrence_10_100 paper_plot_uspto_no_trust_filtered_min_frequency_of_occurrence_100_1000 paper_plot_uspto_with_trust_filtered_min_frequency_of_occurrence_10_100 paper_plot_uspto_with_trust_filtered_min_frequency_of_occurrence_100_1000 -# 6. clean (final) -# NB: I changed this one, min_freq=0, train_size=1 -paper_gen_orderly_cond_prelim_100: #requires: paper_extract_uspto_no_trust - python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_cond_prelim_100.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=1 - +# 6. ORDerly-condition alternatives cleaning paper_gen_uspto_no_trust_no_map: #requires: paper_extract_uspto_no_trust python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_no_trust_no_map.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 @@ -202,18 +199,38 @@ paper_gen_uspto_with_trust_with_map: #requires: paper_extract_uspto_with_trust paper_gen_uspto_with_trust_no_map: #requires: paper_extract_uspto_with_trust python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_with_trust_no_map.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=100 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=False --scramble=True --train_size=0.9 -paper_6: paper_gen_uspto_no_trust_no_map paper_gen_uspto_no_trust_with_map paper_gen_uspto_with_trust_with_map paper_gen_uspto_with_trust_no_map +paper_gen_uspto_no_trust_no_min_freq: #requires: paper_extract_uspto_no_trust + python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_no_trust_no_min_freq.parquet" --ord_extraction_path="data/orderly/uspto_no_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_no_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=3 --num_cat=0 --num_reag=0 --consistent_yield=False --scramble=True --train_size=0.9 + +paper_gen_uspto_with_trust_no_min_freq: #requires: paper_extract_uspto_with_trust + python -m orderly.clean --output_path="data/orderly/datasets_$(dataset_version)/orderly_with_trust_no_min_freq.parquet" --ord_extraction_path="data/orderly/uspto_with_trust/extracted_ords" --molecules_to_remove_path="data/orderly/uspto_with_trust/all_molecule_names.csv" --min_frequency_of_occurrence=0 --map_rare_molecules_to_other=False --set_unresolved_names_to_none_if_mapped_rxn_str_exists_else_del_rxn=True --remove_rxn_with_unresolved_names=False --set_unresolved_names_to_none=False --num_product=1 --num_reactant=2 --num_solv=2 --num_agent=0 --num_cat=1 --num_reag=2 --consistent_yield=False --scramble=True --train_size=0.9 + + +paper_6: paper_gen_uspto_no_trust_no_map paper_gen_uspto_no_trust_with_map paper_gen_uspto_with_trust_with_map paper_gen_uspto_with_trust_no_map paper_gen_uspto_no_trust_no_min_freq paper_gen_uspto_with_trust_no_min_freq # 7. Plot plot_molecule_popularity_histograms paper_plot_uspto_no_trust_no_map: - python -m orderly.plot --clean_data_path="data/orderly/datasets/orderly_no_trust_no_map_train.parquet" --plot_output_path="data/orderly/plot_no_trust/" --plot_num_rxn_components_bool=False --plot_frequency_of_occurrence_bool=False --plot_molecule_popularity_histograms=True + python -m orderly.plot --clean_data_path="data/orderly/datasets_$(dataset_version)/orderly_no_trust_no_map_train.parquet" --plot_output_path="data/orderly/plot_no_trust/" --plot_num_rxn_components_bool=False --plot_frequency_of_occurrence_bool=False --plot_molecule_popularity_histograms=True paper_plot_uspto_with_trust_no_map: - python -m orderly.plot --clean_data_path="data/orderly/datasets/orderly_with_trust_no_map_train.parquet" --plot_output_path="data/orderly/plot_with_trust/" --plot_num_rxn_components_bool=False --plot_frequency_of_occurrence_bool=False --plot_molecule_popularity_histograms=True + python -m orderly.plot --clean_data_path="data/orderly/datasets_$(dataset_version)/orderly_with_trust_no_map_train.parquet" --plot_output_path="data/orderly/plot_with_trust/" --plot_num_rxn_components_bool=False --plot_frequency_of_occurrence_bool=False --plot_molecule_popularity_histograms=True paper_7 : paper_plot_uspto_no_trust_no_map paper_plot_uspto_with_trust_no_map -# 8. gen fp +# 8. Gen benchmarks (final datasets) + +paper_orderly_condition: + + +paper_orderly_forward: + +paper_orderly_retro: + +paper_orderly_yield: + + + +# 9. gen fp fp_no_trust_no_map_test: python -m orderly.gen_fp --clean_data_folder_path="data/orderly/datasets_$(dataset_version)/orderly_no_trust_no_map_test.parquet" --fp_size=2048 --overwrite=False @@ -240,7 +257,7 @@ paper_8: fp_no_trust_no_map_test fp_no_trust_no_map_train fp_no_trust_with_map_t #Generate datasets for paper paper_get_datasets: paper_1 paper_6 paper_8 -paper_gen_all: paper_1 paper_2 paper_3 paper_4 paper_5 paper_6 paper_8 +paper_gen_all: paper_1 paper_2 paper_3 paper_4 paper_5 paper_6 paper_8 paper_9 # 9. train models #Remember to switch env here (must contain TF, e.g. tf_mac_m1) From e544336d3bf449cbfb451b21e6dc20a9efd7a40b Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 21:22:23 +0000 Subject: [PATCH 23/29] looking for bugs in prelim orderly-condition --- notebooks/dataset_freq.ipynb | 826 ++++++++++++++++++++++++++++++++++- 1 file changed, 824 insertions(+), 2 deletions(-) diff --git a/notebooks/dataset_freq.ipynb b/notebooks/dataset_freq.ipynb index 454c4b3f..d1829454 100644 --- a/notebooks/dataset_freq.ipynb +++ b/notebooks/dataset_freq.ipynb @@ -237,12 +237,834 @@ "df['product_000'].value_counts().shape" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Most common agents in orderly-cond" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "path = \"/Users/dsw46/Projects_local/ORDerly-project/ORDerly/data/orderly/datasets_v6/orderly_no_trust_no_min_freq_train.parquet\"\n", + "df = pd.read_parquet(path)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_indexagent_000agent_001agent_002date_of_experimentextracted_from_filegrant_dateis_mappedprocedure_detailsproduct_000reactant_000reactant_001rxn_strrxn_timesolvent_000solvent_001temperatureyield_000
index
371755864236O=C[O-][Pd][NH4+]NaTord_dataset-b8b98725045d45bdbd73512048f4b47e2009-01-01 00:02:00TrueA solution of 740 mg of 5-benzyloxy-3-[1-tert-...CC(C)(C)OC(=O)n1nc(-c2cc3cc(OCCN4CCOCC4)ccc3n2...CC(C)(C)OC(=O)n1nc(-c2cc3cc(OCCN4CCOCC4)ccc3n2...NULL[C:1]([O:5][C:6]([N:8]1[C:16]2[C:11](=[CH:12][...NaNCCONone25.083.2
111108245483CNC[Pd]NoneNaTord_dataset-5eb2900a93c842ee98f26c305e657b611992-01-01 00:04:00True10 g (0.03 mol) of 2-(2'-hydroxy-3',5'-di-t-bu...CC(C)(C)c1cc(-n2nc3ccccc3n2)c(O)c(C(C)(C)C)c1CC(C)(C)c1cc(-n2nc3ccccc3[n+]2[O-])c(O)c(C(C)(...[H][H][OH:1][C:2]1[C:7]([C:8]([CH3:11])([CH3:10])[CH...NaNCc1ccccc1O50.073.2
67346150389Cl[Pd]NoneNaTord_dataset-b9a9e369e9da4413999591aa08f4c3e31986-01-01 00:11:00True4.36 g. of the resultant compound of Example 5...COc1ccc2c(c1OC)CC1CNCC21COc1ccc2c(c1OC)CC1CN(Cc3ccccc3)CC21NULL[ClH:1].C([N:9]1[CH2:13][CH:12]2[CH2:14][C:15]...NaNCONoneNaNNaN
121435269124[BH4-][Pd][Na+]NaTord_dataset-a20aed058d7b40bc81fdf50bc5b03f971993-01-01 00:06:00True4-(2-hydroxyethoxy)-6,6'-dimethyl-2,2'-bipyrid...Cc1cccc(-c2cc(OCCO)cc(C)n2)n1Cc1cccc(-c2cc(OCCO)cc(C)[n+]2[O-])n1NULL[OH:1][CH2:2][CH2:3][O:4][C:5]1[CH:6]=[C:7]([C...NaNCONoneNaNNaN
2137542319NC(=O)C(=O)O[Pd]NoneNaTord_dataset-4c8627b52d564809adb9b494879c07c01978-01-01 00:07:00TrueA suspension of 10 parts of [[[2-[3-(dimethyla...CCOC(=O)CN(Cc1ccccc1OCCCN(C)C)C(=O)C=Cc1ccc(N)cc1CCOC(=O)CN(Cc1ccccc1OCCCN(C)C)C(=O)C=Cc1ccc([N...[H][H][C:1]([OH:6])(=[O:5])[C:2]([NH2:4])=[O:3].[CH3...NaNCCONoneNaNNaN
.........................................................
7522971767946c1ccc(P(c2ccccc2)c2ccccc2)cc1[Pd]NoneNaTord_dataset-0b32b90cc77b4a3db47ad263e0bbc1a82016-01-01 00:09:00TrueA mixture of dimethyl(2S,3S,5S)-5-[(allyloxy)c...C=CCO[C@H]1C[C@H](C(=O)OC)[C@@H](C(=O)N2CCN(c3...C1CCOC1C=CCOC(=O)O[C@H]1C[C@H](C(=O)OC)[C@@H](C(=O)N2...C(O[C:5]([O:7][C@@H:8]1[CH2:13][N:12]([C:14]([...NaNNoneNoneNaNNaN
179187419812Cl[Pd]NoneNaTord_dataset-94e21e9990034c729ea727e7d2ab0eb01998-01-01 00:12:00TrueTo a solution of 4-amino-3-nitrophenol (25.0 g...Nc1ccc(O)cc1NNc1ccc(O)cc1[N+](=O)[O-]NULL[NH2:1][C:2]1[CH:7]=[CH:6][C:5]([OH:8])=[CH:4]...12.0CONoneNaNNaN
147296329677NN[Pd]NoneNaTord_dataset-2c460e2ef9934444aaf26fec1f75741f1996-01-01 00:05:00TrueTo a solution of 5 mmol of 6,7-dihydro-5-(4-ni...Nc1ccc(C(=O)N2CCc3cccn3-c3ccccc32)cc1O=C(c1ccc([N+](=O)[O-])cc1)N1CCc2cccn2-c2ccccc21NULL[N+:1]([C:4]1[CH:25]=[CH:24][C:7]([C:8]([N:10]...NaNCCONoneNaNNaN
236303560550c1ccc(P(c2ccccc2)c2ccccc2)cc1[Pd]NoneNaTord_dataset-7c28974b7fcf4c9c86d5f2a42ba328a22002-01-01 00:09:00TrueA mixture of 6-(3-bromo-phenyl)-4,4-dimethyl-1...CC1(C)OC(=O)Nc2ccc(-c3cccc(C#C[Si](C)(C)C)c3)cc21C#C[Si](C)(C)CCC1(C)OC(=O)Nc2ccc(-c3cccc(Br)c3)cc21Br[C:2]1[CH:3]=[C:4]([C:8]2[CH:20]=[CH:19][C:1...NaNCCN(CC)CCNone80.091.8
5789961328400[H][H][Pd]NoneNaTord_dataset-cfad8b3f00044bcda60a96b019f098722013-01-01 00:08:00True2-(Trifluoromethyl)imidazo[1,2-a]pyrazine (1.6...FC(F)(F)c1cn2c(n1)CNCC2FC(F)(F)c1cn2ccncc2n1NULL[F:1][C:2]([F:13])([F:12])[C:3]1[N:4]=[C:5]2[C...NaNCONoneNaN93.7
\n", + "

2683 rows × 18 columns

\n", + "
" + ], + "text/plain": [ + " original_index agent_000 agent_001 agent_002 \\\n", + "index \n", + "371755 864236 O=C[O-] [Pd] [NH4+] \n", + "111108 245483 CNC [Pd] None \n", + "67346 150389 Cl [Pd] None \n", + "121435 269124 [BH4-] [Pd] [Na+] \n", + "21375 42319 NC(=O)C(=O)O [Pd] None \n", + "... ... ... ... ... \n", + "752297 1767946 c1ccc(P(c2ccccc2)c2ccccc2)cc1 [Pd] None \n", + "179187 419812 Cl [Pd] None \n", + "147296 329677 NN [Pd] None \n", + "236303 560550 c1ccc(P(c2ccccc2)c2ccccc2)cc1 [Pd] None \n", + "578996 1328400 [H][H] [Pd] None \n", + "\n", + " date_of_experiment extracted_from_file \\\n", + "index \n", + "371755 NaT ord_dataset-b8b98725045d45bdbd73512048f4b47e \n", + "111108 NaT ord_dataset-5eb2900a93c842ee98f26c305e657b61 \n", + "67346 NaT ord_dataset-b9a9e369e9da4413999591aa08f4c3e3 \n", + "121435 NaT ord_dataset-a20aed058d7b40bc81fdf50bc5b03f97 \n", + "21375 NaT ord_dataset-4c8627b52d564809adb9b494879c07c0 \n", + "... ... ... \n", + "752297 NaT ord_dataset-0b32b90cc77b4a3db47ad263e0bbc1a8 \n", + "179187 NaT ord_dataset-94e21e9990034c729ea727e7d2ab0eb0 \n", + "147296 NaT ord_dataset-2c460e2ef9934444aaf26fec1f75741f \n", + "236303 NaT ord_dataset-7c28974b7fcf4c9c86d5f2a42ba328a2 \n", + "578996 NaT ord_dataset-cfad8b3f00044bcda60a96b019f09872 \n", + "\n", + " grant_date is_mapped \\\n", + "index \n", + "371755 2009-01-01 00:02:00 True \n", + "111108 1992-01-01 00:04:00 True \n", + "67346 1986-01-01 00:11:00 True \n", + "121435 1993-01-01 00:06:00 True \n", + "21375 1978-01-01 00:07:00 True \n", + "... ... ... \n", + "752297 2016-01-01 00:09:00 True \n", + "179187 1998-01-01 00:12:00 True \n", + "147296 1996-01-01 00:05:00 True \n", + "236303 2002-01-01 00:09:00 True \n", + "578996 2013-01-01 00:08:00 True \n", + "\n", + " procedure_details \\\n", + "index \n", + "371755 A solution of 740 mg of 5-benzyloxy-3-[1-tert-... \n", + "111108 10 g (0.03 mol) of 2-(2'-hydroxy-3',5'-di-t-bu... \n", + "67346 4.36 g. of the resultant compound of Example 5... \n", + "121435 4-(2-hydroxyethoxy)-6,6'-dimethyl-2,2'-bipyrid... \n", + "21375 A suspension of 10 parts of [[[2-[3-(dimethyla... \n", + "... ... \n", + "752297 A mixture of dimethyl(2S,3S,5S)-5-[(allyloxy)c... \n", + "179187 To a solution of 4-amino-3-nitrophenol (25.0 g... \n", + "147296 To a solution of 5 mmol of 6,7-dihydro-5-(4-ni... \n", + "236303 A mixture of 6-(3-bromo-phenyl)-4,4-dimethyl-1... \n", + "578996 2-(Trifluoromethyl)imidazo[1,2-a]pyrazine (1.6... \n", + "\n", + " product_000 \\\n", + "index \n", + "371755 CC(C)(C)OC(=O)n1nc(-c2cc3cc(OCCN4CCOCC4)ccc3n2... \n", + "111108 CC(C)(C)c1cc(-n2nc3ccccc3n2)c(O)c(C(C)(C)C)c1 \n", + "67346 COc1ccc2c(c1OC)CC1CNCC21 \n", + "121435 Cc1cccc(-c2cc(OCCO)cc(C)n2)n1 \n", + "21375 CCOC(=O)CN(Cc1ccccc1OCCCN(C)C)C(=O)C=Cc1ccc(N)cc1 \n", + "... ... \n", + "752297 C=CCO[C@H]1C[C@H](C(=O)OC)[C@@H](C(=O)N2CCN(c3... \n", + "179187 Nc1ccc(O)cc1N \n", + "147296 Nc1ccc(C(=O)N2CCc3cccn3-c3ccccc32)cc1 \n", + "236303 CC1(C)OC(=O)Nc2ccc(-c3cccc(C#C[Si](C)(C)C)c3)cc21 \n", + "578996 FC(F)(F)c1cn2c(n1)CNCC2 \n", + "\n", + " reactant_000 \\\n", + "index \n", + "371755 CC(C)(C)OC(=O)n1nc(-c2cc3cc(OCCN4CCOCC4)ccc3n2... \n", + "111108 CC(C)(C)c1cc(-n2nc3ccccc3[n+]2[O-])c(O)c(C(C)(... \n", + "67346 COc1ccc2c(c1OC)CC1CN(Cc3ccccc3)CC21 \n", + "121435 Cc1cccc(-c2cc(OCCO)cc(C)[n+]2[O-])n1 \n", + "21375 CCOC(=O)CN(Cc1ccccc1OCCCN(C)C)C(=O)C=Cc1ccc([N... \n", + "... ... \n", + "752297 C1CCOC1 \n", + "179187 Nc1ccc(O)cc1[N+](=O)[O-] \n", + "147296 O=C(c1ccc([N+](=O)[O-])cc1)N1CCc2cccn2-c2ccccc21 \n", + "236303 C#C[Si](C)(C)C \n", + "578996 FC(F)(F)c1cn2ccncc2n1 \n", + "\n", + " reactant_001 \\\n", + "index \n", + "371755 NULL \n", + "111108 [H][H] \n", + "67346 NULL \n", + "121435 NULL \n", + "21375 [H][H] \n", + "... ... \n", + "752297 C=CCOC(=O)O[C@H]1C[C@H](C(=O)OC)[C@@H](C(=O)N2... \n", + "179187 NULL \n", + "147296 NULL \n", + "236303 CC1(C)OC(=O)Nc2ccc(-c3cccc(Br)c3)cc21 \n", + "578996 NULL \n", + "\n", + " rxn_str rxn_time \\\n", + "index \n", + "371755 [C:1]([O:5][C:6]([N:8]1[C:16]2[C:11](=[CH:12][... NaN \n", + "111108 [OH:1][C:2]1[C:7]([C:8]([CH3:11])([CH3:10])[CH... NaN \n", + "67346 [ClH:1].C([N:9]1[CH2:13][CH:12]2[CH2:14][C:15]... NaN \n", + "121435 [OH:1][CH2:2][CH2:3][O:4][C:5]1[CH:6]=[C:7]([C... NaN \n", + "21375 [C:1]([OH:6])(=[O:5])[C:2]([NH2:4])=[O:3].[CH3... NaN \n", + "... ... ... \n", + "752297 C(O[C:5]([O:7][C@@H:8]1[CH2:13][N:12]([C:14]([... NaN \n", + "179187 [NH2:1][C:2]1[CH:7]=[CH:6][C:5]([OH:8])=[CH:4]... 12.0 \n", + "147296 [N+:1]([C:4]1[CH:25]=[CH:24][C:7]([C:8]([N:10]... NaN \n", + "236303 Br[C:2]1[CH:3]=[C:4]([C:8]2[CH:20]=[CH:19][C:1... NaN \n", + "578996 [F:1][C:2]([F:13])([F:12])[C:3]1[N:4]=[C:5]2[C... NaN \n", + "\n", + " solvent_000 solvent_001 temperature yield_000 \n", + "index \n", + "371755 CCO None 25.0 83.2 \n", + "111108 Cc1ccccc1 O 50.0 73.2 \n", + "67346 CO None NaN NaN \n", + "121435 CO None NaN NaN \n", + "21375 CCO None NaN NaN \n", + "... ... ... ... ... \n", + "752297 None None NaN NaN \n", + "179187 CO None NaN NaN \n", + "147296 CCO None NaN NaN \n", + "236303 CCN(CC)CC None 80.0 91.8 \n", + "578996 CO None NaN 93.7 \n", + "\n", + "[2683 rows x 18 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1. Check [Pd]/C not present\n", + "df[df['agent_001'] == \"[Pd]\"]\n", + "# OMG there's a bug, the [Pd] should be first!!\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To a stirred solution of 3-(4-propyl-2-methoxyphenoxy)-2-nitropyridine (1.1 g, 3.8 mmol) in methanol (15 ml) was added anhydrous Ferric chloride (55 mg, 5% by wt) and activated charcoal (55 mg, 5% by wt). The resulting mixture was heated to reflux and hydrazine hydrate (570 mg, 11.45 mmol) was added dropwise. The reaction was allowed to stir under reflux condition overnight, then filtered through celite. The filtrate was concentrated under reduced pressure, taken in ethyl acetate (150 ml). The organic layer was washed with water followed by brine, dried over anhydrous sodium sulfate and concentrated under reduced pressure. The residue was passed through silica column (eluant ethylacetate/pet ether 1:3 to get 900 mg (91.3%) of title compound as a pale yellow solid.\n", + "619281\n" + ] + } + ], + "source": [ + "# Bug 2: Charcol is being written as \"C\" -> this becomes CH4\n", + "print(df[df['agent_001'] == \"C\"]['procedure_details'].iloc[0])\n", + "print(df[df['agent_001'] == \"C\"]['procedure_details'].index[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[CH2:1]([C:4]1[CH:19]=[CH:18][C:7]([O:8][C:9]2[C:10]([N+:15]([O-])=O)=[N:11][CH:12]=[CH:13][CH:14]=2)=[C:6]([O:20][CH3:21])[CH:5]=1)[CH2:2][CH3:3].C.O.NN>CO>[CH3:21][O:20][C:6]1[CH:5]=[C:4]([CH2:1][CH2:2][CH3:3])[CH:19]=[CH:18][C:7]=1[O:8][C:9]1[C:10]([NH2:15])=[N:11][CH:12]=[CH:13][CH:14]=1'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[619281]['rxn_str']" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "'[CH2:1]([C:4]1[CH:19]=[CH:18][C:7]([O:8][C:9]2[C:10]([N+:15]([O-])=O)=[N:11][CH:12]=[CH:13][CH:14]=2)=[C:6]([O:20][CH3:21])[CH:5]=1)[CH2:2][CH3:3].C.O.NN>CO>[CH3:21][O:20][C:6]1[CH:5]=[C:4]([CH2:1][CH2:2][CH3:3])[CH:19]=[CH:18][C:7]=1[O:8][C:9]1[C:10]([NH2:15])=[N:11][CH:12]=[CH:13][CH:14]=1'" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
original_indexagent_000agent_001agent_002date_of_experimentextracted_from_filegrant_dateis_mappedprocedure_detailsproduct_000reactant_000reactant_001rxn_strrxn_timesolvent_000solvent_001temperatureyield_000
index
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [original_index, agent_000, agent_001, agent_002, date_of_experiment, extracted_from_file, grant_date, is_mapped, procedure_details, product_000, reactant_000, reactant_001, rxn_str, rxn_time, solvent_000, solvent_001, temperature, yield_000]\n", + "Index: []" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['procedure_details'] == \"zeolite\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "57" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df[df['agent_002'] == \"C\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'C(O[CH:4](OCC)[CH2:5][NH:6][C:7]1[N:11]([CH3:12])[N:10]=[N:9][N:8]=1)C.[ClH:16].[OH:17][C:18]1[CH:19]=[C:20]([CH:24]=[CH:25][C:26]=1[OH:27])[CH2:21][CH2:22][NH2:23].Cl.C>O.C(O)C>[ClH:16].[CH3:12][N:11]1[C:7]([NH:6][CH2:5][CH:4]2[C:24]3[C:20](=[CH:19][C:18]([OH:17])=[C:26]([OH:27])[CH:25]=3)[CH2:21][CH2:22][NH:23]2)=[N:8][N:9]=[N:10]1'" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "agent_000\n", + "Cl 63102\n", + "[Na+] 56010\n", + "[K+] 25356\n", + "[OH-] 22763\n", + "[Pd] 22616\n", + "O=C([O-])[O-] 21063\n", + "[H-] 12680\n", + "[Li+] 7218\n", + "O=S(=O)(O)O 6866\n", + "[Cl-] 6442\n", + "O=C([O-])O 6336\n", + "[Li]CCCC 5612\n", + "c1ccc(P(c2ccccc2)c2ccccc2)cc1 5063\n", + "Cc1ccc(S(=O)(=O)O)cc1 4410\n", + "CN(C)c1ccncc1 4319\n", + "[BH4-] 3900\n", + "[NH4+] 3668\n", + "[Cs+] 3584\n", + "c1ccc([P](c2ccccc2)(c2ccccc2)[Pd]([P](c2ccccc2)(c2ccccc2)c2ccccc2)([P](c2ccccc2)(c2ccccc2)c2ccccc2)[P](c2ccccc2)(c2ccccc2)c2ccccc2)cc1 3504\n", + "Br 3178\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# most common agents in agent_000\n", + "df['agent_000'].value_counts().head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[C:1]([O:5][C:6]([C:8]1[C:9]([C:14]2[CH:19]=[CH:18][C:17]([CH2:20][N:21]3[C:25]([CH:26]=O)=[C:24]([CH:28]=[CH2:29])[N:23]=[C:22]3[O:30][CH2:31][CH3:32])=[C:16]([F:33])[CH:15]=2)=[CH:10][CH:11]=[CH:12][CH:13]=1)=[O:7])([CH3:4])([CH3:3])[CH3:2].Cl.[NH2:35][OH:36]>N1C=CC=CC=1.O>[C:1]([O:5][C:6]([C:8]1[C:9]([C:14]2[CH:19]=[CH:18][C:17]([CH2:20][N:21]3[C:25]([CH:26]=[N:35][OH:36])=[C:24]([CH:28]=[CH2:29])[N:23]=[C:22]3[O:30][CH2:31][CH3:32])=[C:16]([F:33])[CH:15]=2)=[CH:10][CH:11]=[CH:12][CH:13]=1)=[O:7])([CH3:2])([CH3:4])[CH3:3]'" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['agent_000']==\"Cl\"]['rxn_str'].iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Intermediate (15b) (19.5 g, 43.4 mmol) was dissolved in pyridine (100 mL, 1 μmol). Hydroxylamine hydrochloride (9.0 g, 130 mmol) was added, followed by water (50 mL, 3 mol), and the mixture was stirred at room temperature overnight. Water (100 mL) was then added and the mixture was stirred for 20 minutes. The precipitant was filtered off and dried to yield intermediate (15c) (13.5 g). MS m/z: [M+H+] calcd for C26H28FN2O4, 466.2; found 466.4. 1H-NMR (CDCl3): 9.78 (1H, s), 7.81 (1H, d), 7.48 (2H, m), 7.26 (1H, s), 7.0 (4H, m), 6.20 (1H, d), 5.53 (1H, d), 5.50 (2H, s), 4.55 (2H, q), 1.43 (3H, t), 1.25 (9H, s).'" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['agent_000']==\"Cl\"]['procedure_details'].iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'[C:1]([O:5][C:6]([C:8]1[C:9]([C:14]2[CH:19]=[CH:18][C:17]([CH2:20][N:21]3[C:25]([CH:26]=O)=[C:24]([CH:28]=[CH2:29])[N:23]=[C:22]3[O:30][CH2:31][CH3:32])=[C:16]([F:33])[CH:15]=2)=[CH:10][CH:11]=[CH:12][CH:13]=1)=[O:7])([CH3:4])([CH3:3])[CH3:2].Cl.[NH2:35][OH:36]>N1C=CC=CC=1.O>[C:1]([O:5][C:6]([C:8]1[C:9]([C:14]2[CH:19]=[CH:18][C:17]([CH2:20][N:21]3[C:25]([CH:26]=[N:35][OH:36])=[C:24]([CH:28]=[CH2:29])[N:23]=[C:22]3[O:30][CH2:31][CH3:32])=[C:16]([F:33])[CH:15]=2)=[CH:10][CH:11]=[CH:12][CH:13]=1)=[O:7])([CH3:2])([CH3:4])[CH3:3]'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[Cl:1][C:2]1[CH:3]=[CH:4][C:5]([O:12][CH2:13][C:14]([N:16]2[CH2:21][C@H:20]([CH3:22])[N:19]([CH2:23][C:24]3[CH:29]=[CH:28][C:27]([F:30])=[CH:26][CH:25]=3)[CH2:18][C@H:17]2[CH3:31])=[O:15])=[C:6]([CH:11]=1)[O:7][CH2:8][C:9]#[N:10].[Cl-].[NH4+].[N-:34]=[N+:35]=[N-:36].[Na+]>CN(C)C=O.C(OCC)(=O)C>[Cl:1][C:2]1[CH:3]=[CH:4][C:5]([O:12][CH2:13][C:14]([N:16]2[CH2:21][C@H:20]([CH3:22])[N:19]([CH2:23][C:24]3[CH:25]=[CH:26][C:27]([F:30])=[CH:28][CH:29]=3)[CH2:18][C@H:17]2[CH3:31])=[O:15])=[C:6]([O:7][CH2:8][C:9]2[N:34]=[N:35][NH:36][N:10]=2)[CH:11]=1'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['agent_000']==\"[Cl-]\"]['rxn_str'].iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'[Cl:1][C:2]1[CH:3]=[CH:4][C:5]([O:12][CH2:13][C:14]([N:16]2[CH2:21][C@H:20]([CH3:22])[N:19]([CH2:23][C:24]3[CH:29]=[CH:28][C:27]([F:30])=[CH:26][CH:25]=3)[CH2:18][C@H:17]2[CH3:31])=[O:15])=[C:6]([CH:11]=1)[O:7][CH2:8][C:9]#[N:10].[Cl-].[NH4+].[N-:34]=[N+:35]=[N-:36].[Na+]>CN(C)C=O.C(OCC)(=O)C>[Cl:1][C:2]1[CH:3]=[CH:4][C:5]([O:12][CH2:13][C:14]([N:16]2[CH2:21][C@H:20]([CH3:22])[N:19]([CH2:23][C:24]3[CH:25]=[CH:26][C:27]([F:30])=[CH:28][CH:29]=3)[CH2:18][C@H:17]2[CH3:31])=[O:15])=[C:6]([O:7][CH2:8][C:9]2[N:34]=[N:35][NH:36][N:10]=2)[CH:11]=1'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[CH3:1][N:2]([CH3:13])[C:3]1[CH:8]=[CH:7][CH:6]=[C:5]([N+:9]([O-])=O)[C:4]=1[CH3:12]>CO.[H][H].[Ni]>[CH3:1][N:2]([CH3:13])[C:3]1[C:4]([CH3:12])=[C:5]([CH:6]=[CH:7][CH:8]=1)[NH2:9]'" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check reactions with H2\n", + "df[df['agent_000']==\"[H][H]\"]['rxn_str'].iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NB: atom mapping does not involve H atoms\n", + "'[CH3:1][N:2]([CH3:13])[C:3]1[CH:8]=[CH:7][CH:6]=[C:5]([N+:9]([O-])=O)[C:4]=1[CH3:12]>CO.[H][H].[Ni]>[CH3:1][N:2]([CH3:13])[C:3]1[C:4]([CH3:12])=[C:5]([CH:6]=[CH:7][CH:8]=1)[NH2:9]'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[N:1]([CH:4]([CH:12]([OH:26])[CH:13]([OH:25])[CH:14]([N:22]=[N+]=[N-])[CH2:15][C:16]1[CH:21]=[CH:20][CH:19]=[CH:18][CH:17]=1)[CH2:5][C:6]1[CH:11]=[CH:10][CH:9]=[CH:8][CH:7]=1)=[N+]=[N-].[H][H]>CO.[Pd]>[NH2:1][CH:4]([CH:12]([OH:26])[CH:13]([OH:25])[CH:14]([NH2:22])[CH2:15][C:16]1[CH:21]=[CH:20][CH:19]=[CH:18][CH:17]=1)[CH2:5][C:6]1[CH:11]=[CH:10][CH:9]=[CH:8][CH:7]=1'" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['reactant_000']==\"[H][H]\"]['rxn_str'].iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "index\n", + "98490 [OH-:1].[Na+].[CH2:3]([Sn:7](Cl)(Cl)[CH2:8][CH...\n", + "297681 [Cl].[Br:2][C:3]1[CH:11]=[CH:10][C:6]([C:7]([O...\n", + "719131 [CH:1]1([C:4](=[O:11])[CH2:5][C:6]([O:8][CH2:9...\n", + "Name: rxn_str, dtype: object\n" + ] + } + ], + "source": [ + "# Bug 2: Charcol is being written as \"C\" -> this becomes CH4\n", + "print(df[df['agent_000'] == \"[Cl]\"]['rxn_str'])" + ] } ], "metadata": { @@ -261,7 +1083,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.12" }, "orig_nbformat": 4 }, From ac5fbfacd0ed8e66f2fe7c56feb2348fe0166dfd Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 21:31:39 +0000 Subject: [PATCH 24/29] move benchmark generation --- Makefile | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index 946b42de..6908248d 100644 --- a/Makefile +++ b/Makefile @@ -125,9 +125,13 @@ run_python_310: # 5. Plot histogram showing dataset size as a function of min_frequency_of_occurrence (can probably use the min_frequency code from the cleaner within the plotter) # 6. Generate the six condition prediction datasets we need for the paper (split into train and test set) # 7. Plot histograms with the occurrence of the most common reactants, products, solvents, agents -# 8. Generate the final benchmark datasets (split into train and test set) -# 9. Generate fingerprints for each dataset -# 10. Train & evaluate a model on each dataset +# 8. Generate fingerprints for each dataset +# 9. Train & evaluate a model on each dataset + +### Benchmark generation +# I. Extract data +# II. Clean data + # 1. Extract @@ -217,20 +221,8 @@ paper_plot_uspto_with_trust_no_map: paper_7 : paper_plot_uspto_no_trust_no_map paper_plot_uspto_with_trust_no_map -# 8. Gen benchmarks (final datasets) - -paper_orderly_condition: - - -paper_orderly_forward: - -paper_orderly_retro: - -paper_orderly_yield: - - -# 9. gen fp +# 8. gen fp fp_no_trust_no_map_test: python -m orderly.gen_fp --clean_data_folder_path="data/orderly/datasets_$(dataset_version)/orderly_no_trust_no_map_test.parquet" --fp_size=2048 --overwrite=False @@ -255,9 +247,9 @@ fp_with_trust_no_map_train: paper_8: fp_no_trust_no_map_test fp_no_trust_no_map_train fp_no_trust_with_map_test fp_no_trust_with_map_train fp_with_trust_with_map_test fp_with_trust_with_map_train fp_with_trust_no_map_test fp_with_trust_no_map_train #Generate datasets for paper -paper_get_datasets: paper_1 paper_6 paper_8 +paper_get_datasets: paper_1 paper_6 -paper_gen_all: paper_1 paper_2 paper_3 paper_4 paper_5 paper_6 paper_8 paper_9 +paper_gen_all: paper_1 paper_2 paper_3 paper_4 paper_5 paper_6 paper_8 # 9. train models #Remember to switch env here (must contain TF, e.g. tf_mac_m1) @@ -287,6 +279,9 @@ with_trust_no_map_train_20: with_trust_with_map_train_20: python -m condition_prediction --train_data_path="data/orderly/datasets_$(dataset_version)/orderly_with_trust_with_map_train.parquet" --test_data_path="data/orderly/datasets_$(dataset_version)/orderly_with_trust_with_map_test.parquet" --output_folder_path="models/with_trust_with_map_20" --train_fraction=0.2 --train_val_split=0.8 --overwrite=False --epochs=20 --evaluate_on_test_data=True --early_stopping_patience=5 --wandb_entity=$(WANDB_ENTITY) +################################################ +# Generate ORDerly benchmarks +################################################ # Sweeps RANDOM_SEEDS = 12345 54321 98765 From 9611e08f9f6cb9a2b1ebf6d96d8faf9aaa7ac16c Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 21:43:20 +0000 Subject: [PATCH 25/29] make black --- orderly/clean/cleaner.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/orderly/clean/cleaner.py b/orderly/clean/cleaner.py index a53dc09f..f84d83bf 100644 --- a/orderly/clean/cleaner.py +++ b/orderly/clean/cleaner.py @@ -475,12 +475,19 @@ def _move_none_to_after_data( @staticmethod def _scramble( df: pd.DataFrame, - components: Tuple[str, ...] = ("reactant", "product", "solvent", "catalyst", "reagent"), + components: Tuple[str, ...] = ( + "reactant", + "product", + "solvent", + "catalyst", + "reagent", + ), seed: int = 42, ) -> pd.DataFrame: """Scrambles the order of the reactants (ie between reactant_001, reactant_002, etc). Ordering of products, solvents, reagents, and catalysts will also be scrambled. This is done to prevent the model from learning the order of the molecules, which is not important for the reaction prediction task. It only done at the very end because scrambling can be non-deterministic between versions/operating systems, so it would be difficult to debug if done earlier in the pipeline. - - NB: agents not scrambled by default, since we need their ordering to let transition metals be first.""" + + NB: agents not scrambled by default, since we need their ordering to let transition metals be first. + """ list_of_dfs = [] all_component_cols = [] np.random.seed(seed) From e7b8516495541756e2413ce9af34578a83276b59 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 21:48:13 +0000 Subject: [PATCH 26/29] restore extractor from random comment insertions --- orderly/extract/extractor.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index 6d4a6115..a364886d 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -474,15 +474,11 @@ def rxn_time_extractor(rxn: ord_reaction_pb2.Reaction) -> Optional[RXN_TIME]: else: return None # no time found - @sta - #NB this needs to be before the removal of [C] and Cticmethod + @staticmethod def procedure_details_extractor( rxn: ord_reaction_pb2.Reaction, - ) -> - #NB this needs to be before the removal of [C] and C str: # TODO check does it re - #NB this needs to be before the removal of [C] and Cturn empty string or none - procedure_d - #NB this needs to be before the removal of [C] and Cetails = rxn.notes.procedure_details + ) -> str: # TODO check does it return empty string or none + procedure_details = rxn.notes.procedure_details return str(procedure_details) @staticmethod @@ -1071,8 +1067,7 @@ def contains_charcoal(procedure_details) -> bool: yields, temperature, rxn_time, - - #NB this needs to be before the removal of [C] and Crxn_str, + rxn_str, procedure_details, date_of_experiment, is_mapped, @@ -1115,8 +1110,7 @@ def build_rxn_lists( "temperature": [], "rxn_time": [], "product": [], - " - #NB this needs to be before the removal of [C] and Cyield": [], + "yield": [], "procedure_details": [], "date_of_experiment": [], "is_mapped": [], @@ -1144,8 +1138,7 @@ def build_rxn_lists( yields, temperature, rxn_time, - - #NB this needs to be before the removal of [C] and Crxn_str, + rxn_str, procedure_details, date_of_experiment, is_mapped, @@ -1163,9 +1156,7 @@ def build_rxn_lists( rxn_lists["temperature"].append(temperature) rxn_lists["rxn_time"].append(rxn_time) rxn_lists["product"].append(products) - rxn_lists[" - #NB this needs to be before the removal of [C] and Cyield"].append(yields) - #NB this needs to be before the removal of [C] and C + rxn_lists["yield"].append(yields) rxn_lists["procedure_details"].append(procedure_details) rxn_lists["date_of_experiment"].append(date_of_experiment) rxn_lists["is_mapped"].append(is_mapped) @@ -1257,9 +1248,7 @@ def build_full_df( ) ) dfs.append( - OrdExtractor._to - #NB this needs to be before the removal of [C] and C_dataframe( - #NB this needs to be before the removal of [C] and C + OrdExtractor._to_dataframe( data_lists["procedure_details"], base_string=["procedure_details"] ) .fillna("") From 776ed921a4eb7b4180ada88f394012e6e2093030 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 21:48:33 +0000 Subject: [PATCH 27/29] make black --- orderly/extract/extractor.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index a364886d..a205e83c 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -248,11 +248,10 @@ def extract_info_from_rxn_str( # check reactant is mapped and also that it's not in the products mol = rdkit_Chem.MolFromSmiles(r_map) if mol != None: - if ( - any(atom.HasProp("molAtomMapNumber") for atom in mol.GetAtoms()) - and ( # any(generator) - r_clean not in products_from_rxn_without_mapping - ) + if any( + atom.HasProp("molAtomMapNumber") for atom in mol.GetAtoms() + ) and ( # any(generator) + r_clean not in products_from_rxn_without_mapping ): reactants.append(r_clean) else: @@ -1015,8 +1014,8 @@ def move_unresolvable_names_to_end_of_list( catalysts, _ = move_unresolvable_names_to_end_of_list( catalysts, rxn_non_smiles_names_set ) - - #NB this needs to be before the removal of [C] and C + + # NB this needs to be before the removal of [C] and C procedure_details = OrdExtractor.procedure_details_extractor(rxn) date_of_experiment = OrdExtractor.date_of_experiment_extractor(rxn) rxn_time = OrdExtractor.rxn_time_extractor(rxn) @@ -1028,14 +1027,14 @@ def contains_transition_metal(agents: AGENTS) -> bool: if orderly.extract.defaults.has_transition_metal(agent): return True return False - + def contains_charcoal(procedure_details) -> bool: if "charcoal" in procedure_details.lower(): return True return False # Check if any agent contains a transition metal - if (contains_transition_metal(agents) or contains_charcoal(procedure_details)): + if contains_transition_metal(agents) or contains_charcoal(procedure_details): # Remove "[C]" and "C" from agents agents = [a for a in agents if a not in ["[C]", "C"]] @@ -1043,7 +1042,6 @@ def contains_charcoal(procedure_details) -> bool: _yields = [None] * len(products) yields = _yields - if ice_present and ( temperature is None ): # We trust the labelled temperature more, but if there is no labelled temperature, and they added ice, we should set the temperature to 0C From ac67db133c2fb45c15cd5e5340125f5a8f3d64e3 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 22:09:52 +0000 Subject: [PATCH 28/29] regen test data --- .../uspto-grants-1976_01.parquet | Bin 835352 -> 835298 bytes .../uspto-grants-1978_12.parquet | Bin 379848 -> 379809 bytes .../uspto-grants-1979_09.parquet | Bin 319631 -> 319620 bytes .../uspto-grants-1980_08.parquet | Bin 403061 -> 403046 bytes .../uspto-grants-1982_08.parquet | Bin 580054 -> 580010 bytes .../uspto-grants-1986_09.parquet | Bin 453362 -> 453362 bytes .../uspto-grants-1995_11.parquet | Bin 678280 -> 678277 bytes .../uspto-grants-2001_07.parquet | Bin 2067818 -> 2067777 bytes 8 files changed, 0 insertions(+), 0 deletions(-) diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1976_01.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1976_01.parquet index 90e2f09e9cc325aaaecde7a5ffc3cda563dace30..28aaa77c68c644fb0bb71e46fd0bb471f5085975 100644 GIT binary patch delta 9024 zcmZvB3wRS%+W$FirkqJ9r^z%kZPF%f(k4xM&$JDtkkDjkX)7d!Lb;RzAyDoG%N>+j zRs>d%#X$xtC`C~V0!6MC7AXn>LgnfzP@pO*peP6~uA2Q$m;d9pAN-%^J(-!jXU?2? zZ@>3_XKhIM@(&@d+%wR@0dDQ8aPHQ$bnaXNxrpEp0H=;ijsg5mM;lfDf(Mx9<0K_= z`R(|tsIqb(M1$HaP#X_~A|R*$y@UMGbxfoqs81E;V+9Aa&(oh#n&t`aE=Fc=pzzl5 zyJ;kfq9-fXF4{!9O)Sb(+x((ENJqRjofgJ}MitOc)i}EWeoQoQ@E(Toiw>8SH=+LT z0?{6(@CBA#BtIu&@B-vA?b#wM1Do5)%B2sS$E5WUCd~r2pNh_RtHA&}g@CUhRcen& ztiP@hM1CFa@r6o+nLyO2;2ULoXc)6>(WFX%rAVt8Q#p)Sm*%8$R`m$XF^mP=nMqCxJ78sIoO6a|9_CoXK0B)6MznEYT&7 zXgd48TSx^pDqzaymnp}wu#f&NL~bu;K}`v;XqkR`BrMr!fei4oXAwOpHqfG%Sr7}onm_P08Ydwj;$6{v zK*~UI_DxFRVb^|6W&k>;poS`l0yv-s6*|f^nSFr~p6ODOrzd|$dKz&sTv~z>ORqCR zZ(#4Ku~A4-lEJNuOKF})7NhA$OW zPdP^eVJbNKqp1Tsq)T)HoG1N@j)ym;7-dSVkVp@tBXk*)l0m`Ol%Rt6f_B_;{1>TG zIYhsM#Uqo+1D>N9(eCmaD-snBq5Wlm-;h35X2W*rp|WRGk0f@eang)1SdNDw{tw9; z*|X$3qlE^dswbW1%%<)KGadtZk~NrH#_h zPG%bz;G_QCypJ^*%3=VwNbi%5Mmjw&#^Zy$!sR=6Io!0OhYyIUcashLjR&|!2`ok3 z`8hFa;$}%cxB$2ef5nGmh&PE2$`1UQNr32lt<;MtPLvE(Z+$VbFA>(@9LBjfOp`1$ zKw<*a6yU1iq)kju_*dyFV^P#6V@~R(vf~Bump(99f*!BFTwNp)8%ga?TZhn8Jm{z2_A>4fX|~f5 z#@*2{w(Nw|2j^JgllV83MK7im5F zm(2zXqrsTP;Ft~wlff8bbMKnW+-0FV@YF6|5B&Hdb{+zh;EAPZ)Nf+iva#T)0tED< zGB%`B%Wu?o#iSL#F%G*|(2r+^zYbrX9}e4z!!2u9l$L6+a31J4VBxm3Dk?~;N&LXs z2v3myh#D3-FvW0!TCMPm*Kd>p>S4+U>9|hC59XRiNgt_gktGh@$mZqxMkOqhepU~S zSmgjVR+a{6s$iY8O*0~CVd_Y za`1IAKA0N})&_&Q6vOtF9%!Gn%q;ZvEH1S=Z3@yk^_XD%P15`~Y2|S`NlfW1{VhqvwXxL6d(q4t=AD4wc zq)Eg&(XFU}=|4VAP^oF2jsUvA->KePzXA3P;>Nm{7|j1iTFVjrZuYi zG%zDG4Oo8lEGcTM`+=(YmIU#luQ)#yO&uh*l)6_*JzA_sL#Es{bXg3gG$Cpd!S>Nj$VPU($zl_UQ^bim=dD;M@v;VBN?6>I5F&~6e7xwLj0WEcQIpj1tqE3Mkys%;+tLb?A- zFgNez4TpSM(3lQ=JwcroZ0^VYs^n6r=*@K}qJ#yY&H#!Ux?VxKc@C)HWnMp*p+9$GbWaDy2+3;BtiMPrd_MlP2H1*X*#kNtde`Ga!-0_91xJ6C0hd%v1Iu+p_K)Ys z0rBn_yplyE;IopFEC&Bh6q{o&n5$=}#=*4C_wW=PPf)^6G6?_T$?Wr5fNxyJN8_Lg z#+SY#90qi>cU+v1+y!WCMRXYcL%L|rW}~^fSJ9y$Xv_wUDS&FlZ%Q#qUe#Rfz$^ec zBqoW!beC$9GQ;9k(wd~y)K{VuLU7MH5~Yepuxpi;ZmGpO{HZ$i1}!WG+^e`F7XPkR zREhZS(ygQc$zzk7e`*^Q_?`WWnhBCushwt7oM)Vi3<&>|Cb6nG$!L~FCG#*!nw{*a z&$SA5T^oe^nJRyV;B*0sZ%YS#=>2Sb3K2X`HBgMg?&DWs{R17{4n*U5otcY-iQ1BC zp+a<+9BQ$uVq$d>t(aoB?+ATjvwmVLN+GF{5KVE_xsB2A4tdR#qTgzYfvLLK97g2< zgK3)DB|5~4L_1Lce<%j|1X15)gS{;4YDNJ;56aesmywh&N1EPC8-9gA$#e^cs#s(z zwzDPD3QK}3fo2aWg{Qid*LEt|7j)Ft)^@={XsMo%d#PUYi?FMU%l&{SNAq7WKs;9+ zY06NDzRr5-drNV*mubFTg$1dH)mYCau=^42h^A1SVyRWAlvqSq zBt3{Pps83zzlqL`VBu-V#v>g?^{IP5<*(f!W;oR*{u z89~S5Ur^ANfgOpUmX#fN1*gvts6n`2l`;gN~+WM;57tVc-sdO%=PGr+%yb)loAZSoYRMNL-3dImF@~18J+9%9Z-2q2sNP92&e__8Gz#Iw6S?+l7Ic@ z5%}%=%oX{#FS@aqt+e77O+bWeNg|A|C_CrkRw>PvoO#}4TxP=Fm- zz{ht8B);E?Q3Qj$5bYJ9X`#aJVxN_swIxPV##n)5dj5kk^T5RdDGka=n zBsukE;q@jU;B8vV>*QhjSET#48HP?{WgVbDWw*sN6{|^BFWPIr$_G(_dMb;(B0X<+ zDTl_t_YJ6d4&YtVC-!{i8{OF{p>!7^U%C{PYK=;aL*kTjoEfK40Ff<|`lLP`-)bj1 z;=38}^N|ce96N4N;+L~Kqd}E^sB|_p-Vu=y_`*N|$7in^I$!2RtMEG@I6#Mt&)Q5q zgkoTM;nMDiM3U)%3YSP?&ppYQ77K?<-O~mr-gB{S((<&Kitnv>nWRWB)8oE6C7dZJ z!thWB{??&&iK4g8-nk0vqzUPhNCI>|-DBt|M;SsjzMveO#18QR+pvlJ?8Mm{8KD{A zTaqJ#Ae~L1G z0YQC1DeLD*=rOJ-p{~}ARBOz5nh!VA_*Opg(p&jZEi`pu60sRt)jvbw5JR&{3H@tU zsnjL7qJcLB0p||0kx1s&*&?4}N>J~{aW$BUzz{;l-prmsQ5cTC{ zH%o3*cJgdrJFwmC^Fz)bL!cR-kxpi1!@bgPS-s&psb}`9@Uvb2$bJYpj=r}J3}7`# z=ub)j4H{?w0w*;U@zf#$+p{H@lLZe+jvQ-h=t0E%2N9tvNN7_P1WbTflcOuu?x9$T?4Ie_9*PeQx~koMQhel2hn*HMsJi|6Y#bo(qIp!~ef@r!}0& zvqGcpN)xI*e^GbU8~%2Gn67WT8?@&FxC2%7o485EQrX^tZ2Nra?m(ATmAiSs)pQKP z*rwP}88xYlV$-A%z4Nrs#D$)T`|_DMpkf;gK;#!1NU5;7_o9fWl8C?TmNE)!;4{+d z!hz7Z>)XO{FsfCs6m-eELm)J}<1ba-B~@NXMi?34WGKj>$cP}L+rBC<${nu#Y$?UP z)(QXnjkoT5V?AP&^?Z|3r);1Cx$LOO!B5xp0qhVJzN;cpifzN4ZP_+r=QT zfirkM)Hg?wT*0#)7ex{OB2|K|vz)V$d6qm*O{_wp9f#cc0Fh|mn*-q5Sr<)BYSCx& z*^6Eb1RAy*XKBF6My z9)#yJdx8^7y}bbNSGoXRp>fXz8ZG@(!(ehkjWwsSe5qD*fbGuU9c+OB@f<#Iij;qD zAFwo1K4%%|Y=LFPrvdC&Spqy;9B<%PX;MGd&u|1?%4eow`)>mGJ za0;F1c@c^Ja?|&0-{{UTsAfnf!?xlXeh;mv4Pr*=&H|nVvuYadP#NQ*u!A;@^fb14 z_+yA)tvtOR&(MP4&D^FT5Z}iRGuPeYpX$yjM~6Q#AzuT!F#2W9+rAXMQ4#ebOwrT zO2NO-OfV-XXokaCv>~;TTm>B&&X*|@jq6$dHe(3^zc;897Q_EQfoZD#EE-GQ(N`<= z8&KX4O%v5s;uHdjyQ58C>ukCX=wZhrt~>r#Z}dh&qX_|jQ4jqxi3=xuSuTO&=j&A`n`_iUuja7<5>OS6AXZIK($o!O zUhVvJfM1vQHlqyG5lw$aU1tOG?Pin|yIa|XT!szHt`qsLiN$lGp%FOnYT3F4S?kSu z_NI3k&xN+h|FeFO%bfo1M7aZP8m?JQ7U{n8M%B)zvm)$sV$4m zY4Wr0q8R3y9&c3{))&tNn36ZXi_Eb>LuWiwVXt-I(`FL2#0t+n%owZ}Rz8gqfi1N{0hx+tC9XQe7t1K~UfEtAZf7+eucvS$hi;bBBCfweud z0;jM}K0p?Rm*j7^p>*p;B3`I429)OZM4B^Q)kmq#K@+WPoCk>8;CG@(#-;C&du&HV zMfXxy>{P9RZ#LDGcHnOqx&S}038d9Qz<+MT9savan9RkHW}v1S`1whx!bWJH+=TJ4 z1iXh9N&uWf!tM+5hueu8;P@MN?0s;#{E#dRr^;>uVT$XS{5^TY4%8DmCOY*_@&{L*4 zZtRL)R)$+68>+h!*0JgMbET)dVv2~fqUg`@cG8VLl;CAR};GI3rc{z_i33njw=^2S{#Mmyck4^%3Af(nBQoJUGXQLmFf*@f!h z2-&?GO@ia)=G|zU;rH$XoOq_1V)%We+~v3S0<}C%TTbE#jIHvFJ*YptBDd{9`O&4l z`_BNi8qJB{@IKTqoETw@%bYKgBU<6F#cOhmLz3|;@_@Z)P+VD_Cl0TT#J4g@ zx0GLBs25Y0^$q+)9Cna=crUWV&gqG&gSG74Sj}^$mVfC$AclUwVTpd0{KsCD3VGSK z4~^CQy$}!AOd{Yq=#`bJLsfXEykQ@T&$?KG8E?@-n93KrNDZ(s9=>7dPPBJ55`UpX zJ~4;=SxY}d_tDq)YpB7D{M|kxnXR&FKbp&D_x@o5JI`D&FpS-0rZ2D~)J67g4owig=n7N%8rJz6Fpb!hC&#y>lY^-~{D-XSD|#vR#j?HR zf&-{KbjfuGh^>AuuRB1r(8r&LDsEHg50dj3M3hOaxF>amI-O z`~%VzCvjdMMVsd6FE~TZBiA&`DIcJ0m?01S015C}`Na>A3BD+A`vC1_L_f8OG&uNm zxwyAH?J(lfs>|6Z4RDpAGSh3o-*(Hmu4a({;9hPb=!56-{CFDvf~|<4UzEJ#FcE%- zeBm&f3eU>kHdK*1Z(waAV8i1BQ}6{9HQ02kr~n@vYwoXf?+Dh&VWb3a`-iF$lpCw$ z586;Y{7vqC1kHt_y#EN1!)^JyBgk$x4=FtlMDxZ-VM#U4N(8ysVZamA zY@wXVu_bc!aWsZJZT4}L4>!v@j-x!7E8jRy?7DQH`UFzKZeypK9Z~Eu*>n<>(e_3B zWHrgF2g@^0qMLAvJn|FN3}4uXPN6jrX3DRhMw7YR1r!0d(pmUh2G2`cEVA0ke5;($ zjt)Vue6=0& z)!w53#(`|cmr?D)tu$~!WoSDfR7-x>1Lu0cCUK!f{GU{?h(DsGLlSgKFysS|=#mfc z<8z7SvtuIpLX0OT00F9XQJ1n=0@QD*%t?Ho(7mufsq|iGV%Q_PL{9z*oc{!E_leFP zs6+Q72yMR~(M~};L5IVmF{v%>^v|RaP{~hUMkVlw+A|E zHE~shPJv0M06r6yn$-8s_c8qUF`MU^^Hf4BU|&gRd#d=2UWH!_iN%DxqCDX$O0R0k zWamUHY#B|LL6AL_ZeYMPzCW`)Iz2QY>=9j>5QcxmRy$Yl&#caz0->+7gbL-s;ylQ2 z8gglfDzp+5uLSM))qnQ4*O2luAGgRqUPU}Jp%hh#g^+`3`YKs-4Vk!>?!X_K2i)_3 z@IqI=HN}2JwFWiUn>ctd|=Ayk-%?0+mjtBS6?StO?P?SbyNUn$rr98Z_JuuT8Bv1 z7eg-%`;*w0-NG2%XG(eAR!QligdHc*ECQ!ud5_ z%KO*3;>YLwO`fxrv16vpI~kLc}zT~{LgMt z-Q3zyGbc_R_h_eQhVkd($R0NSEcbbrv4-D!DJT!v#zf1{z00V1a`pK0pa1&*O?aP7 z;K6)dj~?*36wWV=Cr@}@9{c#z<-7hfRZiPRmRi2+34Y_rE`BoCc=8F> zo7d!J+qx{~n!J4*V-5TMuKd|H#?*EEwd?xcO4+oXT$5v;?d1B*DtXFwVo_&SjeFcM N`Stsnwlj0o{{tf28)*Ol delta 9111 zcmZ{K3tSV|w*KCb!9B@jlMKl~0tqCLknjj(kccSJWCQ^F0bU7APvR6gD)QTLPb{AV=%IWZ2<3oK65xv?`tll3o&spAMJXFOg?B*vJpC!T zX&C>?`<%v;CprjZ~Bo~#7BU=!>%p*T}z^9%MM9kX#Z%})g_N}!*icJ%?=q&VQ@ z+)O6p7o0_$5shdAf;~#+3pAf2AKe;UkBUtDUg4X;){|uBvOBJKL@CCku4l`Xbb(t1 zM%gI@e1+*^cT7sdRT(dEYjFQdN&z~6pjN^y<$Gut-`p^x65y2zYci(t7@;oHMdhvP zA9&JOjmJ)-GJS8!Th7DUbO3{!e`dAW{77K=_!Afh`gR9_^VnDv4DwtC20S~CDNIcO zydxJk@3?AyRt=~&Vg)U-RH{m9GHJfLYp+b0h2zD?`lK`{6!!k?1Pyi}&zy-nlzdUd zLcO830#%MsBur>Md%(?SfLdbmT<%$U4GV|pw;(dSgax&wz@m2TqfT`ezOLW}pmy+0%#~(-3IazsQ0F=vDuY zuh2LZ0RgwiaT9g$EVZzD1FZsc%eFGY6Pdj0B;eP};KIVc%lV%ocllm6pz{iAslr%* zlW0((qf9f|zcTz&5!ZPJaJN{U9997zMOpNWZ+f-XR9W`58DN#)9t=7s>p)8baP!2V zi&a&^luW)jzpxfkOWju)eh{z^P}?XZE6rwG6}!Kc`Ry(!BVR*hvA-*xa=Sx7t>w0h zr;q`v#9K(8l*wvGLYnax2}Ldh;Uk<=Ei3ZZctDYUnP{T5aGBVT9;EqHqi->4_@FK5 zw9}^`1U5WbTtZu8$0zHK7>8hBhwmy+u<7DkWQarj2c2x3r=jiYiE5-x*3y)otT@*( zP^JLBZVJr8{}fqydO}VLJ(`ZuWzR?hg`ZKp65_L(nq~NDag_Wa{SFparIFuxnq~xh zk>5BnMdsvBuLgLN__jP3o)o{64~TW8vSW=i9*=?}@i@f&A{HqIa5sz=8VJe(bfznp z>N?~83^gyUbco9p2JPc%Y$HQVd#|mNH5$s}fp)Qen0Q3t@;sk}4{)*~-?_`trWgAA zfRJ%J&A{FG0oTfbrMMqAH(o{JE!77Xf+E8b{AN6fD#1zFf!~n|i1Ys@_G3!mN^ucm zgsixM833OXFEJL`Q)!qLqm*_WG}n4Tm)rQR4Fn78#s11FeRlTNO|p>F3~@rx8LSGG zR=Zb0Hj~cyQrx9<^a0+RS#=7PcwHF}w~0S0`zw;|bc}It984%bE)KwXP$$m83Cw1T z)t1Hzv}sU=OB8WagLj~eo>$%sNOqar1di*#nuh`0VKQqk^ZkG))btYY<9}e+c0lo- z1d2w(r)M@#0mBLr&`-$TkVT!m(GbZoFZ{|l?Ot9#m9g9HlTSTrdU^GNXn2}LQS)h8 zS(zI1=Yf6$=Kqz>n?UPDj}G|tYhry)adqq%T1j#Q&PBLJ{5p1=!k=#Fre2hJrs_9{ z6{>OaJ6X6+$&J+p4vPC#wm!v9-9&GOOuVUjIA*yM*w`1vq3SAlNZg{H09~tDEnF&c ztQp=F`?D)m(W>H|8G1g@*uN~iwP^BI@e+F%_7Ts-r72GdB%4v6bmg_?bL&N@5n!TN ztXZ0PL1=!3q?n5dAP-*^l7fAM!B8;Rmtxo<;w{ZzWRDH@6^OI6i7_o+vhdG}FK7qC zcg6R#U62!(aUz`4krIy~x-1a8_{@IMm2dnGsVnd8LZx=op|29Hs^}9g*>Kj04q7kPkZX6!o#+@y`t^nN7fVW3gPV z%mj}+GJ)l1&&g-|N7sH%(s1#oT0a#>9U`kIqf<^j#*}7PCv!Ituu&)QDFkNGuR;ZK z7Wv=-l25}*uu%!9O?CHT!b`x$C7^a1!1LljMQt*;tp}9;whZUrvY1svOmbR=~Ud-+1+w}A*eHe z;>O6+G|0EsfeKFK^m7?%QY0V@lRQU|;1lX90RI<24Z*Pty^e~76?8qAdh7=9TRmzj zrE6hHF@P>A8TxqgbmXy>)C(Cl%_0@x9|K_(F_C3~OmK2{5@bQXX**N`o8w9>u;Uip z&n*_(WYf~b!64ZJypgHnr{Ny_4rZ({!t~X67T`>{nSxukEFGI(={pZ8~axk6C{rk znrUguH`XHq!avd^ad}gXM$wbTL528mnx`Sx%GX715Wdb-`LlU$E@E2~sJJdW7((x3 zwdut8IMqNgGP{pkj`g>7^hqEX-`AO01x(SDR`V5t)8tSIRTa~#i(_Q7?Dieuqc-bN zTX7nJJ~?rew%XaEg15<&rWgNITLMhgCCO1#J}{W(sEUL_r%+)hKH&DpBOfp5TWoNk zWnJwg!0SQzn&|R4z~@CG;y>jRC9qr}1;cOAqK8>zE3va4ah4@{a1zZPl=H(P?(FJL z^97xuP$*Ir!cXfzd|J=1iYU}~30fh0?N@kO9QPpugmdE+rfiwu>wQW*VJT@?L31aS zIEQwd08}yFI&Yoj<#{@aIQ$ls88~qIU!ZppmM62XBm8k3MU%8+^XoUzi%lR9#d%q3 z6)Gd^=boG%!xhq0f^7J7*Cw#&6l9Z-j-rOt?MYwK2U33C3aNaIxs0fn5Qt7}3-;E`j4FsBor2 zCJ1X=&<t7~%Rs6X7LB%$6)UTmC&f2NKvcE1`ylu6~{TBCRNL$_EJTWD` zfDI4Kz?W>?N_|wDcbFS>B6+bIp%`DgV8A@R5TwWH1$2#$6PEC;@c-KvznE(6zhp8S%I5hu~M@7)Jr@7GH4a;K$-C4u0sFJp2L5z6*kS zzKr#2Nun|?FQu;4O;oAPc$yEl(fIQM640L)Ko#E_31zZ|Xyu4(nNtYQF8k-~vM>cL zlxWOJ2`-*M(96ETt+BY2gOE5myctlN!^V88F+aQl+!yx>5-{?>!a(wZwCK%=RG}Tz ztRaQ}fQ%mkD~HKIY42np=tq>_EWJ_L%dtc4z;<)US9!lb1kJcx+?bOKUlTvb83fme zvAOHv#~s^qe}eH8?HwOgNy70^BqB$#YINKHcwP(?bJSu23?32B^EvRc7@cR$@UEAI zmGHiNtb~9GkcHm5&eJ3k`>V)2K&{j2pp;Q{8bfU9RAbuqI5$~yTR$c0KSH*hELy2 zvsA9v$DUpw?jBvFQTE+B>Y5FMD0V|axSX0%P7#V>8I5_rN5?opG)^bvPF?C znrt$%QDloDTc6H!FX}s9bLDwTyS5kp=X3AsobwW5;|>=I!O-f39W;GfD#bcq&0Ys>vzNwAmZ7LOJhkm5*lIat03Z@_z9#`(`@}`l%PPTV^Vy4+2c|UcHr8vvh+SGUT|t@+s9Ns?EyjcI4WNjv~^8K&g@X& zD2ZvxM$(H?PLIp@R!QfOFLVZqZOy)267g3X1o4InP|M2U zFz|b8<$M$T@5*7Ct-pY%+mrQ=$n~30{#UKj2@9V^DrRS#=`)>8cNKcr8A#g?f1x)P zE1=PYfWNrEei`x%bRbnA7SGmM-(!H&Fca(KIwKs2+i8axluQ7wzQC0WC~sjU@DaMd zMY{YdGBk{%Ijub3O&TbTi%f#=Sx7&|_Q)ru0{3(Pd)zAl`*fUTcXkxq@jY{R(rQ_R>5&kIM*oLxHFoi@c$)3&i@NH9z(Jz^{qeS~Gb$K%27ey58dx)y6 zIUy75*34v?z_TF@?un;%n9>QRpCHw2NA`vfSbQcW<7~3&4Ghh2JtOc%es&32)HqH9 z?S_Y#yr5H{juIME6PeQF?=8p2?pF&!A`HEFZ8F^(I8Ley_m~n0jihadEs1==zxl=0 zV+#qFz*EVDterIQC_Y^)MrOG8CIzvmygNM&gb&CnEhcaV8Za$krYfabmu_|+|iI%s_2PCNQtFZ*W>3gI;yHIiQ4;e4)R<40B zx7LHAGy80Tj z!ZJzLj%wjVX=XcmM4f6yL8t%N7Zl~$Q28Wud!;Mw$Q-*g9XKW@L)v7DG&4p>%H2p0 z8ztLrG#Gv^P2Y{iLP>gSH%f57oo-%6^K)VEc$|>TEjM+qA*FGmo%)W@m4i;>dc+q- z*596?K-)oxkFt*bg{(QYHATPEfm@{bJ;)UWU6Owfx!f!cxd75JZ0c?U{^W28A9C|>ipof|Ee`FJG;6*!;pS3#d7wRE65 zI7w3OMKjGD35uUhEEH{xXe5MGyOd?*dCl~VSjF^MDcJ&Aa&0)OEk zT~zMxgY`m2^U%OIB+MU@R_sTXgardobuh$!o1lK$)c%VO1mfufjnC=prT6!v45*WS z+>fTHj||4+)iX#j9`m;q84oM*UTIb*O3JxdietUSi(m#P6cLKCC<$&c^ds)ProfkU z$S35n-)ZQl=pp*W!y9XHtkl^_EVE0x(uwN1yg^@0W9OR-M@O+8X8Hm0p6$>i{ZDEOTR$?jPZ2$KnOa>^vJqnz-z*MW7VF zFAaVJ+2AH=`Wq-6zAmkL15Hz%8cPk~-eHxW($8aG0y{{e52Ai>u;e*NR_bMG+Ck!q z4btU> z9TV5=knSEvxiD8se+%*OX=(gh$OM;5PrZfq4=?dkn@KB$Th|wRNrKhCnM;ird$%{5 zl&620uuqO38Og?~K~Xs>H*Ey|t}l!))h^}%+^?MuhTwXRn@YpG*osJ{WTiz%h~ayr zEl1F7ctKKkp^A*BMu$=W8=VxGg-K}}Yx<(N5Wh9WJVNf?6Ree@2=VXwNLdL+8YfE2 zyHEoROZaV64@;!y-X>}JbDW#@rl-cpGO=HB{RZzURBT8(p3Kws=M;AtvW zkfPo}qHNxHyh>X44q6NEO8ikWez7$9DEdv7If*?dO+JRQiPu&hL%UNIL71;qc1 zo*>J8c<1#KNDli=(b~Jt{{;7&{y|7ha_4s!P$tENC&C#Mp=TxGjwud%lvW#_4`$34 z1LZ=)k}81H7<|=)$H@!*boekuBy^Yp7*EM%eA(4S_)QuVL3wx=AQB>uA z*LTs@7*)t=_c`(SbRb;ainpbmeV@D=O>Vvw-Z03!K`Ob5^4Z}-HXo+&!}8+E>=}0i zx~ZfWt|DE-Cc`f0Ow3>hIQ!eFXOo{_*-Db&JD{Y2^VHDuCu^aFkpRPOnJe>T!qIj02Xd6N1%vJKle7T@g)e9+i*L>c3| zt-$qccxS~QQb~?aM{PVEH7d}QVLGF4DRmIEx(wh%O#*2wILSL;>**1|Kl{4%Q|TEp zA?hTVHr0o}llERmR+(YEwOhJ&9c3AAPh>yI;+|LNb>o14V`&uS1G7#|1b*{{0c=1@ z?;+TyPzv^-LRc?t>_J|B-8hX?AoC01Rpb5;cGWll1Hbb-Af7dF)Jb0V?}l%PFb4H= zY%d}kO*VNa+pBOKPdv0LntSz89pKPF^0HZ%ME-LCXrsvM3~Reo^$MdiTwD>&-5yOY zXn*vl9(t$KwqjCzLr3b*kc_@I9C4=_o~=(IqWKnJGQgyv4l&CSzBap#h`)$5|U z<;i3ix#q!P?Jr9AUSZ7I_7|fbRK31Sa=l7aukZSkYVT2Lc0{%J=$}+eE=e!H%GkLj zmx$_I^8a|so=eiFuQFNMo`~*$8erKLDXoo3jUxN9HpUWlAr zPu?Ht*hwt_9$b4@X0f5O$^kR8my( zdOAHfky2AEO1C+MW#UOolT=j7QWH&&rjmJ-bF=a@PvTc6>(l z{MP5Y-t}MaIvk4b2t`{f3dVZ`5H|0igxe3M3wJnDh!hxrRokowG}&l6U6wZeOTDjn zWJWm+b0;z|ac-nE9wZLX9bNxk**WtABq208`srw$QS_u4(Zz8sMBccMnMGp&&r}n$Q55sW%oR7dX-we{-Hv zpa{QF59KZZX(Zqu*v#d~xf*p-W7&MCN}ZrxG7=Xlfs+MBIw;{>mi+eo%sz0T>h<5i zpPvs@6J1@#RZA~IV5V7C^5nb;QZ&?az|O9Z2WKSStCbP~jE=WJ5r1OA)$)#BanJIU zmVt8)t)|ennca>uN0Y&=$npf^8+9JqJt2A4tlKR>Q&lFb*Tnqk6gu}G|Z_Z zQ7Uy>eH>+_p3pfy(@Q~D@B$QahFL*RmK&eYK|J2h=@ZEg^lXI0^5r07UfqrC^f@g) ztDZ&U4H`Ytf#N$r=chy6QG0|sCbGKn5DP750vlWYgj==v&8VdZKw16(D4RA$v`llK zZ#h-^9!TGXH`;lOY+KP{qm&#jv46Pm4eihC&_Q0w$b5sAP06-VZ#%xD)^+OnQ3d_9EGNILH7j}56N+nkj?j4%@J6k1H0^hnH)cO1wZp-&dO4#e zvFNWfip`to`-!$VELy6b*BX(DKg79L@^5Mj-=ZFc8det=ZeOn}IOGcg{)^sm#n$Kv z1{;GOH^=b18LAwc3={(yvUC~>6nmOhvH-O#_g6}a5APS1o4bqYNv9SO@8tj6fKM7h zZtPS)9~Kv%QuuNHiKwC%A`xFuGC3%oGFL7JzwdoY+6D#MqS2~>v5E=lL6=XPN{_GbXIW<#?-uiF2V*sM1YKg;u%-hlj=sKD`WM6`jasI5^H zP|_eDYdRJ#t#PiUiLb5l@*sTy?5*s%XaN+L{@sFa8lj}aP0J;vW5N?QK7tr>>Wjui zvU908A`SNCR+w9q0=~~^Hm7YE#isI=W;~XoC}{TJVLC=}$@otuegbC*2D4lXk&gT+7KzziIX1D{)~h8rJ&k&Y7>F_Z;dC>8yJ59{q^W^hv$ zgY1^wL9vXZ9QE@+vVvoWp{UxIQDZf80|uCH0O?r>T>JHZeBI!Tu(z3-^LOc_lJFGp zdyMnp%sB0U0Ez&GyZJt?Ma%Xihm00$NwF~=MEAGmJcZ~@-tmxFjZs47?^$pC4H&S( z8Y>Jef$k-MXR~+8R~r!fq(6)e{rRc(?_WmQ4f6R8ca8evg}{ z2#Z=eVskH|=VE}}ETkZ|

+o-UawAy_wjnys<|7#?B~8eCt$GVURcz+0@;>osq@_ zJdqaC*ZcK{m?aRnDzdLW4ZCbS$PqWK}?441NXtXu3fS{n>K*TvZe%E-878&yP< zCfkf1{4hLO0GUryXX+BlKV^Dmx2#$8=Z*wW*fLPCq;+m~i97xugwW zP2&E59~Te;&-80(9vMe!0O2*r;m1bf|Fm~Jg-fHSpRU|XyZo5~o5_LqR)pkhrUp2; zpEhuiT_XJm&Mj6`V(JlYd=hx$^<-%iYr+dOHWa4u{W>WQ@H)==N}Ln}#s?(<0=?oH zmiMJdwy=(l&j%#X%Jlq=!%*}y7qC z4U0a*D`NV!xHZYx;qZw~HY4wQ8X-&mgSw>=RDSqi6ioF_prp3|JxttRoZ{|T01D#m zHX}}sFW{J*lYD~GlkRe5;Y1BH{IMw3Fj04 zHO%>C`@?uARyJj#srX5K;!2nX`~ThL*B{*sWd!vZX!)B(ECXg&@fsXW3+;X+#TVc_%RSW5L%018LBj2F|gL*}iANXN3ATsJDXBt!^`j)HU_6!B=&M zVMxdjNRU#1{Rxeuo<=M+MREdPq0`HOG>MI;+(g>$U8b000ws=y?S_ zNtH=Cu)ZqZqe!xnB`co5Aw7w&vaI9OT4TEOhHkwZPsQ~kpq=Q&B(O|EXQ{jVV%G6r z>BN&sUp0-ygQWVG0p;LRj6JCCH#(_j)GJ2sD1j}C1A0P6?qv$4Q(__F-;Fx^L*&3D zDr1r(ah^tp-Yfc}0UdRK^a~jMf*rAr0J2&+^>JQFTWJ7UoW~P+m;4^%NOx?edl<6j z$8czF%U`8hD2yxojKp8=z1B?i4F4E)R(;Uq&~>pOJpm|}IGH-#WXqX&I~)KK-$p*_ z*ZM^0y|;|2NgyqQy6*gbT1p7-rm@7igqCaV9{pL7GWz+a)K5$qy2~+)OOfzLdVEp+ z#WaozsE?Sv&8Y*H_6LC&Tv$$G(=%0o%E=1K?+Nl7T#P)U!X?je8|giNG%l(i6V}?3 zR=_TU>z=B+irgf5y&7o+;Ma!}H9WZw&>bkAIlO@ZXA{WwZVScol{`X%t!~D_T1!m9 zLKIWC(6N*uL1%!8;xCaSBE7dl5Yj}}CcEw{eCI(u88(Q}nl-{eCp2^d#CEPXkSIct z*$8`#I!3V4-*X9yIz>p*wVIq>XCFQm*S}hQNr<7V`Aya8MjvQ`A=uLrwy-nG1y_IltB%qNc-@iUO z)rAkL2j3tOt4G|!;{Q-riHS+ZIB5ov?h8}+Mwec8yX2xwO~YuAio?@FFE!0|X8xvr zBU-9H6Z9lFmHG5o{44{0JSiEE(~ApCcooxEqeUxxbx08TZ%w=`X0{sKhta~KbKolv zYpD(X?~lBB_HCtv56%Co3~RSnM4L1jt%*>d7xp7GXHyfkcLemV3_mC?pK3bANDly7 z>zH}}$$}EwzgJgSQZ2W2X20xlkw?YwDDtSdFpWg14R>(cF7>qKfyk>Y`0eXet=0C# zz%n3-TXwcI0v$e^2LpHD<{e<~3_J(_pUyr9m#-05y}4IYvbS(dfB`*jFw`A2xcnr- zAJ`MowSkN)WMXlg)OqH&six?xBRH2kWlZPur$PIXx9aPv(eph9rqMYQQf z)G~~S<^=q)niIQ?-ko%2w|XNsM?)v8DK;ndzt5N3UMROw@^3kiIpcnog7;^sO0!dfyGL45Qw6hoEC!p`&4RtZN9`zdNLPpP>D_ zhoI|+LZiay`k^6c$BEGFFxqiq2)g}c=$$aS{pAp}?^5V2LFRpzhM*-O^A?mUF1bO- z9q!+D?AVRaqgznAa4gLJeKWh-La%N?2^x~`+Jc$Y4JF%J9K_?W%3aDXwiF z0xte}cxY?5rGFbTilj88s%<*g&acHxgqGMmqN$8 zP`rlZ-*=&8&8MqF72C;*)3%dQKV6+Zc-FlK!aPanJun2de|5jHb*5c629Vk&l@)bK!GPO0dYX?do+1EQz y3bi0a?Vow#WBr=aiZJI3$zIv`I_aG^9=F56A$q7FtQh7TVegg0xRc(HL5xKoR8P z)%zeVT;!Kl=^)XFK()Yv%g<6k;DS^HK?SAv+N+2NOF@ulA9`I^d}4ej@Q;gRW!9|8 zCue5Q{_)-WbVO25MN+NfeG@za2!HKH!p5R};kKS$qzEj);yu>mN#UMMknyWJ|F{ta z)d=R-uuwBMMGAnV2Yf;|uAqLf{bYvR>}d{#q!3DBJR#qxF?G%3>O;drA(!lKZXO=; zNKy#^dt$kvZel}yiC34rDIJvSa~OG6GXc<{Kf|oDLbDu@WzPU)xV{a)&WxYt95%Tw zB`meU$e<*~pLgX#&zm}zB1;XtL2}do+S{)_0YK2+*6oZOXqfZ-GfAL#7X5Zi+62C- zjDH@EF;#iqbdX$N&tn56imyue6^$-#0ciw~ey*?$J7359(O9n3siIF9myE@wRl$#a zMkXYY`#2`#lY0Km4ZU3Z+ARo_mI9i`)K%&0q!+-~YnGJ~xn!c03XOW;o@&ejX9_v2 zksJU<`vy?NA2@KecB+Gz+~&RZD*HTxTao37#&r!I#y!W|mn6*qJf&l|zy+7+>K>X= z?>mFvWt8BHmf|)och2aLYT|bmp2Vf%TN*9}X0^To%Fg?f`9m6MCZMTU1dlzVzzwp! zlclb{@}RCH;8uS4S>3}}Gh4u=p9ab=Ue+B7dFP;pGlr07PAIeqn2^U>seZ>e(Js}1 zZ7HfA=MjxM0uMpG>SK8LkTY8Q2RbR_3SEXWy$?=QN0>u zGEn_HwlNnpv)izr)bE^!$v75e#Je0`Ew>v(lG!^Du$KnG+gdT z4eGRijo#eB@+Z}sDFxw7TaS_Zc7#9K8naSTH*s8j&k3evBH5%dj$#7t z>ZXySr6`<`b#g(yqx@qArCSSjUYE~RZkDdoZu)sTzYz7>s;@W6wLc79u0|S%#)snfPW{F_JB`Qejg`@ zl@BB70f_3s%Q4bOFb@I%|GX2nd|Fg>&3W|_wPiTpt7nEwr4Oq`tTVh!V~=hgl!7F0 z1$YUdTApYThYQQ>?SIjpErQBxAekVT)0z###Y(O79dJE5|MDf=!8|m>(j^=gIP?vz zk@4goS_oMGqNQ=*A0X!%Q64@fErNbp@Or$C8CS6SAb7}0Jp{7H8LTS@-v5=QGx{8a zNz7yjl+*>D7N5}>Xd;#bD!+$P!@%I_FL}I~9Y3vJQ9#LqQj*lsvf@$aMusM1yO#&) z9B@C5oR3>zT;&%Qa@7bG9Zp8Bs2meF*!Uo3sT!1zPT}UGOez_+=dCcmJQw^Y8Lp^p z0OE%5)n@WQSdS33>vs*SxMVWd#8>GJq0n$y>d6Mrqvs)?KcexB(o6hfT1JuuFK58T z+;n^Cl^pj?>%*q+FBUrh=Ldf!n+6(WkE}@iKY2!$N)5mjNsl7VG&KeANmdy*qVfvI z4ZoZke_456#hd(L7G%qjFpggSA8Gy_F__er0`~8Glcwea&Z5R|9fS4Mo%({tFz`=u zyf>20Z3M`?=_Uy z`3vf;T7Ap_iwqzwgy46-{M)}DoZ)2KQd4QSR;q}bKsK|^2MVp4m;j0ZWrz858jFVO z$!RlMtQBL8Ss=Q>ayzQdhwi%Rg|oiQ%z+dnol&YfA@E%7_-U4dMdBp$;_e-;Ibgoce%9( zLgcK_dXmcN0oED68m8skVyDsCVCcCa&NLt+>y|T6IS`rTn@Ji^#tR_x+3HkncJ*<# zXHM(-rEhg)gThsTLOdGhc6*Uj3^lZ)IO63++O;qqW61-`>!6qc2HHoGB75r_P>(q0 zI6lA)`I@JhI(f2sMw^eq>i=mS;q_K3Pk4RdB+F@~;!Ozf=`2B3vddh#3WEP+6o++% zRp{oYr*VF-O^;}8TesLcnhKX1Ee5P+16Mc#Ag7{@TpX-_oYV*ygJ;GKynsGOY69U^ zDB?>~$H{54UG3m0t8Qry&PYdD`mu`c0ws& zfl+3Zb_~F&&wWOude}PCx1?+jOEs^!$0QZT^J?}WcKrNsz4*D4_ZpVIOIp*eCy@n% zjU9Hs7|by8{--duln&L-_!X7L3&i)NcOsHD0Di#wfesYkW>7G1GZ~4H<V8jv=`cM^FE?KSe*;=#lLY-%iU;P$CgbeE$VIkAfaz75^g#EI2YY`WMkyvC^ql) zlMN_bd<{1&e&a@TKH%yFnMsY6@M8%y)rq5v0fCZpD=s7b2+^gi2a2VazR<0;i8+wAO+(mgzu7oDUh4f;;H>3W)x!oq`dN(`(wtRVogB=QxGdzR(s!p16P69GOrv96 zUVSK3{5ABfQb!rn(O9+Cm@h@No806H(l`vFR5ohCQj0I5-}qL}{#iav==d<$Omj$T zdGVScdtx244IjC02_HYV8i9(Mm_+ z`HBBlKF@$Zw1ZRt#(-eMoZW`4R#vU#m4cTHAd7Q(D)>^sV;td5FLV#UR;Vf-smJqM z|3Pb`EVJwbI;y#kTHWe#ehT_r%`(}Im=n?kzJ>!D8E-KlS$)WqQM~dipe;B4Oh39w zXU{vWkG|OCT996ahW(}287VuSUB(LMaz>tJ^XM*$$Qa<8)s3bC?WHuUGglqkvZ~|aZ%%#cwl<1 zK{%yXn_EG)Za)^5P(>lD)?O--dCs#U>V<^FX$3vmlOFc#MUNhK3XMmNef& zoYt_=zJjG=YLIQ<|3#}t>SXA+a#ua@+p*_h{c`|TT3#8l>uxvLaa2d`2w}WFBE)t; z(++_29qSEr>QH1Z*=971JP=%32KH+-aC}(}?P?Yyww61sEu`qyGO-zYG7MhI(MwZU!^haYhW=kv74}^4+ zY^IH#yvJRCy1CxeLx;r$SJYcV!GwcGYkO)_sTNoHpTuAx4cTddg8I6Oyy4cu9PvUb z5~9;l&E&YeM<>GBT<{&8SD1Nr67-J%Jfh-%UmG>VPEM#N*3qHZBOc*Mv-*ta7_7;Z zW>Mw8Je7aKrIXz*xx6r`X%tA~?j*sB^QJose^QT%mdVEj9i4w=-kDAovJfCuIpiUo z_^gSnW&7$ixZU4?1(E;K#E%vWml)iq@L$Wn0)KT}g)`^?cjrU*SG#v3;gtE0qVnoq zPpT0ug9~uHKU`$K$;C_>E7p)d7rR!Lx4BhXWZQlVMp|aI@dD9@*k}^ zAbVVN!(tgow=I{a(IL3$7HQk3c3Xy~T;(8O>rk&-GQy3kV5I{ZOX6jUr(G;hrb-p< zfLQLm-^yIL@-5Blrb9{5eW+|qkO3We!qD)(!4;sJiP*v9zJtm5Ur-f$KlRZ2sVF8D zBB$9uU;q|_TYjk&OmuRk@w61sg2Tn=S%IUyC#Ej42GJ(zst{Bm6ng=0djYo& zpmIEs>{X5FdzkLQ=eyOD=|xGmo$VP;^z)LOjAc0)h#pDwNTvsKASVNNy%gTrjl>U% zEFiL@y#SUB>QxYusDpEQi!ubAMF2}07ksjzdH!R=hIxb}@wQ%U*@nb72h%mZMfWcH zX-dK(>F%QQyOH=K9dg82_5j@E5WK+(OC&NzzhO+YUniF^@9 zFCR%jhu)9yJ1IK!egay2E>aOki_ayXQ|BWsadhf@0&4pva)2Up+cycQFY^6PJVfmK zo|4-%Vg3-sXn&+&7tR-={RyjDUWzpD!r4jm+`bDt(e23FyQr^kuSvSsqxAmqd-T1M zK%xgCg}bp+h#pA4chSyRWcqGASlAiMxEHNm9qEXpwW|l+i~6H+WRCRj#s;B3n!tB= zMofEfj(K-y0_u-cQzG;qvE6&qwk?t6d#IIdTQcuOUB@Et#!=U?1hnR($n7{<^HBoY z*%$G4QM9u!0kvF;Jk^EmLd&HD^ln>Za~IA^qUTdxI47xnUF2pLbwm5Q{Cmw>_g);C zBPDyW!MyIh1b!_N+)JN-?TZ9*?E2l6yXZ>c*!2Yd@=tg9wK#wIrv!fMw>#W9@^0C- z??>V(j+X7M-xB_MY+0nd8#|Kdd1g1xL9s}CH_oP~ecd=0osE1MzdE~iAHBE`aqYu) e@xsP}gbdx-5qWGM&Js3uBrq|0V9h?P7yloxjq}L> diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1979_09.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1979_09.parquet index 32552c282686e4c47083ea84ca7e36d0986d6658..c8ac1436e26cefb383d6bc89ad2d82489977bd54 100644 GIT binary patch delta 1260 zcmZY7Z){Ul6aesZyIa!kwRipV-rMW!sO@;$o_4wxQkKd{%45rxxON&K4%4tYJF~^i zMUW3-oE;;WI5uL?i?xYEO+PW?Ah87eS1C#3M)v{<5rRv$!61YvW^*ROh_CL8zJ7SQ zx%r)Q?l~uKJZm4x+P~ge&i*!vxN}Y~x5RM3ZQu+5nN~XZ0~XE>mcPGYUvCvi*ZB|a zyPtcJ1OHAKc-tbmf*uG~fWHF{nqg1M-n#|%R3I_9eV*Z-HlA$}EOJA%CaiZ5(WE!- z=&R84?&HC0fCNjVS#FLH!O(uyy-WYj_J@aK)}t{iQa@cLW-UA6vMXzd8;<_BpSu}@ zuWX(hN;L3EPvXmld`z~HN}e^K{id)Kj@a(n$*+8!$Rd6s}o8eNH3hitcNtby!A=ssWRJ#`%+@KwJi?!Dci--7*b%=f}((!bTvvsohJ z!GIZ)<7}vG;}@pgLq;R{-i}@Y+4voVKRIS*LW8)+PuAF~uRNsNsXu3Sqm9*!N%pC$ z`K40w)@H=e;ozqL%!Wd6d@rna(7QyY#B zJ5L@%{DRW~{IIkw0N`+{{{nD20I0o1r+&sB`>hr&7WU^i0eVgc9b(v$STEN1DGv)IC=Cq?b!Bg(pW6@5f0 z@zp#3y69gY@#e>fUt!b{SMQXB-dkQTHLaFn?=6>jivB*Uj+t635rx((c$3<4hl1@@9NMM~ds)FsYereh(u|o>OG<=RW44&EMzMjpWy~c@ zFfj_0O;H0*YvOV2VK8;Z=&&-ihM=)=_F=S8UkGuHN_0&V%=TbI*aPTo>Vw`s+}xb| z`TlV}Jsi8l4fW5~QRd|%JIE*+)x$r*tOd|32L{4HR|Ejg zj~J&$j7Yvz$q{4CSQEd#i^L0&|IWLW+&fUSDUQ3N5c}A*GT6fz8%Tp1jp{;`XMRi*UkimwExSvL!QwxcO_f*UV38$GEHeD{dOIiE)e z+8a;UBX^!)4r2EUn>oH*VUBOqo@0#xDq(;N0c=R0jHQz$$(%X8i z?;MWGZAHr$5*xKL-{@^Td-{PujdGnTOl~;3O^fKg7n} z?5`&7FV=jP{lsKj$6-JDZpsy`NxFhKGReee5ZR^|Z()LV=(Sr|;&%dvJpg8_{MUe} zl0Q|;2EAQ|ji;4xl^RxLxH_bS1sW*B!Wkvx=}TqEk1AoCe!hYo!uHr?-IwT} zE7-{|jVntMv@L_1QG$juxCN!@Xa?Ja^c+&==IEyx>=fqatK=@-l1qiVZ#5`u^3l;2Qz&(iv_xn#Sb)1>>;ucF$0kwS2dZpRFcIkHjwzdMHzT= zu@Y$M5LKk0C5g+3#qNTk-p{3yZr| kUZ9ORyk8!(J137>q^ELNl5aERRf2w1UL}$nIs94uKXSoEmH+?% diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1980_08.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1980_08.parquet index da8960319c18bdc0bb8ed5c72f404cd36eb805a8..d1f4a143b96cbcf1723f48a39969f001c936f81c 100644 GIT binary patch delta 5804 zcmZWt3wRS%zW<*x6K3)_Nlr4;ByH19+JvTULNlc)1X@Uk6bd#$DitXOLvD*eEz3h5 zUNAReQCvhc$|$}zxJ5xhk%x++^4bMO9_zxgEO@QRMGzk-vbt)yC+@x9w+r7lnaSk$ zKj-}4k9>c~_CJSgug*7G|NJTv)_E(02e1HwM&JN8PE1Qh!q#VubNeMft^;8d%X9T} zeNAu6ogf>4^`gbH2oeX1^rkIZswhgQ7wLkt7@c7d0}St|U!0b-%=5nIFEG^3m! z(ZYiq`E_|6+EG7%X&3;hS#+E1hGS4M;!ROSsh=ws!nCJz&_H%D?~ zZ-C#*#+Yb)W)BDlcpiL@>}7oN%xaJq0y|{d zWgIk!P`?J)X?0W6S1m5|PTFqG8}uhVtRUyKplmsFu8@niDw$IdFo+P{^4;#y+~_u$ z@6VJfb{mp1Kk^*qlRM~0-(hvm;(56oIw;$f(g>Ch%PgO|r=uFAqhhfMSMwdE(BS~{ z+u-Q^ZDwrm68GdlE2k*Y;62&}@(vkGkyREe~08+ zIBWpWb@P`er!QNpb${XYQEykE{WRH?4hSRceo>wbU(xdoXb$_#g#>FvYw$SUQILL6 zkWgj%Wx>}{Zv9FNtX7^4jR-ecs0O~nMa_y_OBqoVvezb802q4O936ak30Mtg{I$W% z=xbx%7@E*^4$CdDmNGE_bvnMbSmi`%BH+burU#04)RAr!%M0x|GKsv0(No>?o^1qq z8SpK#<<~V9=|r$Bofa#ED(kRPB_Sue5~aGY7?Hm1Xh?@q7H9m1(rjvgsaPX8_UAyQo^kIFiVc_0jpZ!#b}JAJp!iS9~2 zX{%`XU6DK_dvI4v4pQ=4>ql4l+LYp;lS2-d-PvaVdQoi|4DIcyV=N!v({UVxVjSef znfalgbLC;+ez{?0T4z6HJ-QVM>uvuU3ZLKhYbapHfqtX@e;;=Tb;Do>l22w;$WzsH zB@&6Oc!7E|$OE(HTiy+aTcB?cd^HI8LF~!C97BDMq2narXj*cL)!VdA(faK7sJ(gP zD~65-pw9t+asd18$AkeRZqbspoL=#kDx1R`yRiN_M$SQ#{+jM`R{B?H#s8#E8}}5I zD&9nbD!2-1O6j*mRm_`^W8Tnjweke`_DyjAk^GWAy> z9nhX`xJTP@1=s+&P2W+H1%eJp4=)t78V}PN3ym)5Se2eqScKB)WMLsQ$GTw7vUP>` zuPa3KlS4oK)OCf-bjzuAIl^P}{#9|CUe7OL?%<)k;?lnQD zq5RB`l!Vj8U&qUh9Q9eLn7v|Bd~Lq<7flq0Dtj-5Ff_UaWS zHqLq2;C|LU4-&}2B51^Ki1;Uyqze{+kVlrA{ewYpfGw=}9RP|wAKg6WNPlUiR4R@) zadOu@{Dc+)8&0D=_!TpB;xzBgW1`B8JY0(1CmctuLb+p$*4d#aIRdJdwhHR9GRB^q z%TY0_$Q_pUbr9R~1jKNym8tUfi1K~hmIol}KaZGbAZ$jD;wzZlOVbXT0t()vgQ#mN zPm{@zPw)w+WjAIR0myh@B*(?J)#(Fl4c4p7=X8l6747y?gjhvxqdZ-zo|XuJ^=yJZJ}a>Ydw2(8OFT;w}?zv z71rq~lf=!%Zw`Bo*JpEj8aKH>CV6@$QEDF!a%e&>9?atbmf#o*zF+HHV(6XUdovSI zf+Zl&&sr02)a54?rL}2#Tm9S=CjZbw7rGx}@oiL$MBrSNNogHd-sbt1*tP6cCXpb= zdG;Bpn!L|lxEn_It`4RQ>}%eI_ys`>5Qqi4iF^sTSZ%Ykgi$)tWMse>IPmfssO9IN z`+Lr^Y-lZyYFO8%kEq0TE{Lo43I-?3Qg!alk%3Lydgpk~;F+HzX+C;0JbnK`eQUE% z770g6L^K;6*DIy9&DG7QdO{7kO30TMXzrOt)?r*@6!+PvY}n6pQE#)mt-b&vM#@Cb z>-6TV*P^-31!m_Ik=hsQ$_@yqTi999jcmd*nOQ0AJ=AAM^Yk`deYJuSW}W895n2;{ zF@u5a?>yk}Ekvt*vojHS5g>!`44KKfpQTQq&&yY6iIroD-0MbEj$iE`5ZC7JKZ9CP zDc&zKiZ__b*?ku-vxtK%nakm0h%+fMH^8)fUDent)_zOn{p-W_yK0KZAD&i%s z!3E$In+saxoc61fW`G4n-FLuj$ESJXNlvI?ldR-TE`nxMBf50Bn}-CsY;>sHet&R6 zzl>gXLN?19QK~6)Z;n2Z2n267gq#L^++bLVr#W`FcMsa_V`4tMr?4*sheOb-f#{|w z=uGAx(g)%?lY1=wP#Z4<8Gl@4i`FVMf6T+5=pP(PqsaYJT?j(Y_M3ti3So<+&D6j+ z#51Ka(mTCfcV*9>HCvt7jaeU7|gXjj7{GZ&cJD6eITqn`g(=L}>d zCP2t+mC}{ya|P-%2Q9cg9oq4V9_dO2(-lOni@{;fM)06ILBm z$#|2SE#$d8l2b$-HHxos5Varh2sr$sc*$cO#NKfE-T)HC4Xox$3$hvV;s53 z2Az%)5}F25(5s%VWCEyN#Q~;uWOFIyqLzH=T4jZM8-bX4rYlglxF0qz9cA-IO>ua8cs}m3;FSW-Q#S<|C-+qmM_UmbQa=hXL!^bC9v^#9TwR(8 zvLm0IFun;w1N)KvTxLHS%XG^d5?cXk3|>BVJUo9WNGGr)23!*Mkv)edp1I&JR@7y&%7uPhuxRtPb;V! z;!6gDiYpvUSLtm5$9VN)nFh|>qV1nUskp~OlfY3Yn$i0dqr`%%5$+Ru-jzn+pH1@Z zMyZ!7jXdKh)0ok->^*TYT!=Wa1UpYl@s+9jai=+3)$nLD`61VD0_g*YIvs9w9*y#x z;==8S&?b}y$V<@&UF@(6vs=8y?#EmiS6N|6k`K`w>aO7pq}bK|z2L>u^CwLLbQY^F zyp_Y|5;EH4xXC4qkdX7TLCTyE>WeBKe_lVJ1Ps*2m~(J-MKGwg?;gGfFYKh?dZiq7a95 z(D~&z*ohm;?P!-g5|X>mGL4P(xsZC;-wJ4r+ey{Gv#@C*S{Av_vB|Z6Vx)iXqqr{D zEr|E&B;N0X`XrzU(i&51Gwoo>En55%6$|C^gK-si=O%IGAmk5xCr#A1Hu~DoX;2zY zLwy&ZQfLgG?_Xa6sgcfGJ+u^}Cr~P6eg&G&p7V`)7sWHuO7y9NSIt4jF=7Q_1~yI_ z45&PE!c}YvHFDt(#J6lPG&~)@Xz0Bl?hwVdig8@P2YA;84qCCZ+Obb3j<#}9Ie`DD z&I&U5$v12>-Xg!(W&ZAO0&>uX7YA_xS2q(WeW|J0N?4WeRCEXhO$@2#{e=*ALMeVQ zm%TmmpN0pi0j`9uSoe3+Og%C8&Q;zqnf0M$oijX}Er^FAkKu{VF1ttm2K6jPKSIKT zbmW*W+=hRk{RK{P(jwen#H%^hhgO*Rl-|9Z{yb9MTf>Y=);poIduVUceiJ#IZ`jFx zhJR7{oy$?faV+DGg6c5#_g)7yi`G(X$rUA+a}}p=relrxd`*uI zYlos&5L$Y%=Z)PHGiMH}4-R1r=ymmpAKr&pY4>Apy=74vSU zIe;E_$<&(S0uy_NCY-&4L2-PXOD^Pwy?BiJ!w|+5^m#xD-tO4l=a2S_EcQe=PN8-7 z{W{b`&Haa={#!Jn-pG{J;!X@+`j^)7>+qM8v-UghnHS-Y}bn)saryf!#&G|Rv8-D_TYa2oinKt-N(Jc z-Ot$M8las9U&F}rD1+aGm;&xe8_Z)L!GBSOp-dUts*V`S2syv_)mw)$9&}#4XDDr9 zK2g=7jAXUu;o&+ORCiRGNw#?3B;XeH&`_pgLRT@r3NOk_(b&WlqT@gnJ|~jjI3&Zy za!J0C0+j|sw0RNk;KfI^*VoincLl*mYHT>MGx{MlE4t+Kl zHSV0g^Xi>OO_8ygds zEm>sq(HoF8>O&L}*POZm`T1<2y2Y%qs;5mFvq;~r-}LIWqF)I-fq$CvzUOnqcji%3 zQs8>pZQn`1(z^f8?Dy_Q;_`e-OM&ZYcRsAXYSswqZ_OH$u=8Qp_4vi7vbbBg_|y&1 zl^*q?StD7m^xOcgRSStmLTlA9(F~xE8H6fi?S_HZ@BW#(jv~TmAKidlxul*ZniA`k zOE*9-t5yr$?&Y6uK=!Fk7EKA-r_Qy|V?R~bS~M>D*p+=-ub#`ktyeiKecP#4S~Zfm ibA9CvF4ebQoo>|><@Bw8eT7x?nVDOf4=^q~KJdSTYsR$z delta 5904 zcmZWt3w#q*y8li)6K2xnBss}U(zH!Gp$Sdfq|HE6NNFJ*QYobgwo4Ui!H{i1tmRVP z2xg;J9wHz`M_o{ZTNG4W^L^6}1XnAHc;2y6Rf*>ALscz4rGznaRxm z%$e_e&;K_Uj@hmrv;A?Q!E)k{NO;~;EUd-?2pWL{c>H?x%dw2GXRDzzJ<(7F>}uV_ zmgb3py&zNsc;Ko#z4Zs=n?TlsAj=|1EGW``Tckiyly(ow2+U#F(V-TBmqUYd@K<_> z2ERgVIXC&HB7S@m4{~@AV5_9!>U^eV1SIFsWd`p|Lc_b+XKQ8o(J!}FEd_wZW+k;0z0PR z3B=x}t1D{JwoU=%=z55TKO)03*r4=4(-f!?q56JcXI0(M{e{_y4s~No)~Gjiu!elB z1!ecU=d-y;vy%EN`1K-0c3K${&*e*$~|>Qr6-GS?0MW=>Hefm^kUi{P>Y+8zhl4p?L0zlBD25Bm&6aG?p8 zns65mJc{H>IH3o~=;FVe)_uWJmT{WbMLZq;*7wQ5ZnrQ#vq+S0fXh^6dNiNi`dOSc zp!@L@-fr(s2omykrvz`Jz;am&tX7@}wFpl#)7@Udd5wx(Nd-|9^0G}X2GIAk*xUHf z3b5!+_@ds#=qjV0D4N>wF_xR)Au7ZO)Na3Mw#e~6XC~-O75ie*| zJk1>wTjF?Z_fb(ozV0u?GW^;FI5 zO_}dnUf+X+?Y4h+`Sa6$art!nk@UpFRI7czxV#=+P5_&5#RvVXX;4`js)UYsJU%G6 zzuE)?U9h$bF2{k54}{=U2wW`#KkKT0*0sF>xX!^e*UdeK+A0RU0nB0G6n}*5);i3V zhp(d6DLbFix2=W&JDjou`_cyn1`If}QA<=#r%)a5s^Zus)ms=j4Rx7n&UCjpN}0Q~ z;?nMW9S+@UtvnUJIsmTM$Pe9z9Fr1ddhl=az>Bv-+m^vKe;J*2XzkwB!?J$Tq#LlI zhiz!!^?!}~Uq^k5-d)R!SG4wYH`5#qhI87UKuyyTyLS7}nx-(cE&9Ke=z7TR8ihAn zS-!V-Kt%sLBvEFNzR8Z7L~0yBX$t4F>utaC3kq8QSr76 zk0jfK*YPNki^pZ$#j^=#&8{@*p*5$^z85OiDN1yTf%Dio>;`uU5+F-n_GJzy8HJK#p5l^NKa3u zyo%d>H$;5@iI|8#WI~(pcbNS%O+sh}D7Y^JBF4|sBm6gm#6%*1@3!iq9wkcB zbkIQ$G^SpR0TguAq4$rj0RMx}L*)3}n`&{$;L2(}ahDeePH@#;NQIcFr=}Um8f}vg z*#8Kbsy9ODO65t{8SsVP{~N}y1oA;%IA~1#bVhbUQJU*#w^Vl~ zG5Lv#jaET4wwG=r?mr(eD$P@hTHIGjrk4GdiO0!1JiApYBgeVVR>H*o?!cXT_F2yo zyi*WI5r_s>6Zs-=(aJ_?1*5d18;~Afs(Y@kB)JVy_XuoiVXmYEzek=oakQTT>{ zx`cx{UC1VEP2HSK+(uotj;DvYR=~02$qAsKt4Ndlfr(Zo0e{GmQ?!73qk26%%<+cZ zy990Y&P#>mrGWIpR&opHdYJlzAuC&<^;nK7a=!~P`1XIggJ0Zg?K_K_Q2{<8GKwdV zOzT~Qy=Jl8occ0!5^+W)BkBT}l@06&hgr?{G9%ni^XLnd&AS8pxoPKu_onsCa5WXK zGq}Ix`hQN2$Jd&exE~j)^Xc5g)=bs~QE`t6%|f;0Qil8)aI)eb2m>3Ia@T9qq$B<= z&w-Om@FQaVk=}aw5F!V#-cc2R&==IVak_;HzdQq9wxcxGz13@?ugnkV~o}Piurhbs#5POWSI(%Fk%LN(t=h-5a3e6b{@F%*p z6*P2QKUM`H_;5O2E`-dIHdPK&5YH4uNiBhP?+f-qC)y^;S}HC%uebk!gGTJpq8#uw z^3T#3d{d}8X%+Wb90M4S)5pGI{H4YFcLG0kWO`N36rn03)C}?nQ@U0f2bZTVMgSPbBHG`CNnhxTh@Onr#6*ga)Mu) z!9X;50_ozS|q)$T#-?ACx9?LZ5c}LTHN35AMlz1XqxQf|7Nyu^EHmUpoHznIK9*t| zl56oi(_l%%btdwo)n^3h1VkKpF7!FN#(kCx%|XP`!P9Wq71`)y$9~q?R5IM9Iv;ly zy7KuTO`xtIuO}X7>)!frtxmk3!`6IKXS6psh4B(fIKGpJjceKtl$m_P zJMYgYcjbrg5??jrYqg9Lt-s1|hYp-;z^~@RNb$E0EPD0+bL&h}c)_>{GP+ct<1d@}j~;HMm=ATA-tihgn=s ziPSv?fA-FsA%R{Imi7nmoZ)z?iFk4u#ZMOGL!_sHeIeAsc;fo`=nmTFB%c!=f%edz z%m7<1ZRuYkbx;$|C*p?!YXl`YA`eZ+d$H7Ke-j_wLd(GNYao1t5)CC9doTF`ngG#I z6qgvt87#J&?N5lb%%%>IPARctuoPru;{#^ihqeggQa6W2K_&S~sM10#4gZjZUM!nd zt%uNXU6d}@V+F;%SJ*L(QHE1jSV4P^Ts$EI+An>LC0ygnL@&q_p!Wh(TU%{~#s+5gzvQParQNbhxOlwLsplNCazK+En_T+>$G?m-3hYtqj7+{5A9pYT|ClUP>{k*d1#<3F=MRedsyTo^?T=cKO6%MirHFc6t zyV-inde5ZP_TcSRj?g?dCl(BE#?6k7T(^7$^(;f*L)?ur$mxtw3;sLxEXw($rC2NC zjU4MmYfOAn=empDzDC_&&P+;#9njuY(VxhiOipI&53=X*V|qGUoNmiJgZuL2WpMtw zPz6Gqzq<3dbBB)Rzl==J1}La4M)#34y?eQhp#=M?zv&ESXNreMk-3JN6|gu5`kut& zvjah(EgWVx)m_gfK|Q5o2cA!n+;L04yM5^h~=)Y&Lg&PSWyu7t`oR_c>*1O>wc2JxlY>(b1sT z`>o`nHRQqd>K9`eXJe@wltA3xJK&3?hnKm-9H-Et`&Jd|q2@kT;roq7)ESt9O5BdY zL%&S)WcDZRKb5b!)B7O5dD273WM>ykPs%0;-D*;#Lh#tJdSj5WP7G!{q8TI|BMtZw zn~d&TgsoNb3P8uY7VA&icj^5`JfD}IuZA(BP5IRP3dV*msB0@2$zsjI<1%Pe zc9fdPAQhM_;A!eh6-@Ee`NR43_|~i>4Q^a4+E0wc=S8x>F6nm`L1rRTi^gGbZ0k~r zrupnW<@DjBPq>WXJiNwR6GhlZ@SN(Zm1w^YcCG@vHOtY!JAPEhkEQRvO`SHD8IDG( zE5|a0S})}oDL|%;kIioFSU9t^RHo^9H5R|wgT%LNnIOJ0du#w8Gn;fIs8rgQ4^MvXhN>D&-#-S4FTv9GSPH5OkO`=^QhznFOAV`<{y1=PfJ z+OGvUp%$AoZsEk&Ly*%O)Rajhs4Gkwqi}kI^Vd(_xn&S{33qN80D{8VT)HWuh5DPtyri!`>Ywzg~Sn?WTw@@b(a7{U!A{(d1j!Um60vsR8kDRFVsta>)L8t|JIiNtM)>vi{1TuGyW&qXD{;eoJq*@ z=A4$cww`s`+Is^bdwxZMs5(i-|%+`Y4nI_x(_I%kRKVo?X{FOX_Huufq4JX%`x9YNG z*`FPJ5l{wG?Jsj?vEQ)K(r_3b5A=PV8-c0C#)%CzGWp|qNYMViytH_D-;0_+K+2!% zazqvFK&XN8HWlYGvd53K?h7&Dc*u)~l8sDRZVk=A1ozBn={5*4%5s%0v*bvy6wEvO z?uSx)RbDipB&J%P^MFu1RU=ndHQ-hjFH!!YijQsq^T^uk%2%r7=zp+iS=Tv56E!jg zFJYK~v$1}zCeSDaa2uluIAs|NEXcwO%8V#SY_m8@qw#rU50|Q}kD8QF`4(k4)H%w2 z2x94K!LI?oo1$=o@^h4<#q%k9h4T1uTr3rEdM86&HrUgpDIjTqZl!#+4YSQvw4N%< zb#;JL0IrXtq$CJx!Lq+EOUeK)#2WnpXP_})%T8nsWRDx2F_ufIRi!RT%B+PqEn@j|rZ05q}r zCiq<#Bs6!+<`=pT)I}Kd_v2 zxk0J}R}#?c#=pVZchIJy`=cZucxS6B3!tR1*n=JdA2(A&0#9dC6*H$Ozo)ISR1}>~ zO>mTaV%n=5rZZ-T)Pi46FK|;RSOgnHVCzZ1g{5ky$>F*h+%Xn6B~Tup3q^MZOs@$V zzulkAZDAOatipTvYCaVRh?^3~D|cC6`hGNiR!zx1til?b$7iS6_3?lfgKVYL+&N`p zbW%$rb9yz{@Ffi$i@#$jyUgWiIHT6pWHf6$D<81A6Tlq@(qDm_sWtC6*kpTSpmEFd zG1#FNYt(cziB)-F-mkUe7a@}U+*ECG5I>(tIcnHg$SSVV<&&(+M;o!oQ#Khzb3f^I zgj2qJ{F#;;Y3Lr~01z#;lx)L~a2i>5dJw}nJ$`YR{_41ePq303)h$w+_1=6VVIaML zDxM|&kFM`HnCX|i+d!DBSYzD7&WH@58m>vy$#S{fZ@y9cQ+tz366gms6B`^g<%tO4 zZ7kknfc#wsZWXUQ7xM`9K^)$$+=!W(y)~}w+3Y`bbxH}F9%o3S{B~|*0yrOCmpzhK7P49NQ6r}pm6zF3=vCzk zJJ!*kX}+Ne*eYz?%tQ#_el@89k+JQx`EC zjZ7e56Q?O(sNMPzNu)lZ%k>{1r+I#HynCHe5W5PU-+v@lLFgr=N_!vsN2f+Ew8?n0 z64IJl>T^&N<Dsu48&VWSn@P;*0O!_`{r3Az3!IRN zSLxV?jREHZ!pdCXtU+g!ZKZa+Fc<9jiUFKr3YH#*_Bh5@R+ychO8+L^AICl;*b>ke z;-yqZ^2pf@g?M2hv`E{c9a3AQi2ie2A>4nHYfq58sE6XJ6D_?DRwm}z&)=mxo5n^X z*Z70^^i1zOBN&R?@#86^_7cjKNsb!fu}nUXUdxots==G{@w`#xdvyeq?THqAHL3hF zZ7Ro_8F`N1?TnIMgBJP;aN_Z9@Z%8%eYC41zV389+b=x;?pKXca-_bv?I66wlX|hj z(8kfO^U!90=sn?VwO z(Fpn|Q!%r~Qb+H^g9h$QCf<`P=ibl$Z}ulCIMoc*;!M~0Um+Av2%i{js!C-!>0RIk zrQ#{+GkLnLtcPq_tjuvjD5I0g|tFrmHVPo*E0@w7Y z#3uKSD7@8e=`p{-1Z+O-3~OOR`me1gd%a&H?e9vS)XAif1%yM%l-z86nMY4{*xJ|E zTBfleX;4vn0;dyjnv}jQkz0_~{wyBN)bFd;n+itYG-FXIbQbEWeGLvNam7^+D5*M( znw31AAhKzV36PyNieAGlG}ftTsSy%~NeO^oQl{$?tdg2*VHrYlF<(t5Fl9AP?gtjn zP(uNy^+I81D*iKz`<0D4YgW3`+A9{BV6hIaJ|w)OF4<}B0*`$zVbaAW;mKmWmu*;s z$0=Xva*|KlS+nGZ4O&2JNC=1|Iv3~VvNPFeT%iop=e7)#6fE&mpF5_-7;=3#pk-V?mCEH47lX~}wn<}v5$IY%4lvESB z3;W;R)r@6{cdy#ooKox9)&=Mmj%S6FoY>FS_o8v^)6$;+t;QGFwt=29Fwbdp1MV1& z_vD-HSk9&F9zK@uW#7zwz%>Wx%|-FbExkikSgK@mriag^n%|&&Hr$b0>&b45%hLe& zjwkGb-Nb|xp>6otd~i0b?;I?T7mlTw$5Nh(QBxDu&Opi3B4y;s1<6hi`CYP4D*h8A zJEZ{w1edEIRUEBs<&xuOWpP;?*QXa|Fe!R*i*lCBK%Xeza>)~SyK5b;huP9p5awfC z8_y7F9DWU?@)bmWwp<7MQ^iOC8>^FUfaU$3l<~aoL|4FQm{ zfcO!2518E-@{v>l{85_qCPzs*zZ4sy%~~ye&8tw-M^HacBj?s+Z235fp6nf|IECTp zF=dvJg2pQCLb}ZTHdZPE+NL3fOPl*s*EfhJuA&1v5pGfkx6*VaOb~Cx7R70#CjnB~ z$B62Rw>2HhbrGNBZ(E5OI>9;lz8HIahv?0lL|1E=#{JtO@W==rWO>@=!M#$$Cv zmH1JT6S`udip>TYS#dq%Y5-|i#8uU<{2R1?shKp(A#Irq!A9t~8B7C71g9Vom==AJ^H9XJQ{x#=U+;p>{b>l)%WGVl5g^PddWjgFc- zhU4d73`hDxE&UOx#E-@!cApsq8kLmc}RH&Vs{SJdK3&=@4QpALq^7O&c zkwCQnHPUe8o;K%7X+Upmn??7mkD}?xT_fotVxQT)TyGNgX$^KV&o0m23lN95YUq62 zk<2}zlU@e54$(EuJ-f_4ij=U`31FW+9i$17r$eWxQZK&3k#bCOp{P}bP5h3z=nI6g zQI9n4y~5(pL0SO(XCwh4&Q(i4bmdbO?*XIvk8MQ|aDQYp{h(Du(JiDVqQe-+5xdy; z0I5~N3tZ2|qE2PDIGlYUl^Fic4aJw0?c!5NQmje&crb-LLhaUF9r$%NeW4Oa@*}IV zKgnbJO2GYs>mRZDm!LJD~#VJ%O1AdE@%k{+Qr@?*gwPUP@e12>iv zrmyAv!O+o45Rkd${l`hz$2Di2HT~K9)wDUstAb$duFav72S}dZI-#S3sUh z#^nW2BSe|8i4|@er_6+Z(kz=6xJ8-VIiX`QG~(MPsGBA=8_gtpzq=ch&r(fEm(1Ek zEQ+o~;0_9<2~bPAt~>`=&bZ%I64R1fN}RzmkQhYxf@h>AlRbAz{)&Qn0V+^539r+x zS8F7@zaTgX8s-v8bbU+toL&7Ixk`45*M~!3dMeAxU^dg>la8wFO~P+Ik;cvqPH&p7 zYimz2&30a-S)N0zv^kOrdH!fduJ&WA-Ce7^o;G&c{@XMmmn<)XT*GzZMz;Zd`^pc{VQ;5D25hZkHwK??5rsE!s++0&m>rJ;)k>35On!VPslh{~U z=Kk!&6SKf0T#94UrY}}1(jOXqphWl!D>dDzZ7mD?k>%~n2T|oZz{m0^zpvxZt`HvB zT`2A@r|()$KqB76+2{7;TY(O`Lc^|fQLyMI{i>oDkj%hl_5JKg%_1{poD*U(_QFsP(VKQTxvG6<8 zeu5PD@H^=4*zgY8a1#w=0i9)86wZjM&lp$*^r~<%kl`*S^=GA`BCXVbqAFLyKLHtT z^1Ax}8QRrdQuxIdRrtkU#?vpVELL(vBE+kSgg97s;SW(^b7TydqaHOwY~PU|-dq?5 z;Tw@La09eAHXH}Q!rK~rM8loHVVP}`+3t0a966aFy^;gb?o-;6JFXm$p*=+A+*Kg` z0U*m9e^(Om0Gs|#Q3MHl#9#XG|BV!&aIuJsMJ_>EZ%yMj>D#B06N$B7$I&@Vd2S7U zpT|d(v(|!^50d83ON%5H@zS#Y!e~v9LqQQN2O+nlDod{S7SKX)iJlzTkmzsG zLW!>^hdcKaXd)up@OML*qlTmsSC?|mwmqu9 zl?^`ek7-0wB9hd2SlpjuuS4jxvO1dxrLrSCSoEO;KD;!Gu?L)$n^YwJvwwoiM@@Z; zNL8t>%5BdXUt!;{A0URClMFuZwDjpl*zgdX%Yxx2EK8K}j)~S&Zr7ev%lgU>sj$j@ zOy!Tv6nr$VHPYnp`ml4>_1PCHnoh zR6^Jn`paR);9BNS<~4~tBpTV;S@n&N{c6}SC$AFvjY9Wfm9XBE2*NV|{iORyOeQW7 zcohkabyFzm2qT&f98wA0ndBRroIjo==-o2}S{Ld#!ekiMof?8{3LPZ~ zzv<&4$i9Kl_hdO?-@p*){B0B(c9b#mGl*A2RLzTcK4wOoocQ@9AMI zs*jh4KImc6_>Y%I-TPR_R)jRi7zclB#SrMVeW8iR7z_W}KF7VYemosoc8tm7e>|Og zFVuZKbR+_GUmpSuejob&7-QuJzaIi!I2$TB&R7i>&gS2{U~A}6g7B@^hTV(wt_p2G z&R7h+tDN^j>q32z<<@N-f;<;WIzg6uZtoCedrxS}3C3*L-ZKPR8(KvWv^KQs1e2wD zz$b|4?(WIes|`% z$joc655bp)22V1XXlW>+m&rogLZf?`bn;e5KB?aMI<%^nF-P8a_cB)1hu?%Q^)hL~ Vhu@H8{xhM3%l!v6Z!>4O{{#C~h9&?2 delta 6398 zcmZWt3wRS%+Wtqp+)#>kwUw_ z=u(PSZM^}*lHH<6E0v3)a#>t~A889+L|1qHg;f_9$}fmoD3F` z((Hapv;!)Iz@$>3l0_#(HLyejj+}JK9(3eOKIv}jV-TFd0cdjfmHp<}Gr}gVQ<8$t z`sV><(lx=#zno&eV&cWIFi8*8`0kBBhnVCki)$tF<2XptJXKXzRxs414u{31p6s~X z@`q@DWl`Q5Q2Lw7?6l+yPHKUu9`f{1zLBoXt)*y~?42Jk{tUcA?oimruiT~(UK+X$ zDjd~$@qkk38mYm(q3kZTR8zeeKgi%!Ooa}pe<&xrnAoL!Ww}{A2Ew?|IeA23j=#*H zM| z41^ZX7Ve;Q%F0|%Gl+%YIUXmbK)nX6&kyOvOkh_r$-%HY+!(eyfiaLZZ*-5b_Cuh| zAQs(t$t&a~%zio$szqZB0m&Y4_?G-tNk%{-1m*3@)K=?Nit?wxf_dO~0<~W~*;UK7 zje%-@g(^#Ogv0KjBxMGud7=@KfvaXx)09&ed7eYotiDevM#x_d>G2 z$#3^JIVPQ=L>Hhb&3hr}!5}(->Tl|y+bTe6@7DTi0=czw%~+fTBSG?|8r??fq3;Xo z?*w!XZ%PbZ-9M4>ctKnMo)n-qO!{xev4gUdXye2H%zjEyQV8WmWj-_q0&JF=xB*qG zoVKv)%@PsBI(eCB)?Lv+`L?F#6;wuSJ3p~3lMhmvt#eeoN=GgCDp7q2beVvuBksUY zP^p_-o?1cLF#+c$DSZJCiti3vmh$SLBWPxuXqxykFXQu>%5c~;Cy89y-{>knh{pky zQVQUHjoR)DI4EYL9?&X~vR`4?r!^1^+zuKObFPa+Q5(E(1Jt@hT~l3~ffvwkJp*>U zQB5V_}{4-;6tEgtO zqg6$DcFm**5Q!Pqs41(*(~L@2Et3GYvTAKUald@D37<45?GlP-59+ie5(D`Y@fHpH zjee&Kz+??6B|A2=YDseY5KX&%LDMDe`H3wLGNKnPkW_-s|3wmsTWYzoY?0{&?a%-S z)C>L|;Pd5-G2Vi96HNjXCmXerROJW?mqI_bH+jC|sOjn{i(R!V9@Fxavn@A+g zRkmX~P+8;@(y0H+2qbnq$lH?8XQq>BwCIzZnMJsO1a!|wPO&Wpvi~f9Cs^|CRDaMpBVp5xCqBqt_vlP??Cx3cgHxw*G9-#JgGVv7JZrcBSI`3UJrxEtdw zGtwAVd=J>&V%hABFh{ku4J9uRlQ;|);;wPpaG|47#D#fym%KF5)%us)_;+tp+pFyX z_KQ34p+e8RIAfD{M;xB(we|^rpu_fn=AP`rq>M$`#|HghBF)P3hqMxbu`qwotdyFw zU*OO~9rpH(A?sWQL^UdTB#BMr@i*>_b|c%A-u^T$q3ib4=`2?~I4!xP0=kQ|HG#!0 z(fIgzAIL^6Ze77M6k&^?C_Z}wm{%yLtvLUalTL3;0_TNs)H=2~d4Ylw8^Ks0CIRj- z>3-ezar+r6r=mSxEROYoSvzQpTSY*hCJ}DRQ&CBDWv!dN#^CuXC}jUw21VUz_yB{y z%+}UOrL{#mClpdnb*)>JU(|9o!JU1~RAPZuS~x$4e_K`llF$o2$5IkkC!2Ur8Q#q- zUWY5>bJ`s9Q3oT4Ug**QT1R$?kg9uCZZ5NsiN`_Nsmr~0dwCh%)`QP^@HjP_Ydfj( zenp`Z>VPCwizi^{s#0pe%QC7?-1R7Xoi@G4&@ram;VtheUqaCDS+BCtfqx}<9#@LB zu~C@6`JbV!5BqnkvR9;rTs^&ju46sJAC=$Kna!1V;8*hn2lnSG9X>9B8)V+do$hG> z>bVkD{!Zt*d!)R4n^9H1yX?TqbP(2rE(qP$7o+pRGao=iyG(yd6aJtK*zx1JA)m7? zF;5Nb6TYYiUhx2wpdS2GKDZZe?7mW^=ljxy2}<9z;=7D0cewnnlJ)W`HU*W)U935= z-o|FL>{%Uu51pzrJtrSyGtp`JbJjd%mpA0{+{sj=0ly66d-OD+&9N6ita_Yq)8@~> z@#`|g|7>Hl;w7+t(3d*N41_TFl!TZ7$uO8E8t{M~s@Tjdyg;6=FF+UNHTnts@xu!4 zS>L~_u&vLd?J_Dg?7gI4gUahR#MF?t14I$1tlqyFRiY5{mDz^?bX3c~bl7MnhfjZ%~RKLEZgoDYrIr^CxlCOnRIojQkN_)k=g4aG@iPgBfv*@CLYN z%z`V|pwkDPnNa^Wh$e9S83;~anFXnp;yp1%b!BPH4klp_G&uH_5;qooQ<5imI66)=irzrQX*)HICovpqHI9GjUtmlm)CnTqMTiTT~ z=r}^*(XSMw^n9gg|BUq}P{*Ot*^>}G3L{5>IqHxhw`U;!t03-7#CzGs4IF+!zBf6w z^}n=3|EDB$^?3qrrU{W=h-p(#4iG51Nj_c;?D>3*^|`+J95eDRQ18y0rHbk_c{&Z) z@aN1;86v!DsA+W7-q0c0>`fhlKT8k}@YZC4Hf?6sIk#uf{ts{x?3 zqQ;+FYkE^VcGb?0fRF=OEh>OFCW?FvE!zE-)Wnz$odDcips^=~COHGn*Vo?#p{34u zXiOy`H_=WpabmesSAR!r)7w8~&u~SE0sA1M$}2T z#NM_EkEd~g{Ew6XlI3!9jqZruG%*wG?6;|Edr)$CqkDIoD^cqTM}|vef166C#INTd zY${DdMSHs9!D=dJ@lgXy@}6n|f5!8B@(qr#=xAKdPiHMrj24NTNDhF1(kwWeB<2IY zoihY|(@4gG$J_ABX5!iSnu-0`tPZ&SO=8}^6wXa=Td9HbTT3$T;}#r8RG#w%&iVsB zz(!w0q9tj^}22lN0ky|} z75Hz)RoyRauh@myI)M?uhX=}WK`BWf?7?ch(@y(>4isl9E*+w}f5K^{9J4G=DF%?O z_u=jb=xRyiODz>YWynQ>5N~>0Z|TaD?-pFi4fek42r1qEjPdEF3OaO|n~}FknNcS{ zFL15H`Ltc~ar?oMsxF^b@okp)Bv6x~%9K}Gkpse2T$}*4p%6{BCp$|JD`KI{Z6mxk z2&fwWPG~+yJzVl_syHL&E+l9f4Yk#B!Kap*YpW=oJWh+VSJ~uA6 zfsa3x$fVEvRlZ!9G5d~>%K5)AV$%%-Yi$`u)<1syI-0Qo@WDJ~Fwk+(6Tu_ficH&z z*xRz3SM!Jl(fO?gZ*|1zX0A;>o>9eLI!I(d*1yYuYtP-30rL3F6Rq4|IYvyoF+yf2 zJVQVPNpHr@DZ+$`z7A-J9s*KM9fJD(gypI-4DPT}|EmCY)G+<{C<;-|?^ zXO-%FsuG_{YzE*v?j=mkPM6PQ<)M`_wt5zO-=*8*OC)92-8Ym4hn-B_MWwwOFI6EW zt|4;szyh$&ZutUHGYHT}`_awP)sMt_L_Ut3Gg%*q8sqAW(Oy{63(*>QzlPvYr>r0; z*odsClA}ZL{!m2s0?o&*X$JXCYf>wtB1GY0n3nJ3qT^#_WPIFx;~`<_!1w!F5+Qmi zRz@y?=2F)H0OtQkjeoCZ4`6RG(^UZ^llPEJeuFu7fI)I77oxm}HOF@ReP9gbBYftq z2Jr}hP0-(@Kf;6mC?!)Mfl5@`D_YhQdc;|BtNlLR*G>qSdeR9s#8{<~&fI(5z5t9n_%u9x@ zIquOsh}*YTZSk1}mvt%xX(&;m#P z+&YUR>muGGbOv{1-6+$Rp2#v{GHmG?Wx5vGc7#mh+MZFCQv;DRM`#CsYJixAlmBBS zw;zpI`sf^P`_WO|UXDiQ_tBXOa(|?cwkq~L9(k#cPUrSL9(U`EyE-Cg`)C)})iKKS zs%?;h|BM(~*>8#B%zH^3i=}5(_uHb(H1s z{>anE$Uukpk20N(3=#`E9r^MYZ9_jq^n-K;xp@Z3{I(89<`2?B?7n7@&Q`oO64^OO Xr}M9kkk|aD|M5S3`TE&G`ULy`U(uUt diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1986_09.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1986_09.parquet index 3e30ed0f9136b13ea14f9c7b5d5740d5c49e984d..a52e33ce96a546397c4656158574d4441ef5877c 100644 GIT binary patch delta 3327 zcmZ8keRxyl-Tpo8dD3%|lPAe(&Pj6eQBK>0q$O?7X&WFeq3I#9EznfT>ZBBip<%E# z3WI{Et#wSCN;fzIw3nN!6Rwq6w(2&gVigB^sT%_ZQ$Xs=>zMK}RJOeelkPp;Utas; z_s4Uc=i|Qa`}c95Yp2}TPPxUl#A2BT{@4Y?zczmsf6$^Y3_K0+@wn(J8o&=&JJoPk zndr8MHYw{tF@yi*#taP<*~{HFzvCB(xdXWFL1-Txn5nR*%A|IM&bylgXPE%LF8i_n!4)nGK^8PZzjt;>QeNp z7C1agC|wsy*AjdHE%2QZxHcOttLe;`*8uWzC;02AR9KiYz9p>rCza|RUC5@oyJq9u z;NPH~xjgt0xqE;G75^WjRNJa8(E_N& zyID0aXVF7cR^b|qR4|N(s9@UsPNvEp%4d~)@cc_uR(|3{<_f6E*U>yscU~O73uJ4joGE0QLR^dxN`eCuZx?)Fh$nP>#-2&0AmfgndTH6810+h8k2Hl`LC8TW%n=|fM08W<3j6Ttr-wdFTCln9_vajQc)XEi%v z&gg9zNv$LEeeW|2qC(~Ak-NyV4m4wQ9kh><~-?^kV2r&kv1Ct69JIP}_rM8ldn#UONd|q<;Gf0p8L^$jQwd#m3w7SXw z$_l`Xd~>wBjP(^ydl}3sYABBg$kB`5;m0BI7?o-yXM9|jIT3*%`IVb%wv$uZQKMtw zvueln;&UN1rsyQ<^2fv;3DQ-2aKu=s)WNVBxJyRpAsO1YLjv>Ym%>u| z*9WN9%&5J4Oj!opW=%E;)wz;XFZ7#`axb)mAydzdTN~emw{4*GLSq9k{m2mAz@Xxb zve-n9{8`&($`JmV(EfRwDwf2tT>whp;GGV8h;5-c|acaZLe9dZAq~+o2V` z*da|*Ozci_jvXw>{9TyK#b-$&VgH|tlvMyT5W#_ga%3@-{HIK@hrJ>{H&h2_uqa`1{=8-c^uJ8&)=z38C^q> z9>5XCd%+56|Jp;lsj^n#G7Sn>X}#`sW^2_}T`E#fKqbE^rPHc9c9N{KS@n{a)$?;l zeu^NmQKrv&kTBnQEv1smCeRKDM+jozu6VfbGaWI0 zts5XaSl3-B)Z2xx7c;Ds;&hmKpTn!y6tW2wVXwc)_uvePCiiQH%?r_MnvHnSC9R4$ z2gF%qftglAYIHeE+RbPgq)WeLCU2J7IOO>5QzJJkKH{Km^nH*&yME!gu(KEyljagy zRaH_(al+nyLxZW_?K0nzhP*UtGoTXDM9ClGl^YNyl2j_rvZ^fg=WNCa?a3Trh|IFeI zM%p`i2h}6`f6O3qOoU~ON!Y@MA$`>cOW!MkzQnE4ep*ih5;2k=mM|7BX+#$3Rkk*$ zro~A&G76L0I!kCEDpPVS@h{F{CpU{*?U4T?y|x(VX9HU8IyP8Nh0^&NNc=_U;?Ast zwX*@c#hj6w#|OS?R9>W#Z^x&IjmNaOtw!8o&kPPe!A!1sC3PD>N)(D)Xvl?uZClk< znY>BMF*Wdxm;BX>C@QF8lkf~Pcr^*#~zPMvelWQ~Cx^eq_mfF041w zYPwaldE>OK9RJURoflupFXlnfp=?9_iU6F6<5p=atpWcEPgE`5|ApkCEJPzFf zD$x`78C(vs!sz>*)g>rda5=ECdjss3mOy&%7JIcCQ|0Ob;U)tLDx$h-N!%oujmUcb zEj@?D@6+Os^kGD)A>VAHr%tU_gE4g(Zl}^JImnU29IFQP{&A3_)rJOgf@6caCqTv1 z>TH5)CtOTPoa?N_mBy@Kc`R1eU`Y*iX|x)IfV3<0UY{= ze3i`x)mZl>rEZnCvLb-9A0Z@v6xZjAP_yHl=;~6JO(xa`$Ox9L?0YkvpKxbe1*59Tw#UfNHYK zNi}wnr&xWi`rfoch`cym>kTFsN6BGSPM$4R8bJ3OC(N+;T}VP>t}x_P_Gu40D;*^^ zmyy4lxy%^!fj$yWZjp|IDt=FU+xd7=#Ok|;DIL+)xMcgJAWjO(10V%RKtPDu)$@!2PiApTSFs|E4Fo!WRA|NIx8g$3VSd=gDO1XDH8uRDT%wD>vDGyJM9 zkQpB%|6^2IbVb3yrL9y`oj7ib2RS?v*Gr`Za4EHLO=QLNeqFU_p)qO!u#9~`r$efb z;c^S$s4niy`zXp^mpG=4m4s{;?ZY*Yn+dpO>QlV;)7+{(gLhB@{~O)Tos16%xx#|| zwPqy=cy&co<7TAn!xr7s0J4_ifjDX4K9q^wPmph{Ik?TT`K|oZ_A_ z(YpVPUUnxsXtl@21x&)NJlR(~bXt37##~gPVJSy_A^EDcyQEfBr%gz6P@nd>bk^{^ z2UPBT?Y+`~^O>3WlS4QAVlW|uN3QPeux7K$x3i>16UyeHmhW_wZR;pQdKBnUs0XD7 zexakxaDA-p#swr?n(qRE{#pdU<0H@bI$!^ z&bfE)H{b8^{mvhApFidnn-dEZ9{6X^B7U@KD!5+ zS|Ipr;}Qla{5W^b%+?XaE&{IOc4!%Xg6lG_xjBBk$bN0(UoA(5gVMr><2bghN@ z0lJ6B^fl;{*b@|cFh(wjz({gAXu=0(--l+%a6=3z_;a&(k-Xpm7Qb&YtC|+>{IIXT z1m9@Jy$;$PdeH>6c6;J^RFQZIwEl7wgfQ}%XU%S}qnFS$n&b1E*-b8$d@5DuQRPgX zoT_|suXBE|a!@5`z)Y1=`wStmE z_fR>NyTV9?BlI8@&R7m*WrtkIsfF;^GgMA__|Mr7K})`jW`g#>$0fV!px*)H|HD3jrkgLv>Jk*#HPd`m5Hp{ZEn+g z+NI^}wZh{z;Q}8!T^g*Ovbj_ax(roUK&;RD&})W#CF=N3<2ou|1F8tgy)Z9yy^h&% zbTv-oskAa@GBsz8x((Q=_1AKHSZKi;Z44AHIf0QhKP#cbsJrC1-|i!cUf= zis4%zJ+usz@hbt`^ljuaaFS+m#ZLT|ev%CaUibA{L2ZI(j^R<4)C@9fK%@w%IPm}; z;3+MQZ`LJa)O)`CdR+LuTNp8lFZtvZRR&O(0X^SWtFJcJ_x;Sv(u7EI0bx*L zXT00CLSi$OYQ)EU+;x^j6vFt|Zm!9J4|9y1Q-oJoMIdlS_afBDv}tR^C*9DZ-Uh+% z!yNHsMSKRxJ5|y{f_{UzLV`@y7CLGyR_kD!1-MU)(2cW1}nQrE-W2xRNI9eTUTSppjO3O@KteVu8t`UZ|} zq1bKq+mgRI+oNUyq-?MMWs^8xI;&6x6*t+lw+kzs{lB8;lcHHBzFL3NRNL3aF%9YF z>=mi?Qt*TNRq+l)Kl+oO!u;4J8|{7ZIQDx%Ys?lQ0a^rbKpO3Cl|bDKxWtK%nX_MI z&k0kxc)29wk=IMq8vrU2KN@=mRNrk@s2zKY(rWEiO4YbG*eNrZ*1ZtDAF&UjF8|GZ zaB_SR>O=l$FW(WNrz$PG-z1E^BeQl9ZRB?Fbd*Rg*rtYzuEA(G&{5WVmVr#LYyTh> zNDCh@pmG7*Mc$dK)zSnt9Kjw<&gGPhrtN$UciL@4>WVg|3_U{ApxVpW#gG|W0LLiA5)X&mHfYo_sRG!9V=>OP2_h2L5s_(@er zRu^B!k3QAU3q#!Sn^Z0r|7THRx$tlE9iy>bfF^zXoM0}4=8xZI4bl&xW_}2nK~6kM zQ_3&!`{h(ahL*ji#q+>_pNZ41kg$gf-Dgt2f?%-sT@N?Fw$}4qZh%~WT}QDHaR~EE zS+kVlmRg|TGkRT4gzUnU3U9E)x28g32)z$juJ%1=QGW~MPm+(G{pO2X)Ck=D2$k-P z?xFE@jQWmVjh%g#saUfxnykfI&3K*#ErEt^H!=!u={;7tFQQONL*i@B>cC%=rX5gtgXt=z3)29ta2@WSL|m#+ z1Bo-j_qbEHK-V;&-D2LzP2)o=jq3AM@=x(Ev7M(Gab1nL(UI-%f0#Yp`C{rCfRu@ZWPo7muZt7qqJ^O)_2%^Vq{Wm5Y@k%7?t)xCN@ySz2n;d2Yg;o zPwQuFQ)i|t^BRC~pb#V8QOGHwv2d&!<>x_``^?Luif;JdLnim&tB9Y7d^aZyJe{$( zQ+-LlLyNQr(st=dM|X2r>z3`5IL=Ay)y{7Xb8CD(rTPJ_F>C3t#x1p)3q;yc1F4H= z)%dr2R>j|8$sRhH_+6_2ba~?u2pXtFcf7*jD#KZ$?|{uEs5x+z(Z-Hm7?G}p%=X(I z)mnqrr&J5yu%Mz6s=XG^G6_~AvW*=g)E0ljh>tOr5=xVOGmM@(ty&8=XiMl8Dx=|1 z4*!8OYhmIxqa35nnuU*Y<}gVPsPr;u--l}C^vrG#Yl@sveRmnzhd@JYP=b8|Q&OEXmI zGE8i}_<8A^8+-q#eU&w+Y=qmQ;GCR7!8VgC=m zSgYSrR?Y`_ONr{f+_{tKbS1EtSVOE@-(4o+5xd%Q`Rja5Fve}rPnON0o;Gs7(L>Ha zyyxdesK#5IRAW0HGLy;cJ7@zLKYv;3jV2dIDG@XY?g#7+7k=hl+xHs4y-|4Y5gr8wRb#PF_9`)>vxEa$f&<0s`$>3Wg)lEQMEbb8cf~)fV&E9)~nsqn1x*5jp1-D|w zlWc9xvn$(Mxk>cgxL5z${ZPx=D%#$-tcdWH58wq1F0yqv2$2;{P;I`A<-PYzO0_`a zjYR}%0!s}d!byKeJ-<7E`19g-)pP24eRqI=`l9FRqE6+gof++ii5l2ILdIHT->&j7 z@ofU6597Cu>Oz7(7`VAMYIdEt!yXTC^iZ6b!(5>K)YYBQWkA$JV2@3(F)M&|=P4#5 zYd(r2T7F#w`h{P8Qyi>I9MX4|%l0#lZ8eae4D{TIbM*Fe`5U(MFQNqgSrQPB#@7k? z;-Zl%i<$&_#guxTt4KK>w33vVzY01w^xlh-XyRT=@4b3=MeWoH4--zw9C&HQmLfSS z#XVtSNTQAX&7Ejvv~D{WG6{`59x5F?svoMDfr2_M<*9#3zA5@mQW{n3qtbMAi+(|R z+px<+BG7UDgg@kbY%=}N{g-+hVALKN8sFY(%jMM7Ib5j=fte`zgVw-6YXFf|L{>3b z6j}71)_~#S&gS>dBH^PZ7YNLiQ2>{RI2Z6m1c}h6O&;Ft;=$!)vnx6Zw=S!xQTQTZ zpH+D7RU|xBMb3QM^#6ypZh7p8!DT#s011z}$&RgiUOr-2SW-0p(>?B!hU51C0`x{= A2LJ#7 diff --git a/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1995_11.parquet b/orderly/data/test_data/extracted_ord_test_data_dont_trust_labelling/extracted_ords/uspto-grants-1995_11.parquet index a9951d4bfeff3a1ebeacbaf79c3662904bc08fb3..dff9d8895adf167e09da6ecaff2fdd4e629b5b13 100644 GIT binary patch delta 1926 zcmZvcdr%Wc9LIOV9ladMk~=O3Iq(rMA>k#Q5DE%oawLF`31X!cv=&iEtfhY_wzUr` z9r4leHMB0$Q3SzSJ5@m|t&a+h#X*r4XX?<0s7!IxTA$dFqOlw6KTSHb-@EzF=lk2e z-|o(>J7isR$SM`s3sdw!|LCSlzkg|hz9*cRU_ArC@m488tSZ7ljsadR*r^4uRa4^5 zlZ!bk*X7*|c5ViXmjLvUwzIM^2_;)IKXJ(!z`>|WD>GlWqYL4!A9NmZ9SfFc0a*(Q zMQECq`5nJ$J&=n*y%wl{+!G8+V@DdflW=N9nMU3SW*^q8y_W?t%$EhakIGA|YFRS2 z)z-Lz)5zVx+wL2N_D-7KEj@}~WwbeFsH_{9%6a(BCN7cJTD@)k@PaRj@FdQYVE2{$PlpC?0o~hyR!nkA}jzo%z#7qszHP-F!!= zS|>Z9?ZcNr^f9eEZe=iluh0XwEh>3F;7-~j;RxCua`kW7U<1lc&kXk5v}3DhB2NehfkVt3PkEM0YhsGZV=AiQKolrG3ibQ=M;zHU`*vVahfvnC z^0y5-@rQ9<3pi^5^cQG%JpYtK9a?FQ2Dp(f5qMC~B=YzRF-n$a^?9J66HaM=7CpJ~ zboad~O0kIZLx7cI;JwBs``-1{^unT_6VP0%#*?L_NVg0+xvb3M;U8!y@*GXprIlIA zQpE=v#-pV1H#O*-R+>Bm4TXJON{%7AMc0yn)Bv5|J+=RugFb26+F!Dm*b%M!jYQ${SZCdVbg3>$9U6#hE9{~fx8G8 z6Zsn-HArh4BuJzp3L>J2Xrf@E7l;@lb$f%v-Zn)x{h<>3ml}Y`OtArAh$24_pjQC^ z%vbQg&A$xyN4on!?5lj|0&W+(2r@BiB;=yQn{#iW?O+4Bu8*ok*O2pyT)F zf!G{YClKc!sKL{7*jS-vIDib}8Xp;cV)L&V#+^B=Sr|U?qCe@!qdwAqJ%H@TX--1+ zKM5e~@S9FHT3EL!!+*9O+~Q=7um_)UvazrW-*K`gvCHf6&-Hc`;CoK83t2AGBys+M zW?bqbrOkH&$lbLw|1CX0N=4%QO9$$n2d?|(CeA-_3Ttx7`JTclx#T13@SI#Wn!Mig gxeK`2=PuwgKG%frL6SS0qLqL9`Xky@c>#E@t-B}I`^p;YUm)ymqt`pMlU_kSlRzjMyL z=gi67eA2b;q-*31H78yKV()#OSigLn*k&e1gvbC;e+oXBia8rA99IOXSj1hUE9X#( z2MibiZG=Y+dA=-VuHMC+8(0eHr2{IURrHCmsj%9CoUY`ZUS%Qh09}4za)F9k&Bxa? zed9eDq09n`5#&hFJ}vn)U%DMAbHP3%&|fPCh7w>1hO1A z1iHAL${Js@aO6>U^$O0Qw17ZEa9J9v2kFD|gShn;w`aP}wS)P1UiPAXpgaR8I?1<9 z3viIEBo5yBv7#w}?nmW6ss6{2%+rs%3hj@n8H-b2FcZvxuZ zNuBn{wv1cS!Z$|gO-fz{bbktLCyn~pH4y;5Pj|XM(=q-6#GV0~xBa|_((6{_=4#f;Mz6|^FZOtS=5vw{i;V-^{~jgDDhv#1V1za>o93=CQ07a zU$e*Y9mBg#i$1a!Z8zwD_Dq2u4k#snE~jQMOMQ_+yyz#Xwn$nejg)umnLPh0TjWTP zA`i95*<*|ChXS#nCl=^m%4HTcGx3LwGf`$F2Z2XYMpOP(@`jk){FtH=*clDxehSb>sk(_ zX;;Q2UD|L#zAvq7PcEaf4;IF$2T7OIuN>wP2uUUcrrL*7P7gX}r0<|kWMQQiBhQxk zEz#M>{Dv4!i?`keG4g~td}S2m3@cn(e6nMrl*Om6%(ga5QlzqgvX~bMqLosk`Sr0V zMt^S#qiG)Gv{QZ!-7`7VvU-ahYGt=9Q;IWP!?k#yqWLDHS~ul$qE$&?vO3W#0qYR& zYDQTcoEkW3^A@R8;P*~KhS8`)hSB`I5%5-qAQ@PwE@?+sr_d*Chn>^BfU~M6Oq%AL z<#Mkx!RcSb1?MqIeilXL&AOK$RWs-+3#lDj=AgYyAQ2eoL}efFA3;)kAdFPu zF&;vyz78Y3IM2hzioItDX*0i3P$S;wVaE%NXTyRz@NEw}26lX@u?|>;$7-xqtO|s= zmAu6BG&W8uc^QVz^u%K>ogL8yxR25S;Kd&7AB;A@^mxKKYB#!K;v4A$l@Elmo|eEqIJT4XIEuncEbt}R-ydRgI; z*KO*|m7dF_N!M|620IqE;r{Vv`Le;Nt@j!Z37KWLPH;r(iU21X`zG`izU(m z<)Kh2FL@(dh&-jdz(Sk-*>Nk->f8)wKADK zkG1~w&wRaZcK5p3aYH>7b_Os{&ZU{fYa`N`v!$dK$w&d{-XHfbWryDdZM(QfrliSUe2*7_UMgq|WuN`6uKX5)>9Lk!(*n|fP!E~ViI>h!4(sq_xR z7L2YGH1LdreSjr_2HM=c2Q?^6U(pg@OV3_NO5R@T91nz%5s+QGCo30!vPX++6AE#m#qxdKkuOk0P zGiZqlLaqgK1ENrVls1l| z00aea)Ei{cp%Fmn2haf;l-ZEK`v=QEH@#qJ-wB0KuL*TnKDfEA4Ac3Dm9ZL~ZC5^e zGf{AWMc>%&^xvMWbJ-+;X7_n)EckV2*Enhhq_y!q;w-f$IZcNR9_>Fum!ga|Nog8> zD+Nzh*1P(vEYvvusm@~*quVaNtWNJp_NhjDpN~*!g>rCS0bq^b){h<7hd&}IF7{>u zwphnE;z=wXFYEFslL8i>55KQ(=4gX9k=Z{h$yhT3g7nO->k;$-0y>cDbV*rx*a4VE-{ZDn&&_cV#c$04y* zQzE(N)qN?xp-I5$u@axX%Zb)#BzCiDyc|ss?`jM~=4%v??JGr*QW|1|141Dc@-e30 zV}v-cP65hOd5z0B5N*+VR-%1!G%*RaCwpkeQ~WZzz_cZkQc67I@n#z4!(>;ZSjTV% zza1jrHQlCRwEebC4obWLy(5#9<)dwJ<%!gXF0;6nNi?n~Vb9B@KHpGOB`Y>kd)$2D zCNrQH*Z_n((K`-yTnRf+yvnffp!gHhKex-_je}p0>Ye419v~!w@}}9uKGFL;z+01V zzv`ZvK?}k};5aL=fVR+)-UdKrp)+DLyH*?750B0O7WT;~zbNApWr73l)QDZB0d`6yEvggP7#GOmp8?reY{G0Q^+{r%Fm79?5u-Z!W1DFkx!cf-@kr3%IYRQcI=9);iR! zl>1t0hjn-97Wsv0Xpx}zL|!|u;j9e_ENWCyU&u?->L%Q*EYu>C1hbJpMOxX}> zZxn2l1OoW34cO7s9=u#^;xdd8{Ut#Gy}~n> zN;E;*nF1BKG`{Nz*N>xsu3x4n5T1hrSjUT|-{ybnjiKmFm{NxZ=RgnS^6(9<`0<^S*djjL{kf6)~!Qx*mD;nL6w;Izr zm!VuIuw$fSk~#-9gq+DhxY)b08VS~dR*KRZ;-uIiL6re zeFAz{JRfh0`_;?V3FucTb`=62dO2tNc)UADn&q>g^Y9Sx+GfM_X@uyn{qX-() zL*zox96}$7-{?$;%Lf9zOCn501v1@wQLis$57hS@s9$rSUN{%QE>3TLY0+h))$oxx zN1qUTs=nt`{e)BXou|hDBk@~Mi>Qv znvH%GRO8>oZi5a^60aFNnngpXWcOGo_yJmgs2o&l)jgk19v1nEbDmQ3^EV3&diVl#CHmE_7 zt~v|-uIA=UWCw{SllMTiILT;%--_#u>+HdqBskMGf*MdO-S76TV1SSVdK6%j0`QTz zA}F*1)n>O8z-unA6sjw-#YU5v{koQ$=f8=i#51uGaZ-DJfGRL}#I2@R;99XRWjCx7 zrDiiUigt4WY!KVbF1TOZYhD2NirJPjI9!}(sfKTfpICT|JKZBGRvCN!SNwP^+O9xS zF*>yXek=}6O@gb$DXD|#X`2Y#DJ*IG=HSUx9(JUxBm+spv*q7ZBzr^JnF_b#Ic7*x zSf~YXVb?bFu@uFiOPR8nen5-D^xh>*Tj@y85S>wG_#>K*YmWpluWmtmG-$lmiwOEg zr#e#jwV>AIOA!tLv_aaG1{zxU;V;h=`uCy_+i5^O6}S{72%w`EjATjtjd)(v)P{paF1kgC$z_})}!~)2us>;dW2}VFSkP7avLP&;-2*$!6(PWk2p9c!1 zKsPBx_fE8XmV$tz1fvJlRjE?w#T06Kx9+BjY)Q(_IdX4Kay~!bZwKBET|~l+<>z2R zM#4q3#?tX?(IMea3_FzC8&7-RT=b? zMr_cd)a4pg3hD6NKgXwva*$_Y?^@U;GPFMqjY1MjAmlbvdAgXW{Ci{_P{yg(ROrO$ zxsZMUCUg5qrp;1KgKsT!aHzXfD!$i}h^HT-86Cb^*uRxNppv zD{P_^1UcPcPx9i&Dcy!-m_beBmSBg=(k?6pHZza}%HSm~zn#5R#cU{vjbN(Bx}s3W zJ4(99KLxlw68vQd(N^Gdzxxt4SE5#R51H{Kf|2i%WuTb(nXGU+fp*Tn4~~24|r4d1TC~r%{^?KPjFGk?4rlq!DPd zRt-1uZ?TkG=&1M;@}5Osj4L1V_9OtTOSiydS%%->;@@(%U`#;{^WXO-OIYsrG4 zyajB!552z1Fs>iJ%=mca4<5s^oX}RYFbgo7y)yvoi7~JxY-|a>EN;n2QtUHI(}X(Y9|`geZQ?&OjKnYe zlA-HZGup5`76e^? z>F|7)lcsfkvgK)w%eh_kc_eIa>rn1`mw z;9B5oYBgLx&YuUhhA-QV&r)h!k|2x)Fid3;l&U&odS+eoyVt*BC(3b$6_uHcW^NS& zA729^RPQn!@S?=qCPagT;Z%k@I}T>KK42D)juO85bAB7SLkb0JvZ=6;0;p}MT`aZ7 zXMf&-F3>6^lZ{_X;3pzYiq2HYzt66fYh`$IJ+$y^EeEMclpPf>1cQ$Sg<#0tAg;G_ z>5JkJLvv4OqOw|BeEwwv_es#uC1b@*Pb&e%KR2RfwWyy7mKZH~;|C-WZwIeRymov*hZh-u099x?jn32ff;dq~gj#XA zPziU6p9>jLK?`fIg6`pO&7&OE>smQ}CP=5Nsk@&4-Udl1p({jgp`p-Kdll6rls1$e4l;jR&>3 zW|+hmK;x3SFP(XXZBn6Xl`@N9TtSt4y;Twn67+jqyq}c|-xsafX_5jn*Da39F6E3# zAfJ833jH>(%ozX^{oLeg%W&Ysx5VyjzbDPh@8oO?8iwJSBb+(X#%K~1LQ5m~Fja!m z)w`Tsm-s`ZPcXgeO${{ky!SLIw)dVc-oe$RbNBPOMPiYIXKjyzQK&8kjIwS-{dS>N zvn0b;k}GyN(hO~rCFZhIKLImqFVz*t6W_Y`KFORpE?zg|W%8iTDDhp#2KB}xsIFvj z65v0v+<`Jltsm8jk2|%nR9x-Mf`#H4XNKuLyK`p*OyS?;(l%RH)02Q7otuMGqp(7T z;>;)~3hSfL0a2Huv*L-xySaI4G%kvLCJP4W2ChJ1@Pl>TdV7P6DAu*$%yN>q|=eciv&9_ zv%_+VW?M9d!=frL54MV>d5);5Hz`@yr)cwWv`1W$=YTe`J1<^7=|wz@!vX_>>~!&V zUJBHSD%W&aA})6E(e`59v|N&G<5$$(!_Z3cn9E=o>VqRe@b)|oil%8C<(}FoGyy6O z_)T0Y-xn0Ki^Lw6E;_uyzZ>YDPi35VnauE`nAR^DW{E@lC1*H;)EZW=owd08zNEHP zF_13t=J%m$ZPy`fredmYZ>4!54faBj0?@*q7Ps~*hhg!jerX-M#^O1Q)Mw6D{2}pM z@CQ8lU~HGlWHrEnB?IFLy#N=10Djt!nqs#XqFrA8opu!<2+*4W3a`4yM8cNP{mIC~ z1fHPx|L)%0hXf4U#<_Uz+~OF&ejfgpn+@ zOMI@szVE>i?n;D^c&dLt?3%qOrRL1dvGV}?gaNh2vIQc}U#t9}x#!Pv^@C=1GVPr& zzM5~2`eZPHlW|dv8oW@vn=ilcJOh3 zx;νYe>55WWP)5b&bd1j^31;FE#{FnaN=3VU2K<6`LE=*vPgao^q#fOeW-29%p0 zc`&+i4(@1wKjQ8pKrQO=fWOKW4>+~lT^^-kRMPYD(0Hi3A63}a*|%6i+1oQBsU$e8 z>1n_uG4!T%;C&oac>%-vX&_7h)X(l)eG|U+o(40A0-+APm`)mYpQq99CiF5tc2KqO zSnqr>PkcQPP|vtE5tal|-83_z`J~Hp@}!)xs^* zTz-91JD{wEQ=5{3Gc^MK8?Be2Z=>+NsHQwX&GF_%Ff2nCmb#NhP)z-C$;>IB{XU?k z^XsS}iJ=otfTl+P+4)q3_cq{jO)Z?~Hk^p|$rh8GE2+0v#)BQkYeG!#Umv>#j{gHt znS2{X^O01mts~hfgTihwAr4SO`JE*3Wnw5AjiYD28reNow@NzvFqMx!Vx4s$uWsy1 zP`V%#+v!^i~u4F$KNzI>^?|IgWOWMH>@TQjq1`9tGS~IoF^jt90pT zG(dwIia;5v;!s2qWMPPW9&847-8zl#b#Y&tz9R(_I!cvcEb>lNQf%kFU2WVONys7< z3PIJrI4a+}9tvbgli(OX3%a^29Q}klN9i32sso4xMy;lzqpL#cGP%elp81cpX(&q1s9499nv;<*=1(no8Lf7olqqK z<1WwZ!Z#B4e)XD}-R5OGs;F_aBse^%6e0j=5s27u_itw^{ zWU5ay+K-Dn-N|Hs1S&z}T-2FcaYjV#DY4qu2ymqjl4Q$18xvmpKd5~g|0B8}4ryNo zw}~IOXGMSpQ8_I|{nhEd%(`j+*NNOeSTb!*WRdd*ARP}zGetrNaTGniDC~>cJrxK~ z_U;R$PL$UYCK`1pkz2;*)1DBu%Y%i?UZsuf9&PWIp{hKsqGh&IbzLG90p@~dfvGc7 zNCq>uiIZmA@MElnfM{2KG-D(_fMVN~;!Crw&U17Zqs;K-`!eumm7FBk76E}0m;iLB zC01pV3^l7{5}$>Uy;7l-EpP(kVBmLM6xAIT&6PMMakpg_)~{~r1~9d_Zni`m+L0H# z5RtBt5JiogAl{$jp&w)+8iP<zm_=AVyDLTH7uXg10~0nx z_FM*}T)Hf_E-;6O-q&Gq?Si3_eq|&~e!Jj#xa8pKg(IQFJ%e+L_ntCHKb&bS14Wk> zit&pwCiLV+JY3pxBet7%r1tHRN03q{uUgW(lJfh`%~ z3Y@J#2!Maz!GuO2`-T)3aos1ce4!+V5XXuEn-Oh`>^1o_TU~wtQpx_6p$YZUbeOhb+Wi7%i2m0)iaGlPj+cLww(Gx@vsK`IkIMIs?c)yZ`vFOAUYD#c$G z=fF3`%q7-=Iir|YNzgc4yKW)qszSVq9H_C0kGj^yP-Xo3b!ENH;2f!ca<(>xs^@F9 zYQfXnUvyx()L%PPH(Vok;ZtW73i2O!G;tZV&8d6s{{bYDIP-Mrn zlZSk%6npnKr2u2!UVdsG@64Y`ZRF23d?6_tivBG=v0^+qfB0gB2Xdlmr7h;(^h94k zAWQ(^;?ZY8WvDkN!Y9Q?S9+M3XGwPidXE(honrpe9+KR3Jk8V7YvewQ&yE&}yPmFq_r%|xcEPvBoK^i( z)9SZxkrKCKn7$R1=@yygW}hWsTHjAnme%zP*S^>m3($#^TDM`bxN%jP#(oNaoeFBv zJEz%-e6%;0rc7s7iSclkm>}+rS@#Maq9m<(S&ad$R-$Qz!Zd(d#p_}kNdXkk6vp@- zB`Io~2R^?`)szFybAb8OSS`FXxA~cc&Cft05k&eBkr1JXL=x#sL`sAvA|nz-L{3CO z1QCfQq9mdsf{Cb)G(V#m5~um?dpUD{r{w=eh~r1j?$dlYwZlbn=J~8tu5)T7mDcS8 znDN4tKiz(@e8~7Tfe?WFp=F=r8_`^(84Zh>n$!6LdV)}7>?b_>qXeoKtwLkO43P!bVGqFd zC?glQ$ZQ#U4hq1M?N<$0f+Oui~1VWD$JqZGX+ z11$~fSW6AR4K9WnoSka91~U{l@U>CR3)%1Gz-N%>fFzV!Rv#E;TMFRo+DrT`#rGhH z&nxf{o#HD)$LDhNo5DEVSWLJpZg?fT-mO~ciATreo%e3?nzH=bva~vW1iUi**)D{( zR`83Yc3(a|CeUBw35v!%>lA@3c^isK=PwhAos|q^jRuK1K=1GT2R(p(k(rD?lefOZ zE=1^yw8Bi3RS(MCX{x4efcr#>UWn-^EJu+vIo>1LDQ8oXqp2ksC{EpIgI=k{S9Z$5 z_iGwJxOQ9elhFmGV69>O*-xCLF*4FM{!wS!k{r zB_tRp@}=yUA$bcut$3b`T1wJQsT#rqYe3;fdYtDlN!F{K)w=CPolkdvN4N;qQsKBi z|08Ts!B5#eQq)1C6lLKc5SUo9U~mN9p;}i$4*HwRLCDaRtkvjVP6i=-OoQSI;AQ>{ z|2R#tOqAu;bpcDmKcq0~j*C)fJRm7cUN(W>h@x$CrD(piu#@jH-a?zwaVSQM6>91{ zos3q>Y+IwO7d<~%zr_EDB}YBlO5NK<05;GKL-423#OX%$*HKD(;QSKXAr*YTStkrs zLo*#?oFiOeJs*)d)hQATR0=h^v(FE4_*A~a)Iptuj}2dp+U5KX^jkc=m;VU8myDv6 z-6yEQxF`xvrS4H`DjK~R*V#?`3Mp!Vt6fU1w#@^;4{Nk4xHch~9!RNd4mGvH>&f0rhw{%RJ0Sp54yewmikDBO^n7RL#a|~7&&mcFD$PF9h8lt6t4rH>Epp3 z4J~r>rMG`VtLPen5J-r@#+$6fLXD)E?n%vOA!NfGn;4}q8kOD3Wp%9Aj^41qr{KG~ zH45FMvFMD-86V-XC=QfxpgD$HoXa%vO*dDOQ^}ftd}t|S=`6ON%(+4en}@Gz_yAPe z^TM4LhGo?w@trt6TANPSZZEJkMo=#>4nur+uZ6M0CE?F3OlIGU>Iw)`wP7}u@x$Ta zx>P2?6t5|1>qAg?<5<^hpjwMYN-YDN*+4hWKo`T7h8G()ks-ek-ki!9Iz+AO2>>^N z$qm=XjsVDoZ$=1W9zHOx=0HI_cg{1Qz$)KKSEk;Sq4^+3mK} zi|FEXJ^Ql8YJ-O22&03HeM3BPPe(9KZY&6>G?pt5k4a;gj!!X4lj9o&#_!pQCO8MZ zF$^QpEnsKA!zdgDbI@D;xsH0~1gv5d=s$KU$S>pcn6RhUrBFjQx9*6_n+FL9ZVVCj z?(tMOHFaqN3GMtYP47jAtUBzWT{{)1nzfyVmH{P)Ao@tkWoXgS3AXIM9Cg@pn)R!6 z0K3;q(L94^8Mm3k+4*>77^O3D9i2HKfcliuyWBy57sl~}QDv-yV^7K9FlunOmc2h1 zs>#sb13|+xnOr9gpEWFTQ3gY0Hp#0FI#8FkFo1UGmBeQa3E^`x{^a$ahP%-unuQu; zJzABTlxu+O77k#V1?Iv`#2tO!5BZ@(Qxj5gkLXI zf9pw&Mq{6f#Jdc2>qZ`Sjg#TF0RLX|yoN!H%vKLfWtqyL*%}f~jnYjDanA*$A#0ZI zCp|Ap%kqbII-$gMh7y`13P10>%}L_PVg0p9so${#f|AUp>k53$^@hJYM001*{NW83 z*bl;P8)NM_8johgcmB?Xqj@|2JjFP!_lnU5De(80f!hrDL3l6@$K0a7+{%8(yBlLiD+u&j&tkgO%N30Y$>uVq6=Zoyq8iF3yMh zNIb=p2~X~%YS0T&1ng|k(f?Pt=PLmPDqy^w{r|PG6y?VH%BxCZ^96V0hFiLm^l5aU>c$uBicXW(IZxs-xeF6jV40A~xXnvn6MSF4xzLf^e8R*}Q zR5I~6cU0u6*g1heQe3IXD!kQQiD%Aotpy6r`;z>{%TGkLj8Od4JgfTGw%Buj8Yiy_7C_4PEz~osrd*ON< zoeDSeN%UEryI|RL=$L;-NSBc9i4DceR|DtQ5^62Z4%@OAmaeivgdQzGpM-~GG3L0} z2N+%GjK)7ijt|6I$_UmNUJ7%aB>=g@E3+6Yi6zIgm|S=|{BstQ2*q%0HdCS5DS!Zd zoMcp2BcScmD74*&O2f0WnOOA9aH9;T7^pAYhh@6H;ceNBMV&tqI~!mD8X8kk0D#$Y z91RO!%O2YYJ-^pKsz93XTn95jnKKW!YsTsTwV2uoLf$vRzdM*_ zcp^OB$@GVN!`qxphLzmToRSDyHoXiy!^i@DzPSAXgcjT0K9*dp)HoNC7#Fo`D7rAM z^Hpy4njA(a+dCFTMoWBUxH@dkVe09+`r8-WDGX`}ugD?c_UUkU4pW>}8xjVB#vj;@ zxHdDmxrr*n>&JL5kxZ|Jjm`DEqnb!Rl3G=G(T#twP(NGlfZh<1Pr^NOx=m;XvrY&4 zgGHIfO$472&ai*@Xf9J8bEk{W*5omnaB{dXk13U3810hu*);~u;B~ha zZpdSZutZ{r#1hdG;fTZ$i6@dkM0aFEo<24w{>8a8dw6Yx7O>L(P5`v=nGC&(kH;iu#O&Oh;DKHW2*;01ZJAK)c z)5y<=&EJ3a<%MuTtKPtTc|r1b)o+`^(|T3EZKD3JI`&cc#a`91j~-IJwk&*;s5GxF zi~ReqR)*u+^hwOh;}2<$eH*T9>uvAbhg2svgcrBzE$oR6efvAlmj1tSIl3`?rcH0v z9NkEC|9{9fgk_WU$#6qBW3t{9vHPoV^<+KI?EdPZu|0e{ylgUQ>9mVV~izCUj5zRx^%a3pW(>feIRo^#i=MQrBKRYGFc6pee zO1kpYtzCbq2T#>28B%+wfi>aQsrnRV&DGrxsh$k)B`WsGje8za9X}s_ajKqO(@q}n z`1yzR_kRzY+Vx57{ofzfotPgU-L5x8koURm`jkF%uHFx;rt3BT`@sJ?9LfIjR`2pA znuq@Rm#%;RX&wH(U9XBEwTCQ@?fv;Li@a%iWell3Z1L-VEvEcw@$17DU+??nuQvAe zT9Dd9ZEU&rUyHAa1xsoVS$wr3temblGGDFu^&#EjaLIJN2`&z|PS=|wj?D|NpRP~t zeea&GPl21lx2Nlq$eUsY(Y|>woHIku$Gmy)0cq!NOaJhp@MA>D9zOK2l70Wkjv4xs G>i+>%#3v;H delta 12108 zcmZX43s@6J*Z&M;F&mO08L|smAYlUu36MY*2oNEXL?Rar3JL;(B7((gs;GFWil}(M z)m5XkqS$&ttG!sYwQ9w;YSp*Cy;!SlwOVVvv{h@{ueZ1KKiK#AKi~KIJadxWoU=RG znKS47&TqD-W8S@vc~L`Mr8WjIj~39(`prSf%(-H67Q#pYcp$`cE6d+@E6f z;qvk>;M-Hy*z0oGHBtIK36c_X;(0R~8mUcJ zy56ijMzz6S=B_yrU6%3RNc=X`C70OD3aO9?8Wn&Z$fpU(H1#8J_h@=$z$pX53ZSW} zQbqs`?~nofx{QBnSfG58*n+c?YYSenweq&rzAFas1M@ zBNU_U5?>2XZjJS*M!R~iX^6EM5DJNqk1+lgAw+==1sImds~!A6 z)Twc;MX$(FQyglJchMP3`BikDu``uYN?hadHX7!_ct>4|&;j7dDieTFnyZGSE{#0` z2txooWpvvh0&e478iv{KTIHa~4bXcsNr@~MZHpQfOMU8y7dJ4mIV+0T3v#K)GZdA} z3Jug=XH0CT3DAqJ0K(nq-3)e25u0c5N5j%qKv%2|0Y%-wnhD4fDtblyfyseeL@OH= zvNith+s-Ls3CqA!VgtK5E2uxNP61|iL8w=haj`PN2@F?3v36k@)rss1y0iQ^a32ju zC!|%6q~3{ujC}9j1jk;{9+96m+5$@0=i^XN1@PH-g9&L6MmY>Jq*ke3%l{QEGT0kp zh2Q$T)ErLYiJI1q5cfwU4AcpYvw^??x=jf0bHR7*ewU5H5xY(eY?Mk` zP$RJ64v@yL2kBVpz}9R^E2&B6AEHB|CNcwV5=$cU;5>0{>SEzty2|6#vXco1ewjzN= z4Jztud2v$Bgj;2NI{Myn!^-}liLI}2Yw=H*mXF2BrB-O3*bC&HZZ)4C+7g0dw|4Jfup$z&47<$jqx~F z%{+w;Sn#I;KB}O$P)UWQc-|0LUYY<96(tqS9yUxo8I_RyT-D#vi_S&4Qse%3$V)vF z-CdpB#s+j>2MY7oI&aw7<20(;!v=mH@fm%w75pg!a8xM))@76 zg^&yQ*+qjx`7=`nqaYenL*zoe$&W6Ix3xyZ<^qB4mIzZ(o=h7MqjbgW{@ULCwHx=> z3g?5^<;hL2F1uo|=r4+MbTN@VwY@#H6MAafPN9dU(l=_swj0wL?xX(VYq}CB5n=2M zcte~OTMXO8Q?Xm1Qmp6oc@6VX3zDYQB%@(c_Iov2#$I`c{j>t#S5p*TF-=n9!LMl+ z+%MQ@1UMvO@epwQML^wh>>5rF#1Gg|2{wz2e!eVtHWw$C`0LRDajQNu zB=l)efqxZ`>9ug8_=Vo3ZXZI$JI6x73(#UjWukJV9VOc05<}n$U~Zlv*2NVD zWB7vD8n1=7#m;!GWX>ekBc6!g3&X^T1~a@XZZvej(W1>56Yi@QrU2D8rxd{J4!0DB zm!^t!Micu@HMhun3rk7hV1uHhwp<^Tr_U9)8sCB&#L9#{uu_zmOpq5XraV|9`b`eF zPwX}=hTUSSxdaXq=bJ0wJK|+Ck5QYml46ySH-5vpi6#H^X&sC5_#Afw^kU{muk)bLxr$6rEQ6D2cB=LJ8vHc;KHtL=u;_)v6qr% zv%P>`)A27rTj&s1p_Z5F{|KX_T0#KKu369$1{%KTOYFg6iM9lO1E@9*ND$ruXp^)t z3DiBsk2u&Olr{T^?ej72#@hZTC2p{faix^NfP1!J(e=-%dx-%Rg_9fR<_*< z?CU~`%z%V`)i5${f&tKVlfiRV8hU zX*a4=?rlKW0x6^RN6y(yGCDZ94wPR3)RdUJDy1+HsF-5X>BZh+RuwqLJ(EBvlTK^Zt-$CfTsz2DdkGEU>*h_j!uu~Jaaaf4m) z%b%sRo8n;#HG^A%GaTj?VL7mxfW%h17K;~4yYH=(zUn3R9z=+prHnt9Z1t3{gx&7cs%K_<%h}i7&MM0 zsq0!cNJAjSydR|L$1g*>M>_BzgT8&ILCR;a?=$)pv)an#X3xYl=+9ovnmp*>I{ldb z{3`c7;C&jRxNvHstL`qz(j<|POo#s*q4+m2p`ymNPYe=;0)M6hB{=ptI{zAY28U$& zx0-~xfZ67r1=!pOJuYI^MfekOOG=z#uR)q5)F5{y$kqGB_frh5m1QGB>5uM*0H&BA z_KpvLkN~`OU^amSZ+CY(){WLLj|BCOKeCa9m6YT<4kx08Y4|Tq`%4B3@NsXb#ukFH z8Xz11JGOvA?i0@WOb8XDEtE?JH~BAw2_t~}TCiIMdW~Arj7II*AXF$t@2Jc#?>L6< z87;!6eE`>HhhFv_OX1ZKxehx`YrP~Yq~i8)tc%*w=FbuXsd2*(_xSHgGI+jO>dW=u zO;P4%p&uke0<+OH8Qcs!jg#~@j`J5lwf?IX!zL;mdt-#L00yWufd#6w#+4mS|GaUB zog~N4T2QgkVB%Jb$QDtR8U@oFA2Q2FhYEKdPjdtJhJw{wRhSG0^%82Y75Z?vD*aq7sHn7CtN zMS3yE$AMh-papuZZkgQ&CV9DMD$FCu)Z7%0rh7vZ-284XadF*n+#)J6qPahl%1@}F zzW4ldq*>2DSGbcaPv#zGbM2xhgJ%<;1*1?+1Q=!6g3`LtB$Fh?Q5o3j3&`|-%q4+=D29>l!N5??ES?A#ZpQp zEi4eY8G4f9ysy*E9YELx`1!&hajiWK=8EsyQ;g?r_T51+fj_~;ZL@5kCj&1!Gao00 zVucJvnNVgZ)`g<|q9#*24oxcD!z~I&<3icBY0yX4ad`>@=PnBZa=bgdYG!I<1|${F zsm!K<&MQm!V9jhkv2J_AF5(en!H}p#yv1w)Q_+EnOM{9V>i{Wc+Y6hogg55&9pcLx zheLcmGshqt5#|6iO_~2Q8n3Xv^P7}Y*PJo;!cg%!tvE9LkSqil?vPd0>-wN*;j#i2X9eVv>53B_BcNcMgt~1Wy zDChJBp%G9~z-#0Z`F@~~Z5MxYXv4%8ynBFlS0ZD_t7Q5IqOm_oH)1jQ6vWB>waQyB z(!7ua`=Co6n8^M^?Cd`bisHlmNv%7^;`xl!W6D+hCGnc^hdg?JY?sPt(Zhj71EUGG z0GE*(a+Mb~M()T*JKX$xEh<28o;wBPUvrKLffM}?ry>{Qn?t|$r*m5{$uHK;3vuhh z!U%8FB7D(F5d24H7WOm_=>TT!j_>Ox8m2XblEqCz3@>$v9XYywuZ(n-AtVz|ht9Un8 zfXBsX=XBMho9G%%#UK?!>XRPYc}3jr%*~;5OhJST@=OdNpVi|?8Ie>YBZnoq9#eIR zrN&(Hw>n&*j`rK?NzK_uJ4H(RtVP`bvjl{|Xt8L39qtm_25_V}eqlg@1bFBZ2Tl#x z5~4nNx*zk?N%+4hz^(%=-eh?+L%Y*}@C`8dfg44}Fg6ln7YJrRu>`ChFSWtB;=@u6 zbR3YC&Bv|RmPqasF#ck%3;f-hUEtJe=de%}qmo_-f*Eg6`OR(p3MG`SB_)K4Ymo*q zaSYvf3@~vdT0cu}>-GbN_0m9?0H|M_xBJE1TV4r-p?xDhdkzqi0kzC|fkuBC(LsLf zpbD~7h=!SSZy=!7i;~9HVDvV9puKU7td+`;7t~Z2nA7ThZgloB3-{k>=H6&FAEXxM zZ#5j^HwjITQ-vkJSs=`d0@xFR;E!mN4BZRG4?;;-TQN2agq&*^^N@!QYULWCJ&rd<288VX8VJf3Hju5|| z;;V`p!@ltXS`dyVTBT|ACVX^}#3rBhxPrr&l6dvY3g z0OUI^_4#t?{@U_fH08Ko9&a}WyR7JNq|iXfG%5GXA>eliPuZgYM8-UYFc6fFgYNub zA-!*+@+bSeC4@_*#v_B=FxmS;8>uv8+|!Z%*EDr*cDeY@RGxlEiNdm7spvWJ?$pFq zla6A2qnE(Vj$E*l@E1GHYPZefDe)P)UB`rPCC>ff-A5)|0@mO#_fG)L{G58v`Ogdz z1Ma8^D~eQ!LuXR++>apLAxs5D;UIrCekqZE=zn7v@u2({aT-=_fbHI^iG+pyd)igA zgmU1Rt40Zffm>WRZ4!)LH(i96#P!oX>QP?oZF9y0gWXpK8Wy5?S*54NZ>J|jvPTI- zq7RVb$~F(D($w#0>J0uz^nsYzvI=&IZ?>cb{W(Vbz9k|2@|k|j%QOCWQ26dZ!iFlM08w`O|>gyI3Ad!_?nLEqq^uvmFDp^wqCO5~8S*|ci{)$Kw;YM;_d zTwL>brIhn&A~nyhx+)P00Q0eHv2kvy5D#W;69><;;-^^y!OM>9FvcJ}L;7fui!0|@ z>}TjMMw#Nt^`zhzRB}>Cn*{`l{)`4%^i-tEDj8~0$s`^#BYUe z&#b-_r6l39#LT+G8+!muEUcLa#N^iO$c24>jieW9+_m%IcrNI@4Qp@1mZ!wItx>R6 z+}zqfcBu}z%qTj{$cHgK%~jz39ME_NT;PD?d=RzSsTO}~%_p(dF~23YELN}&?5%*L z-LHU_3J``Om609-hI4%$?k}78!F*TTFKLLzz~6i;}uv;bq5P^ST zxUph-TUz913)*aHoD3)t?F%I_gqlURDeMdo=d?8_E<4UjJ_#p@DL>tDPrTo@4d)JO z^nhNjuWjc5^_f=`w=LKtDe_$^5_1;Lm7FR4=8Sl3pE*iR8r_C7C58 z%bi}l3obd}Us4Grj#->TJik;g{dl&a1Qc9aDE_=OrFBFWMfYx&{Qq`@93KiI&3`Cp zhkmEApW(0GBa+!iBpoTRd_|v6Z3~*cEeKUB^f#TocZ%=dDdz4JQ$O`?fcrPV#Th}A z*zzT!;(8B({fB@sGHCHXfROT^g+a~T`FT$7TwtFI?)w1c>wQanVVPOtbs4?l*=1=Z zoyy)Rz&-^CKJY(xFyRx(dLqq10`$9|U00G#g(HQ4O$xJy^qD*^SXaLesd%rli{PyM znRPo!r9vjlpe8tRU9niaTwABkmgM@lm&V}-<+HozH~Ii6z^Nx2KisfSUz| zJKB9rxfVFDQ`A#13EA=_o?QAj0-;gZEGDYsY+qSNyVpqyY9KAG_SBT-wZrJ% zFHNNF{8HpsCcuxx!j;~zVN=j}B~ao&lsI{y{#CJaWmLrS#{8QET`Q#fSs;}BX9AE( z{9vWUwrP(l_rfedPf--Po~KpZ-0@IFLe~BHVZe;M1LR~mdXQrP%In}_3Did4iPlvd z94QvA8VF0o)vMN!{Rr*qEK*5StR4*?iHBEL7XhH05&dBHRkor?&8~;lSe?u zY|xPa&PicD<6hNId~Izs-P+QCN!QC4iNw@p@wgdEYMb&dIQn{5q$J5iQ>Kce)@ozwWd!AN0N-&b=Ql41XNWI$f*K4)KR|``|iptH_XP zJ}4#;EPq4Hj~MVYDM6cEa96jgF|#j@F;@*0!pjSD*DuLk4~Ya32__;TLJ2|t{>UOgmAz2FBq^WJXB|4t-5hdW%O@UkbF@ z8cCUDhntxo%;?Jt{D;cwA>)$-!U^(+SKY$jgmJV9RYXj8w7r5_30=m1$)liR7`|)+ zYS7U!miM^oYD0hHE@iE|%Jut@b*`^*K^R4o9YVMg4Iw-h$}_)Cwp`Bo%k;b-J{Tm{xdbX{;YqN}06j7Mr!GkPL>9#AxOb5cIk*r$cMJ|=sP$0a0j zv~@RxqIYCqV%-)txr%=YE{AHInaF;tfGLWb_~xjl zNch=^O`74AJXZsCgO?rvRN59tg0X=}; z&rHQ%$)}uWmm_pLDL)n2Ye9+As%m@*a9>K%D-rp_P%BN7Ejm(7ayBK|E?S*}wBZd_ z=$4v2C8x6Z{*40w2*))M<_)KwhB=`AY!-Vpu`t1K6ZqL_+72F0pex6 zMv`drT(meG#l{#W@x^TYknAO{DR_y4T1kq=&~OM}-w5)z(BoW(`s(m1ZR?KWw&!{t z5q5!1F3tF<{W4Bg!K>_kDOy0IL}mUU;G0ymcyJKzR&^AS9s8zXz^{KPUZd8&84vvU zm>OyG;1&KX|13?hk)h_>H!^VOpteZLWc@k_l=0)sIpB5#8!t=k(STGf+35s*BPnfN zEJaJD`E9Lym*F<*O2$(nG*}T%U7+KUD6{SiwS40G!SW6MAd+mtXv(x_3jnO6>xSU# zVI&%(9IaGI4{R^89#+8*nzX{8aA=|<4D*Gntm`tFVO|lVrxK{qZNWc8;rDVCM(SPo z%!uV^mYm;-{)(pe@t4s@@dznfdrne=ad9Y|PCcT+sW9}9sJ0&3lTT5L9kZm=2J0dK zys%26f}3OF>4B8WniWp1aby5k0O#^wx%QG_wJKnEOd{Hg;D=ogfw`7u*tjq}WGGck z4JR8WzX+?#0A-~p#WCP9UK{LE(<0X{z56rTKvz}n;K=geu;CWl&rDU)OwZ(|a}cr} z92*y^Fc_3Q%2hRNz73r;!=><_+RX~BKN9^@WseSWnH8@WaiA%JTb0E$@{PCFk$uCe z_dhk4u<|t487ML{@hn@3A4KtCnq<0qN1nAIhl((l?&RWj7<`8;w!t+6OdX+BYP1wll)J4s~1e7<7b<6{*&8Sjp z9$-%g+VOh25KgSyU)M>-{+qyzL`JXNsBz2zuoFzJyD@eoKuLk_L?$Ytx>zyDUJV=O zJBI^S#d4oTpd=-h1U^q>bgiLG%>}|7Y0t(2vvXlE{ya>03Y^-s!KMFAW82%|tK%tWVixCOP$Nm$M(Oz2M=RnM>D^q5ekH>6Na7Pa<>%AE}f zK5p<69&TPDoSwKQ(3QlfTdP>>8E76*^aezirJSHaZ%?qM_oEKE&ahsU7GUS|QnWh*zCu8oG9^s6f!lwMz+PKvHW8R#WVz7KWllq6&g@#FI{ z{@oitk8q;NG%HaXYSBuCVTJ1``%$>sQ|+3VRD-uCYTrsez6c0EL8Nf9v9j&&0=_@m z&eXk$1_xFpGgg=oIGW51%X*K23(@xqJj;mQFTlNdsJamNsP72(6~_vk(*C= zY2&z&xQw&vz-FgpqM6$j$hR=Dt-re^LOt1K>XB1Lz*f3{I4bOK$<`(y|1N+AsH;u` z^cA9;P`~&(m#UJX8-AwaF{~x~xw&paz2}F2?@A0qW0!{Dm-RIrm4_VTWO%xd|5wwZ zxlY0W zdvkK)1C}6FQVMl_29G*UdVBobS=2tF?ql|1fUzKcv7mN4NU01koX6KM3q2TY z{{>q58>06;xG^1-u;i^2#a7mk$;}R&wlZ;&&Sdu2!1q=rKF%_nZ|mC3onVPCOpAhw zgL{yl1j$Ljw&QL6wn=1b#2QFXVPaLMJaxs@MNMW*vJwJ-K;KThT zHDnKE67W#+Y!_NF_{PZ=MVz1rB6%G&+TP^bP>hTnSH!m|X25mof*O-Qtsd!zqgW1P zwB44}XJFL~c{+JHK|3+Nv$b~|2`3tu5g6;G9scf6!k=;crXced*2qIP-1@67eyzW{0W zzX0_*{W9tvf_kI-teO7C6Er6jkYiW4a31zmtLQD-l`S+H*n3L6NeZ2o2)tM0SO$f` z04nH!{xhZP8S@#g+<}Y=bRlprm5FZ6Ew}-N?mjGo+C$OL(db}u*F3X<;5xT?w0Rso zi;g0Y9}g_275jF|2@O-v?H$97%EWfnt&#R|P`eqCw>9+j!i^E|M*imnE00swjS|*F z@+%iUrsx5nGKK5F`Y`~uu0e%s&JI)T)xM+DZc#xmT2MI{oz5c!`veB!Ii{i-(DVUS zjP_>|bS$kmrJ&y$sCW{*zEhDG$1Vu`;leUSTK?^xGCX&#V>1A0w^YEBk!KCi69z%F zHt?a1iA>w-;y*x(L#tXL8b1+8SQ6Zx*ig871F+wdP@8dPDpfft*cTtt06sMI3ShMW zP=m}^X@v;+^U#-p_%y~8bz*?QfzGPEL*)4NNOK7RDI<$vmc0ldS73Y^V z&jl`|F|lxC;6WNw>ewp)AAOum(X}9Gy&8&kdyvmcFbaKR-+JZamuJx72<>hS@e>pn zVUXbjJ@vKokW4EJJe$s#6Z0x@b{)(^LnBJ_05F-4qlzJ#BD6NzZw0TX5Y~2Q3Xod- z>1bBqLOP>EV~Z@Z=C^~!l3KarV3D zDGhl1AQx==+_k5W7j*qxf7U*&KoNne3}%ATu?V-Q$7%t!oZ1ci?vsIw8B7yA6>!?g z+g`5*p0zV6meDzAS}d5@c@VgUlV!1|u;mX3&DMRv<}9pK+m`?|C$xJgx-_HhZLTfw zsh!cvULA`0caCvgA_e6{HZsfgo@x^PRAOcRCrnfk^29UxgLpeEpfIoC|6*tutXIbd0; zp_8CeLPQP-tjS`Ah5qcEl)(>kei7_E9QZbiNrf$e=xnA~erdEr65Kro&EmE9mQ2lN zh_FN=h(r?65aEbK5s4-eLqvOcYPK#ilj>YZvnK*;C+pbk_cjMbGja_BG?8@O_y+&9 z(W$96CWt*(%-%oUkKOhUr~xc7%CQ5Wi6(Ey0;Z2>H{J<+Ia$YP7rfVxy)=WE-1#E- zzfF!`3`9)P>6zmfCI79uxHaJGQ(fFj{kLk}l|Xx+YTcD5RO?p*C>0fsm;>W6*-T0qay9&n&nr{cpq7_X6Xll6$Pa_k`+UV8>Kl0`u_O4e-CO zx*xbf?#td^-1&rR`^7-aG@XIne(_1&#YKUUL>F;!(bgxfyMJJ7;KOM;CeS@ir(($2 z6BgeD{xwaf<@f&}XMfr)A@ljo@f}YXeRjM1@of@>za3=LVkx`2Z}N%a3FAB0_Wa%F z=;=BoOU|CKxVRE6Jf({*v|>E0)G>lX*? zEjoP=`L1fwB?NzT?We$r7M(h%`Js3?z=@f|96eBuO6N|OZRU0 F{{y}`LD~QS From 1ab7a3015776750b9237dd2db49cb7051d920a57 Mon Sep 17 00:00:00 2001 From: Daniel Wigh Date: Sat, 20 Jan 2024 22:15:12 +0000 Subject: [PATCH 29/29] fix strict mypy --- orderly/extract/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orderly/extract/extractor.py b/orderly/extract/extractor.py index a205e83c..ba8e4179 100644 --- a/orderly/extract/extractor.py +++ b/orderly/extract/extractor.py @@ -1028,7 +1028,7 @@ def contains_transition_metal(agents: AGENTS) -> bool: return True return False - def contains_charcoal(procedure_details) -> bool: + def contains_charcoal(procedure_details: str) -> bool: if "charcoal" in procedure_details.lower(): return True return False