
Improve the cache and make cache invalidation easier and more robust #987

Merged
52 commits merged on Feb 16, 2023

Changes from all commits (52 commits)
7eaef75
intermediate table lookup
RobinL Jan 21, 2023
ec570aa
estimate u works
RobinL Jan 21, 2023
2b62122
update
RobinL Jan 21, 2023
1a36d19
add cachedictwithlogging
RobinL Jan 22, 2023
0bb7b86
analyse blocking
RobinL Jan 22, 2023
d41c4b7
invalidate cache
RobinL Jan 22, 2023
439cff1
add register table methods
RobinL Jan 24, 2023
b17c225
progress with term frequencies
RobinL Jan 24, 2023
3a1473f
em training etc
RobinL Jan 27, 2023
ae78e7f
Adjust caching for our concat tables
ThomasHepworth Feb 2, 2023
22bb2c8
Add some quick and dirty tests to quickly check caching and materiali…
ThomasHepworth Feb 2, 2023
b2b9663
Add an additional check on `_cache_uuid`
ThomasHepworth Feb 2, 2023
9cdb129
lint with black
ThomasHepworth Feb 2, 2023
6115040
Revert "lint with black"
RobinL Feb 2, 2023
3be11ac
Merge pull request #1013 from moj-analytical-services/caching_adjustm…
RobinL Feb 2, 2023
108747f
fix merge conflicts
RobinL Feb 2, 2023
3f91ed4
lint with black
RobinL Feb 2, 2023
4a59025
Merge branch 'master' into 961-attempt-2
RobinL Feb 3, 2023
bab2a17
ensure splinkdataframes are copied when returned from cache
RobinL Feb 3, 2023
a3af262
Fix correctness of convergence test
RobinL Feb 3, 2023
bb518aa
revert lock to pyspark 3.2.1 so tests pass
RobinL Feb 4, 2023
cd1de8a
Refactor _initialise_df_concat to return tf table if exists
RobinL Feb 4, 2023
5a90107
_initialise_df_concat optionally returns list
RobinL Feb 4, 2023
94faf31
Merge pull request #1023 from moj-analytical-services/_initialise_df_…
RobinL Feb 4, 2023
091bb3a
Register table accepts string tablename, enable labels registration
RobinL Feb 4, 2023
897db3c
fix linting errors in connected components
RobinL Feb 4, 2023
954276c
Move left and right table logic into blocking
RobinL Feb 4, 2023
2df6f0a
refactor tf tables to work with compare two records
RobinL Feb 4, 2023
04b6345
fix bug in register_table_predict
RobinL Feb 4, 2023
221b749
ensure real time linking tables have uuids
RobinL Feb 4, 2023
7a5ebf8
df concat and df concat with tf return SplinkDataframe or None
RobinL Feb 7, 2023
1efc9a4
tests now pass
RobinL Feb 7, 2023
7c96cd0
add docstring
RobinL Feb 7, 2023
7c39564
Merge pull request #1033 from moj-analytical-services/simplify__initi…
RobinL Feb 7, 2023
d495124
add another (currently failing) chart call to test conditional branch
ADBond Feb 13, 2023
5755965
correct wrong variable use
ADBond Feb 13, 2023
0c03f10
let missingness function use inbuilt caching code rather than re-impl…
ADBond Feb 13, 2023
66db0fc
test some linker methods do not make db calls when cache options avai…
ADBond Feb 13, 2023
89d2f10
simple tests of invalidate_cache
ADBond Feb 14, 2023
ac2acd5
basic test of caching with two linkers in play
ADBond Feb 15, 2023
71a295e
adjust cache so that it only adds when type check passes, and test of…
ADBond Feb 15, 2023
952134d
lint with black
ADBond Feb 15, 2023
dd4c629
test register tf tables cache functions
ADBond Feb 15, 2023
315ed29
Merge branch 'cache-tests' of https://github.com/moj-analytical-servi…
ADBond Feb 15, 2023
695f599
lint with black
ADBond Feb 15, 2023
c967093
replace uuid with uid
RobinL Feb 16, 2023
93e7220
make uid more robust to no settings obj
RobinL Feb 16, 2023
183ee80
Merge pull request #1050 from moj-analytical-services/cache-tests
RobinL Feb 16, 2023
0677585
Merge branch '961-attempt-2' of github.com:moj-analytical-services/sp…
RobinL Feb 16, 2023
72f3a44
create uid if not provided
RobinL Feb 16, 2023
ca70230
fix convergence test
RobinL Feb 16, 2023
7ebfa65
remove superfluous argument from docstring
RobinL Feb 16, 2023
28 changes: 14 additions & 14 deletions poetry.lock

Some generated files are not rendered by default.

11 changes: 9 additions & 2 deletions splink/accuracy.py
@@ -153,6 +153,9 @@ def truth_space_table_from_labels_table(
linker, labels_tablename, threshold_actual=0.5, match_weight_round_to_nearest=None
):

# Read from the cache or generate
concat_with_tf = linker._initialise_df_concat_with_tf()

sqls = predictions_from_sample_of_pairwise_labels_sql(linker, labels_tablename)

for sql in sqls:
@@ -166,7 +169,7 @@ def truth_space_table_from_labels_table(
for sql in sqls:
linker._enqueue_sql(sql["sql"], sql["output_table_name"])

df_truth_space_table = linker._execute_sql_pipeline()
df_truth_space_table = linker._execute_sql_pipeline([concat_with_tf])

return df_truth_space_table

@@ -252,6 +255,10 @@ def prediction_errors_from_labels_table(
include_false_negatives=True,
threshold=0.5,
):

# Read from the cache or generate
nodes_with_tf = linker._initialise_df_concat_with_tf()

sqls = predictions_from_sample_of_pairwise_labels_sql(linker, labels_tablename)

for sql in sqls:
Expand Down Expand Up @@ -290,7 +297,7 @@ def prediction_errors_from_labels_table(

linker._enqueue_sql(sql, "__splink__labels_with_fp_fn_status")

return linker._execute_sql_pipeline()
return linker._execute_sql_pipeline([nodes_with_tf])


def _predict_from_label_column_sql(linker, label_colname):
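The accuracy functions above now follow a read-from-the-cache-or-generate pattern: `linker._initialise_df_concat_with_tf()` returns the concatenated-inputs-with-term-frequencies table (reusing a cached copy where one exists), and the resulting dataframe is passed to `_execute_sql_pipeline` as an input table. A minimal sketch of that pattern, using a plain dict as the cache; the class and names below are illustrative, not splink's actual internals:

```python
# Minimal sketch of "read from the cache or generate", assuming a dict-like
# cache keyed on templated table name. Illustrative only; not splink's API.
class TableCache:
    def __init__(self):
        self._cache = {}  # templated_name -> physical table handle

    def get_or_generate(self, templated_name, generate_fn):
        """Return the cached table if present, otherwise generate and cache it."""
        if templated_name in self._cache:
            print(f"Cache hit: {templated_name}")
            return self._cache[templated_name]
        print(f"Cache miss: {templated_name} - generating")
        table = generate_fn()
        self._cache[templated_name] = table
        return table

    def invalidate(self):
        """Drop every cached entry so the next access regenerates the table."""
        self._cache.clear()


cache = TableCache()
# The first call generates and caches; the second is a cache hit
concat_with_tf = cache.get_or_generate("__splink__df_concat_with_tf", lambda: "tbl_0123")
concat_with_tf = cache.get_or_generate("__splink__df_concat_with_tf", lambda: "tbl_0123")
```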
30 changes: 18 additions & 12 deletions splink/analyse_blocking.py
@@ -6,7 +6,6 @@
from .blocking import _sql_gen_where_condition, block_using_rules_sql

from .misc import calculate_cartesian, calculate_reduction_ratio
from .vertically_concatenate import vertically_concatenate_sql

# https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports
if TYPE_CHECKING:
@@ -43,19 +42,28 @@ def cumulative_comparisons_generated_by_blocking_rules(
output_chart=True,
):

# Deepcopy our original linker so we can safely adjust our settings.
# This is particularly important to ensure we don't overwrite our
# original blocking rules.
linker = deepcopy(linker)

settings_obj = linker._settings_obj
linker._settings_obj_ = settings_obj

# Flag that we are in analyse-blocking mode
# (the flag is reset to False before the function returns)
linker._analyse_blocking_mode = True

if blocking_rules:
brs_as_objs = settings_obj._brs_as_objs(blocking_rules)
linker._settings_obj_._blocking_rules_to_generate_predictions = brs_as_objs

# Turn tf off. No need to apply term frequencies to perform these calcs
settings_obj._retain_matching_columns = False
settings_obj._retain_intermediate_calculation_columns = False
for cc in settings_obj.comparisons:
for cl in cc.comparison_levels:
cl._level_dict["tf_adjustment_column"] = None

concat = linker._initialise_df_concat(materialise=True)

# Calculate the Cartesian Product
if output_chart:
# We only need the cartesian product if we want to output the chart view
@@ -64,23 +72,19 @@
else:
group_by_statement = "group by source_dataset"

sql = vertically_concatenate_sql(linker)
linker._enqueue_sql(sql, "__splink__df_concat")

sql = f"""
select count(*) as count
from __splink__df_concat
from {concat.physical_name}
{group_by_statement}
"""
linker._enqueue_sql(sql, "__splink__cartesian_product")
cartesian_count = linker._execute_sql_pipeline()
cartesian_count = linker._execute_sql_pipeline([concat])
row_count_df = cartesian_count.as_record_dict()
cartesian_count.drop_table_from_database()

cartesian = calculate_cartesian(row_count_df, settings_obj._link_type)

# Calculate the total number of rows generated by each blocking rule
linker._initialise_df_concat_with_tf(materialise=False)
sql = block_using_rules_sql(linker)
linker._enqueue_sql(sql, "__splink__df_blocked_data")

@@ -95,7 +99,7 @@
order by cast(match_key as int) asc
"""
linker._enqueue_sql(sql, "__splink__df_count_cumulative_blocks")
cumulative_blocking_rule_count = linker._execute_sql_pipeline()
cumulative_blocking_rule_count = linker._execute_sql_pipeline([concat])
br_n = cumulative_blocking_rule_count.as_pandas_dataframe()
cumulative_blocking_rule_count.drop_table_from_database()
br_count, br_keys = list(br_n.row_count), list(br_n["match_key"].astype("int"))
@@ -136,4 +140,6 @@ def cumulative_comparisons_generated_by_blocking_rules(

br_comparisons.append(out_dict.copy())

linker._analyse_blocking_mode = False

return br_comparisons
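Two details of this hunk are worth noting: the linker is deepcopied before term-frequency adjustments are switched off, so the caller's settings and blocking rules are never mutated, and the concatenated input is materialised once via `_initialise_df_concat(materialise=True)` and then referenced by its `physical_name` in both subsequent pipelines. A small illustration of the first point, using a stand-in settings class rather than splink's real objects:

```python
# Why the function deepcopies before mutating: changes to the working copy
# (e.g. switching off tf adjustment columns) must not leak back to the
# caller's object. FakeSettings is a stand-in, not splink's Settings class.
from copy import deepcopy


class FakeSettings:
    def __init__(self):
        self.retain_matching_columns = True
        self.tf_adjustment_columns = ["first_name", "surname"]


original = FakeSettings()
working = deepcopy(original)

# Turn tf adjustments off on the working copy only
working.retain_matching_columns = False
working.tf_adjustment_columns = []

# The caller's settings are untouched
assert original.retain_matching_columns is True
assert original.tf_adjustment_columns == ["first_name", "surname"]
```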
4 changes: 4 additions & 0 deletions splink/athena/athena_linker.py
@@ -341,6 +341,10 @@ def _execute_sql_against_backend(self, sql, templated_name, physical_name):

def register_table(self, input, table_name, overwrite=False):

# If the user has provided a table name, return it as a SplinkDataframe
if isinstance(input, str):
return self._table_to_splink_dataframe(table_name, input)

# Check if table name is already in use
exists = self._table_exists_in_database(table_name)
if exists:
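With this change, `register_table` can be given the name of a table that already exists in the backend, in which case it simply wraps that name as a SplinkDataFrame instead of writing any data. A rough sketch of the dispatch, using a stand-in class rather than the real AthenaLinker:

```python
# Sketch of the dispatch added in this hunk. FakeLinker is a stand-in; the
# real method returns a SplinkDataFrame and writes to the database backend.
class FakeLinker:
    def __init__(self):
        self.tables = {}

    def _table_to_splink_dataframe(self, templated_name, physical_name):
        return {"templated_name": templated_name, "physical_name": physical_name}

    def register_table(self, input, table_name, overwrite=False):
        # New behaviour: a string means the table already exists in the
        # database, so just wrap it rather than writing anything
        if isinstance(input, str):
            return self._table_to_splink_dataframe(table_name, input)
        # Otherwise write the supplied data under table_name
        if table_name in self.tables and not overwrite:
            raise ValueError(f"Table {table_name} already exists")
        self.tables[table_name] = input
        return self._table_to_splink_dataframe(table_name, table_name)


linker = FakeLinker()
wrapped = linker.register_table("existing_labels_table", "labels")
print(wrapped)  # {'templated_name': 'labels', 'physical_name': 'existing_labels_table'}
```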
25 changes: 25 additions & 0 deletions splink/blocking.py
@@ -116,6 +116,31 @@ def block_using_rules_sql(linker: Linker):
" will not be implemented for this run."
)

if (
linker._two_dataset_link_only
and not linker._find_new_matches_mode
and not linker._compare_two_records_mode
):
source_dataset_col = linker._settings_obj._source_dataset_column_name
# Need df_l to be the one with the lowest id to preserve the property
# that the left dataset is the one with the lowest concatenated id
keys = linker._input_tables_dict.keys()
keys = list(sorted(keys))
df_l = linker._input_tables_dict[keys[0]]
df_r = linker._input_tables_dict[keys[1]]

sql = f"""
select * from __splink__df_concat_with_tf
where {source_dataset_col} = '{df_l.templated_name}'
"""
linker._enqueue_sql(sql, "__splink__df_concat_with_tf_left")

sql = f"""
select * from __splink__df_concat_with_tf
where {source_dataset_col} = '{df_r.templated_name}'
"""
linker._enqueue_sql(sql, "__splink_df_concat_with_tf_right")

# Cover the case where there are no blocking rules
# This is a bit of a hack where if you do a self-join on 'true'
# you create a cartesian product, rather than having separate code
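For a two-dataset link-only job, this block splits `__splink__df_concat_with_tf` into left and right tables by filtering on the source dataset column, after sorting the input-table keys so the left table is always the one with the lowest key. An illustrative pandas version of that split (column and dataset names are made up):

```python
# Pandas stand-in for the left/right split the hunk generates in SQL.
import pandas as pd

concat_with_tf = pd.DataFrame(
    {
        "unique_id": [1, 2, 3, 4],
        "source_dataset": ["df_a", "df_a", "df_b", "df_b"],
        "first_name": ["amy", "bob", "amy", "rob"],
    }
)

# Sort the dataset names so the "left" table is deterministic (lowest key first)
keys = sorted(concat_with_tf["source_dataset"].unique())
df_l = concat_with_tf[concat_with_tf["source_dataset"] == keys[0]]
df_r = concat_with_tf[concat_with_tf["source_dataset"] == keys[1]]

print(len(df_l), len(df_r))  # 2 2
```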