From aab2afb96dc328be365767d3059c6f4693e62d61 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Sun, 27 Oct 2024 19:08:08 -0700 Subject: [PATCH] fix: simplify formula for equation --- docetl/operations/resolve.py | 44 ++++++++++++++++++++++-------------- mkdocs.yml | 1 - 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/docetl/operations/resolve.py b/docetl/operations/resolve.py index 379556ec..11715399 100644 --- a/docetl/operations/resolve.py +++ b/docetl/operations/resolve.py @@ -295,9 +295,9 @@ def get_embeddings_batch( total_cost += sum(costs) # Generate all pairs to compare, ensuring no duplicate comparisons - def get_unique_comparison_pairs(): + def get_unique_comparison_pairs() -> Tuple[List[Tuple[int, int]], Dict[Tuple[str, ...], List[int]]]: # Create a mapping of values to their indices - value_to_indices = {} + value_to_indices: Dict[Tuple[str, ...], List[int]] = {} for i, item in enumerate(input_data): # Create a hashable key from the blocking keys key = tuple(str(item.get(k, "")) for k in blocking_keys) @@ -323,7 +323,7 @@ def get_unique_comparison_pairs(): comparison_pairs, value_to_indices = get_unique_comparison_pairs() # Filter pairs based on blocking conditions - def meets_blocking_conditions(pair): + def meets_blocking_conditions(pair: Tuple[int, int]) -> bool: i, j = pair return ( is_match(input_data[i], input_data[j]) if blocking_conditions else False @@ -374,7 +374,7 @@ def meets_blocking_conditions(pair): # Modified merge_clusters to handle all indices with the same value - def merge_clusters(item1, item2): + def merge_clusters(item1: int, item2: int) -> None: root1, root2 = find_cluster(item1, cluster_map), find_cluster( item2, cluster_map ) @@ -416,21 +416,31 @@ def merge_clusters(item1, item2): ) # Compute an auto-batch size based on the number of comparisons - def auto_batch(): - M = 500 # should be made dynamic in the future based on the model, but this is the current rate limit for 4o-mini. - # (n-k)(k-1) is approximately len(filtered_pairs) - N = len(input_data) - # -k^2 + (n+1)k- n- fp + k^2/2 - k/2= 0, solve for k - quadratic_discriminant = (N + 0.5) ** 2 - 4 * (-1 / 2) * (-N - len(blocked_pairs)) - K_cands = [(-1 * (N+0.5) + (quadratic_discriminant) ** 0.5 )/ (-1), (-1 * (N + 0.5) - (quadratic_discriminant) ** 0.5)/ (-1)] - K = max(K_cands) - if K < 0: - return M - else: - return min(math.ceil(2*K), M) + def auto_batch() -> int: + # Maximum batch size limit for 4o-mini model + M = 500 + + n = len(input_data) + m = len(blocked_pairs) + + # https://www.wolframalpha.com/input?i=k%28k-1%29%2F2+%2B+%28n-k%29%28k-1%29+%3D+m%2C+solve+for+k + # Two possible solutions for k: + # k = -1/2 sqrt((1 - 2n)^2 - 8m) + n + 1/2 + # k = 1/2 (sqrt((1 - 2n)^2 - 8m) + 2n + 1) + + discriminant = (1 - 2*n)**2 - 8*m + sqrt_discriminant = discriminant ** 0.5 + + k1 = -0.5 * sqrt_discriminant + n + 0.5 + k2 = 0.5 * (sqrt_discriminant + 2*n + 1) + + # Take the maximum viable solution + k = max(k1, k2) + return M if k < 0 else min(int(k), M) # Compare pairs and update clusters in real-time batch_size = self.config.get("compare_batch_size", auto_batch()) + self.console.log(f"Using compare batch size: {batch_size}") pair_costs = 0 pbar = RichLoopBar( @@ -441,7 +451,7 @@ def auto_batch(): last_processed = 0 for i in pbar: batch_end = last_processed + batch_size - batch = blocked_pairs[last_processed:batch_end] + batch = blocked_pairs[last_processed : batch_end] # Filter pairs for the initial batch better_batch = [ pair for pair in batch diff --git a/mkdocs.yml b/mkdocs.yml index 988c272f..42622b64 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,4 @@ site_name: docetl docs -site_url: https://docetl.com/ # use_directory_urls: false # strict: true # docs_dir: docs