diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..c60d9f73 --- /dev/null +++ b/404.html @@ -0,0 +1,749 @@ + + + + + + + + + + + + + + + + + + + docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +

404 - Not found

+ +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/advanced/custom-operators/index.html b/advanced/custom-operators/index.html new file mode 100644 index 00000000..fbd0bf38 --- /dev/null +++ b/advanced/custom-operators/index.html @@ -0,0 +1,783 @@ + + + + + + + + + + + + + + + + + + + Custom operators - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Custom operators

+ + + + + + + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/advanced/extending-agents/index.html b/advanced/extending-agents/index.html new file mode 100644 index 00000000..5bd3d662 --- /dev/null +++ b/advanced/extending-agents/index.html @@ -0,0 +1,783 @@ + + + + + + + + + + + + + + + + + + + Extending agents - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Extending agents

+ + + + + + + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/advanced/performance-tuning/index.html b/advanced/performance-tuning/index.html new file mode 100644 index 00000000..12f975d1 --- /dev/null +++ b/advanced/performance-tuning/index.html @@ -0,0 +1,783 @@ + + + + + + + + + + + + + + + + + + + Performance tuning - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Performance tuning

+ + + + + + + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/api-reference/cli/index.html b/api-reference/cli/index.html new file mode 100644 index 00000000..33162670 --- /dev/null +++ b/api-reference/cli/index.html @@ -0,0 +1,1549 @@ + + + + + + + + + + + + + + + + + + + + + + + CLI Interface - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

CLI Interface

+ +
+ + +

+ docetl.cli.run(yaml_file=typer.Argument(..., help='Path to the YAML file containing the pipeline configuration'), max_threads=typer.Option(None, help='Maximum number of threads to use for running operations')) + +

+ + +
+ +

Run the configuration specified in the YAML file.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ yaml_file + + Path + +
+

Path to the YAML file containing the pipeline configuration.

+
+
+ Argument(..., help='Path to the YAML file containing the pipeline configuration') +
+ max_threads + + Optional[int] + +
+

Maximum number of threads to use for running operations.

+
+
+ Option(None, help='Maximum number of threads to use for running operations') +
+ +
+ Source code in docetl/cli.py +
54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
@app.command()
+def run(
+    yaml_file: Path = typer.Argument(
+        ..., help="Path to the YAML file containing the pipeline configuration"
+    ),
+    max_threads: Optional[int] = typer.Option(
+        None, help="Maximum number of threads to use for running operations"
+    ),
+):
+    """
+    Run the configuration specified in the YAML file.
+
+    Args:
+        yaml_file (Path): Path to the YAML file containing the pipeline configuration.
+        max_threads (Optional[int]): Maximum number of threads to use for running operations.
+    """
+    # Get the current working directory (where the user called the command)
+    cwd = os.getcwd()
+
+    # Load .env file from the current working directory
+    env_file = os.path.join(cwd, ".env")
+    if os.path.exists(env_file):
+        load_dotenv(env_file)
+
+    runner = DSLRunner.from_yaml(str(yaml_file), max_threads=max_threads)
+    runner.load_run_save()
+
+
+
+ +
+ +
+ + +

+ docetl.cli.build(yaml_file=typer.Argument(..., help='Path to the YAML file containing the pipeline configuration'), max_threads=typer.Option(None, help='Maximum number of threads to use for running operations'), model=typer.Option('gpt-4o', help='Model to use for optimization'), resume=typer.Option(False, help='Resume optimization from a previous build that may have failed'), timeout=typer.Option(60, help='Timeout for optimization operations in seconds')) + +

+ + +
+ +

Build and optimize the configuration specified in the YAML file.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ yaml_file + + Path + +
+

Path to the YAML file containing the pipeline configuration.

+
+
+ Argument(..., help='Path to the YAML file containing the pipeline configuration') +
+ max_threads + + Optional[int] + +
+

Maximum number of threads to use for running operations.

+
+
+ Option(None, help='Maximum number of threads to use for running operations') +
+ model + + str + +
+

Model to use for optimization. Defaults to "gpt-4o".

+
+
+ Option('gpt-4o', help='Model to use for optimization') +
+ resume + + bool + +
+

Whether to resume optimization from a previous run. Defaults to False.

+
+
+ Option(False, help='Resume optimization from a previous build that may have failed') +
+ timeout + + int + +
+

Timeout for optimization operations in seconds. Defaults to 60.

+
+
+ Option(60, help='Timeout for optimization operations in seconds') +
+ +
+ Source code in docetl/cli.py +
14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
@app.command()
+def build(
+    yaml_file: Path = typer.Argument(
+        ..., help="Path to the YAML file containing the pipeline configuration"
+    ),
+    max_threads: Optional[int] = typer.Option(
+        None, help="Maximum number of threads to use for running operations"
+    ),
+    model: str = typer.Option("gpt-4o", help="Model to use for optimization"),
+    resume: bool = typer.Option(
+        False, help="Resume optimization from a previous build that may have failed"
+    ),
+    timeout: int = typer.Option(
+        60, help="Timeout for optimization operations in seconds"
+    ),
+):
+    """
+    Build and optimize the configuration specified in the YAML file.
+
+    Args:
+        yaml_file (Path): Path to the YAML file containing the pipeline configuration.
+        max_threads (Optional[int]): Maximum number of threads to use for running operations.
+        model (str): Model to use for optimization. Defaults to "gpt-4o".
+        resume (bool): Whether to resume optimization from a previous run. Defaults to False.
+        timeout (int): Timeout for optimization operations in seconds. Defaults to 60.
+    """
+    # Get the current working directory (where the user called the command)
+    cwd = os.getcwd()
+
+    # Load .env file from the current working directory
+    env_file = os.path.join(cwd, ".env")
+    if os.path.exists(env_file):
+        load_dotenv(env_file)
+
+    runner = DSLRunner.from_yaml(str(yaml_file), max_threads=max_threads)
+    runner.optimize(
+        save=True, return_pipeline=False, model=model, resume=resume, timeout=timeout
+    )
+
+
+
+ +
+ +
+ + +

+ docetl.cli.clear_cache() + +

+ + +
+ +

Clear the LLM cache stored on disk.

+ +
+ Source code in docetl/cli.py +
82
+83
+84
+85
+86
+87
@app.command()
+def clear_cache():
+    """
+    Clear the LLM cache stored on disk.
+    """
+    # Thin CLI wrapper: all of the work happens in `cc`, which is imported
+    # elsewhere in this module (its definition is not shown in this listing).
+    cc()
+
+
+
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/api-reference/docetl/index.html b/api-reference/docetl/index.html new file mode 100644 index 00000000..42d55541 --- /dev/null +++ b/api-reference/docetl/index.html @@ -0,0 +1,6161 @@ + + + + + + + + + + + + + + + + + + + + + + + docetl Core - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + +

docetl Core

+ +
+ + + +

+ docetl.DSLRunner + + +

+ + +
+

+ Bases: ConfigWrapper

+ + +

DSLRunner orchestrates pipeline execution by building and traversing a DAG of OpContainers. +The runner uses a two-phase approach:

+
    +
  1. Build Phase:
  2. +
  3. Parses YAML config into a DAG of OpContainers
  4. +
  5. Each operation becomes a node connected to its dependencies
  6. +
  7. Special handling for equijoins which have two parent nodes
  8. +
  9. +

    Validates operation syntax and schema compatibility

    +
  10. +
  11. +

    Execution Phase:

    +
  12. +
  13. Starts from the final operation and pulls data through the DAG
  14. +
  15. Handles caching/checkpointing of intermediate results
  16. +
  17. Tracks costs and execution metrics
  18. +
  19. Manages dataset loading and result persistence
  20. +
+

The separation between build and execution phases allows for: +- Pipeline validation before any execution +- Cost estimation and optimization +- Partial pipeline execution for testing

+ + + + + + +
+ Source code in docetl/runner.py +
 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
+581
+582
+583
+584
+585
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
+600
+601
+602
+603
+604
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
+619
+620
+621
+622
+623
+624
+625
+626
+627
+628
+629
+630
+631
+632
+633
+634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648
+649
+650
+651
+652
+653
+654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666
+667
+668
+669
+670
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
+694
+695
+696
+697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
+709
+710
class DSLRunner(ConfigWrapper):
+    """
+    DSLRunner orchestrates pipeline execution by building and traversing a DAG of OpContainers.
+    The runner uses a two-phase approach:
+
+    1. Build Phase:
+       - Parses YAML config into a DAG of OpContainers
+       - Each operation becomes a node connected to its dependencies
+       - Special handling for equijoins which have two parent nodes
+       - Validates operation syntax and schema compatibility
+
+    2. Execution Phase:
+       - Starts from the final operation and pulls data through the DAG
+       - Handles caching/checkpointing of intermediate results
+       - Tracks costs and execution metrics
+       - Manages dataset loading and result persistence
+
+    The separation between build and execution phases allows for:
+    - Pipeline validation before any execution
+    - Cost estimation and optimization
+    - Partial pipeline execution for testing
+    """
+
+    @classproperty
+    def schema(cls):
+        # Accessing the schema loads all operations, so only do this
+        # when we actually need it...
+        # Yes, this means DSLRunner.schema isn't really accessible to
+        # static type checkers. But it /is/ available for dynamic
+        # checking, and for generating json schema.
+
+        OpType = functools.reduce(
+            lambda a, b: a | b, [op.schema for op in get_operations().values()]
+        )
+        # More pythonic implementation of the above, but only works in python 3.11:
+        # OpType = Union[*[op.schema for op in get_operations().values()]]
+
+        class Pipeline(BaseModel):
+            config: Optional[dict[str, Any]]
+            parsing_tools: Optional[list[schemas.ParsingTool]]
+            datasets: Dict[str, schemas.Dataset]
+            operations: list[OpType]
+            pipeline: schemas.PipelineSpec
+
+        return Pipeline
+
+    @classproperty
+    def json_schema(cls):
+        return cls.schema.model_json_schema()
+
+    def __init__(self, config: Dict, max_threads: int = None, **kwargs):
+        """
+        Initialize the DSLRunner with a YAML configuration file.
+
+        Args:
+            max_threads (int, optional): Maximum number of threads to use. Defaults to None.
+        """
+        super().__init__(
+            config,
+            base_name=kwargs.pop("base_name", None),
+            yaml_file_suffix=kwargs.pop("yaml_file_suffix", None),
+            max_threads=max_threads,
+            **kwargs,
+        )
+        self.total_cost = 0
+        self._initialize_state()
+        self._setup_parsing_tools()
+        self._build_operation_graph(config)
+        self._compute_operation_hashes()
+
+        # Run initial validation
+        self._from_df_accessors = kwargs.get("from_df_accessors", False)
+        if not self._from_df_accessors:
+            self.syntax_check()
+
+    def _initialize_state(self) -> None:
+        """Initialize basic runner state and datasets"""
+        self.datasets = {}
+        self.intermediate_dir = (
+            self.config.get("pipeline", {}).get("output", {}).get("intermediate_dir")
+        )
+
+    def _setup_parsing_tools(self) -> None:
+        """Set up parsing tools from configuration"""
+        self.parsing_tool_map = create_parsing_tool_map(
+            self.config.get("parsing_tools", None)
+        )
+
+    def _build_operation_graph(self, config: Dict) -> None:
+        """Build the DAG of operations from configuration"""
+        self.config = config
+        self.op_container_map = {}
+        self.last_op_container = None
+
+        for step in self.config["pipeline"]["steps"]:
+            self._validate_step(step)
+
+            if step.get("input"):
+                self._add_scan_operation(step)
+            else:
+                self._add_equijoin_operation(step)
+
+            self._add_step_operations(step)
+            self._add_step_boundary(step)
+
+    def _validate_step(self, step: Dict) -> None:
+        """Validate step configuration"""
+        assert "name" in step.keys(), f"Step {step} does not have a name"
+        assert "operations" in step.keys(), f"Step {step} does not have `operations`"
+
+    def _add_scan_operation(self, step: Dict) -> None:
+        """Add a scan operation for input datasets"""
+        scan_op_container = OpContainer(
+            f"{step['name']}/scan_{step['input']}",
+            self,
+            {
+                "type": "scan",
+                "dataset_name": step["input"],
+                "name": f"scan_{step['input']}",
+            },
+        )
+        self.op_container_map[f"{step['name']}/scan_{step['input']}"] = (
+            scan_op_container
+        )
+        if self.last_op_container:
+            scan_op_container.add_child(self.last_op_container)
+        self.last_op_container = scan_op_container
+
+    def _add_equijoin_operation(self, step: Dict) -> None:
+        """Add an equijoin operation with its scan operations"""
+        equijoin_operation_name = list(step["operations"][0].keys())[0]
+        left_dataset_name = list(step["operations"][0].values())[0]["left"]
+        right_dataset_name = list(step["operations"][0].values())[0]["right"]
+
+        left_scan_op_container = OpContainer(
+            f"{step['name']}/scan_{left_dataset_name}",
+            self,
+            {
+                "type": "scan",
+                "dataset_name": left_dataset_name,
+                "name": f"scan_{left_dataset_name}",
+            },
+        )
+        if self.last_op_container:
+            left_scan_op_container.add_child(self.last_op_container)
+        right_scan_op_container = OpContainer(
+            f"{step['name']}/scan_{right_dataset_name}",
+            self,
+            {
+                "type": "scan",
+                "dataset_name": right_dataset_name,
+                "name": f"scan_{right_dataset_name}",
+            },
+        )
+        if self.last_op_container:
+            right_scan_op_container.add_child(self.last_op_container)
+        equijoin_op_container = OpContainer(
+            f"{step['name']}/{equijoin_operation_name}",
+            self,
+            self.find_operation(equijoin_operation_name),
+            left_name=left_dataset_name,
+            right_name=right_dataset_name,
+        )
+
+        equijoin_op_container.add_child(left_scan_op_container)
+        equijoin_op_container.add_child(right_scan_op_container)
+
+        self.last_op_container = equijoin_op_container
+        self.op_container_map[f"{step['name']}/{equijoin_operation_name}"] = (
+            equijoin_op_container
+        )
+        self.op_container_map[f"{step['name']}/scan_{left_dataset_name}"] = (
+            left_scan_op_container
+        )
+        self.op_container_map[f"{step['name']}/scan_{right_dataset_name}"] = (
+            right_scan_op_container
+        )
+
+    def _add_step_operations(self, step: Dict) -> None:
+        """Add operations for a step"""
+        op_start_idx = 1 if not step.get("input") else 0
+
+        for operation_name in step["operations"][op_start_idx:]:
+            if not isinstance(operation_name, str):
+                raise ValueError(
+                    f"Operation {operation_name} in step {step['name']} should be a string. "
+                    "If you intend for it to be an equijoin, don't specify an input in the step."
+                )
+
+            op_container = OpContainer(
+                f"{step['name']}/{operation_name}",
+                self,
+                self.find_operation(operation_name),
+            )
+            op_container.add_child(self.last_op_container)
+            self.last_op_container = op_container
+            self.op_container_map[f"{step['name']}/{operation_name}"] = op_container
+
+    def _add_step_boundary(self, step: Dict) -> None:
+        """Add a step boundary node"""
+        step_boundary = StepBoundary(
+            f"{step['name']}/boundary",
+            self,
+            {"type": "step_boundary", "name": f"{step['name']}/boundary"},
+        )
+        step_boundary.add_child(self.last_op_container)
+        self.op_container_map[f"{step['name']}/boundary"] = step_boundary
+        self.last_op_container = step_boundary
+
+    def _compute_operation_hashes(self) -> None:
+        """Compute hashes for operations to enable caching"""
+        op_map = {op["name"]: op for op in self.config["operations"]}
+        self.step_op_hashes = defaultdict(dict)
+
+        for step in self.config["pipeline"]["steps"]:
+            for idx, op in enumerate(step["operations"]):
+                op_name = op if isinstance(op, str) else list(op.keys())[0]
+
+                all_ops_until_and_including_current = (
+                    [op_map[prev_op] for prev_op in step["operations"][:idx]]
+                    + [op_map[op_name]]
+                    + [self.config.get("system_prompt", {})]
+                )
+
+                for op in all_ops_until_and_including_current:
+                    if "model" not in op:
+                        op["model"] = self.default_model
+
+                all_ops_str = json.dumps(all_ops_until_and_including_current)
+                self.step_op_hashes[step["name"]][op_name] = hashlib.sha256(
+                    all_ops_str.encode()
+                ).hexdigest()
+
+    def get_output_path(self, require=False):
+        output_path = self.config.get("pipeline", {}).get("output", {}).get("path")
+        if output_path:
+            if not (
+                output_path.lower().endswith(".json")
+                or output_path.lower().endswith(".csv")
+            ):
+                raise ValueError(
+                    f"Output path '{output_path}' is not a JSON or CSV file. Please provide a path ending with '.json' or '.csv'."
+                )
+        elif require:
+            raise ValueError(
+                "No output path specified in the configuration. Please provide an output path ending with '.json' or '.csv' in the configuration to use the save() method."
+            )
+
+        return output_path
+
+    def syntax_check(self):
+        """
+        Perform a syntax check on all operations defined in the configuration.
+        """
+        self.console.log("[yellow]Checking operations...[/yellow]")
+
+        # Just validate that it's a json file if specified
+        self.get_output_path()
+        current = self.last_op_container
+
+        try:
+            # Walk the last op container to check syntax
+            op_containers = []
+            if self.last_op_container:
+                op_containers = [self.last_op_container]
+
+            while op_containers:
+                current = op_containers.pop(0)
+                syntax_result = current.syntax_check()
+                self.console.log(syntax_result, end="")
+                # Add all children to the queue
+                op_containers.extend(current.children)
+        except Exception as e:
+            raise ValueError(
+                f"Syntax check failed for operation '{current.name}': {str(e)}"
+            )
+
+        self.console.log("[green]✓ All operations passed syntax check[/green]")
+
+    def print_query_plan(self, show_boundaries=False):
+        """
+        Print a visual representation of the entire query plan using indentation and arrows.
+        Operations are color-coded by step to show the pipeline structure while maintaining
+        dependencies between steps.
+
+        Args:
+            show_boundaries (bool): When True, step boundary nodes are rendered as
+                their own entries. When False (the default) a boundary is skipped
+                and only its first child is rendered in its place.
+        """
+        # Nothing to draw: log an empty plan and stop.
+        if not self.last_op_container:
+            self.console.log("\n[bold]Pipeline Steps:[/bold]")
+            self.console.log(
+                Panel("No operations in pipeline", title="Query Plan", width=100)
+            )
+            self.console.log()
+            return
+
+        # Recursively renders `op` and everything below it as markup text.
+        # `step_colors` maps a step name to a color name; it is always supplied
+        # by the call at the bottom of this method.
+        def _print_op(
+            op: OpContainer, indent: int = 0, step_colors: Dict[str, str] = None
+        ) -> str:
+            # Handle boundary operations based on show_boundaries flag
+            if isinstance(op, StepBoundary):
+                if show_boundaries:
+                    output = []
+                    indent_str = "  " * indent
+                    # Container names have the form "<step name>/<node name>".
+                    step_name = op.name.split("/")[0]
+                    color = step_colors.get(step_name, "white")
+                    output.append(
+                        f"{indent_str}[{color}][bold]{op.name}[/bold][/{color}]"
+                    )
+                    output.append(f"{indent_str}Type: step_boundary")
+                    if op.children:
+                        output.append(f"{indent_str}[yellow]▼[/yellow]")
+                        for child in op.children:
+                            output.append(_print_op(child, indent + 1, step_colors))
+                    return "\n".join(output)
+                elif op.children:
+                    # Hidden boundary: render its first child at the same depth.
+                    return _print_op(op.children[0], indent, step_colors)
+                return ""
+
+            # Build the string for the current operation with indentation
+            indent_str = "  " * indent
+            output = []
+
+            # Color code the operation name based on its step
+            step_name = op.name.split("/")[0]
+            color = step_colors.get(step_name, "white")
+            output.append(f"{indent_str}[{color}][bold]{op.name}[/bold][/{color}]")
+            output.append(f"{indent_str}Type: {op.config['type']}")
+
+            # Add schema if available
+            if "output" in op.config and "schema" in op.config["output"]:
+                output.append(f"{indent_str}Output Schema:")
+                for field, field_type in op.config["output"]["schema"].items():
+                    # Escaped so that brackets in a type such as "list[str]" are
+                    # not read as markup (presumably rich's escape -- confirm
+                    # against the module imports).
+                    escaped_type = escape(str(field_type))
+                    output.append(
+                        f"{indent_str}  {field}: [bright_white]{escaped_type}[/bright_white]"
+                    )
+
+            # Add children
+            if op.children:
+                if op.is_equijoin:
+                    # An equijoin's first child is its left input, the second its right.
+                    output.append(f"{indent_str}[yellow]▼ LEFT[/yellow]")
+                    output.append(_print_op(op.children[0], indent + 1, step_colors))
+                    output.append(f"{indent_str}[yellow]▼ RIGHT[/yellow]")
+                    output.append(_print_op(op.children[1], indent + 1, step_colors))
+                else:
+                    output.append(f"{indent_str}[yellow]▼[/yellow]")
+                    for child in op.children:
+                        output.append(_print_op(child, indent + 1, step_colors))
+
+            return "\n".join(output)
+
+        # Get all step boundaries and extract unique step names
+        step_boundaries = [
+            op
+            for name, op in self.op_container_map.items()
+            if isinstance(op, StepBoundary)
+        ]
+        # Sorted by name, so colors follow alphabetical rather than pipeline order.
+        step_boundaries.sort(key=lambda x: x.name)
+
+        # Create a color map for steps - using distinct colors
+        # (the palette is reused cyclically once there are more steps than colors)
+        colors = ["cyan", "magenta", "green", "yellow", "blue", "red"]
+        step_names = [b.name.split("/")[0] for b in step_boundaries]
+        step_colors = {
+            name: colors[i % len(colors)] for i, name in enumerate(step_names)
+        }
+
+        # Print the legend
+        self.console.log("\n[bold]Pipeline Steps:[/bold]")
+        for step_name, color in step_colors.items():
+            self.console.log(f"[{color}]■[/{color}] {step_name}")
+
+        # Print the full query plan starting from the last step boundary
+        query_plan = _print_op(self.last_op_container, step_colors=step_colors)
+        self.console.log(Panel(query_plan, title="Query Plan", width=100))
+        self.console.log()
+
+    def find_operation(self, op_name: str) -> Dict:
+        for operation_config in self.config["operations"]:
+            if operation_config["name"] == op_name:
+                return operation_config
+        raise ValueError(f"Operation '{op_name}' not found in configuration.")
+
+    def load_run_save(self) -> float:
+        """
+        Execute the entire pipeline defined in the configuration.
+        """
+        output_path = self.get_output_path(require=True)
+
+        # Print the query plan
+        self.print_query_plan()
+
+        start_time = time.time()
+
+        if self.last_op_container:
+            self.load()
+            self.console.rule("[bold]Pipeline Execution[/bold]")
+            output, _, _ = self.last_op_container.next()
+            self.save(output)
+
+        execution_time = time.time() - start_time
+
+        # Print execution summary
+        summary = (
+            f"Cost: [green]${self.total_cost:.2f}[/green]\n"
+            f"Time: {execution_time:.2f}s\n"
+            + (
+                f"Cache: [dim]{self.intermediate_dir}[/dim]\n"
+                if self.intermediate_dir
+                else ""
+            )
+            + f"Output: [dim]{output_path}[/dim]"
+        )
+        self.console.log(Panel(summary, title="Execution Summary"))
+
+        return self.total_cost
+
+    def load(self) -> None:
+        """
+        Load all datasets defined in the configuration.
+        """
+        datasets = {}
+        self.console.rule("[bold]Loading Datasets[/bold]")
+
+        for name, dataset_config in self.config["datasets"].items():
+            if dataset_config["type"] == "file":
+                datasets[name] = Dataset(
+                    self,
+                    "file",
+                    dataset_config["path"],
+                    source="local",
+                    parsing=dataset_config.get("parsing", []),
+                    user_defined_parsing_tool_map=self.parsing_tool_map,
+                )
+                self.console.log(
+                    f"[green]✓[/green] Loaded dataset '{name}' from {dataset_config['path']}"
+                )
+            else:
+                raise ValueError(f"Unsupported dataset type: {dataset_config['type']}")
+
+        self.datasets = {
+            name: (
+                dataset
+                if isinstance(dataset, Dataset)
+                else Dataset(self, "memory", dataset)
+            )
+            for name, dataset in datasets.items()
+        }
+        self.console.log()
+
+    def save(self, data: List[Dict]) -> None:
+        """
+        Save the final output of the pipeline.
+        """
+        self.get_output_path(require=True)
+
+        output_config = self.config["pipeline"]["output"]
+        if output_config["type"] == "file":
+            # Create the directory if it doesn't exist
+            if os.path.dirname(output_config["path"]):
+                os.makedirs(os.path.dirname(output_config["path"]), exist_ok=True)
+            if output_config["path"].lower().endswith(".json"):
+                with open(output_config["path"], "w") as file:
+                    json.dump(data, file, indent=2)
+            else:  # CSV
+                import csv
+
+                with open(output_config["path"], "w", newline="") as file:
+                    writer = csv.DictWriter(file, fieldnames=data[0].keys())
+                    limited_data = [
+                        {k: d.get(k, None) for k in data[0].keys()} for d in data
+                    ]
+                    writer.writeheader()
+                    writer.writerows(limited_data)
+            self.console.log(
+                f"[green]✓[/green] Saved to [dim]{output_config['path']}[/dim]\n"
+            )
+        else:
+            raise ValueError(
+                f"Unsupported output type: {output_config['type']}. Supported types: file"
+            )
+
+    def _load_from_checkpoint_if_exists(
+        self, step_name: str, operation_name: str
+    ) -> Optional[List[Dict]]:
+        if self.intermediate_dir is None:
+            return None
+
+        intermediate_config_path = os.path.join(
+            self.intermediate_dir, ".docetl_intermediate_config.json"
+        )
+
+        if not os.path.exists(intermediate_config_path):
+            return None
+
+        # Make sure the step and op name is in the checkpoint config path
+        if (
+            step_name not in self.step_op_hashes
+            or operation_name not in self.step_op_hashes[step_name]
+        ):
+            return None
+
+        # See if the checkpoint config is the same as the current step op hash
+        with open(intermediate_config_path, "r") as f:
+            intermediate_config = json.load(f)
+
+        if (
+            intermediate_config.get(step_name, {}).get(operation_name, "")
+            != self.step_op_hashes[step_name][operation_name]
+        ):
+            return None
+
+        checkpoint_path = os.path.join(
+            self.intermediate_dir, step_name, f"{operation_name}.json"
+        )
+        # check if checkpoint exists
+        if os.path.exists(checkpoint_path):
+            if f"{step_name}_{operation_name}" not in self.datasets:
+                self.datasets[f"{step_name}_{operation_name}"] = Dataset(
+                    self, "file", checkpoint_path, "local"
+                )
+
+                self.console.log(
+                    f"[green]✓[/green] [italic]Loaded checkpoint for operation '{operation_name}' in step '{step_name}' from {checkpoint_path}[/italic]"
+                )
+
+                return self.datasets[f"{step_name}_{operation_name}"].load()
+        return None
+
+    def clear_intermediate(self) -> None:
+        """
+        Clear the intermediate directory.
+        """
+        # Remove the intermediate directory
+        if self.intermediate_dir:
+            shutil.rmtree(self.intermediate_dir)
+            return
+
+        raise ValueError("Intermediate directory not set. Cannot clear intermediate.")
+
+    def _save_checkpoint(
+        self, step_name: str, operation_name: str, data: List[Dict]
+    ) -> None:
+        """
+        Save a checkpoint of the current data after an operation.
+
+        This method creates a JSON file containing the current state of the data
+        after an operation has been executed. The checkpoint is saved in a directory
+        structure that reflects the step and operation names.
+
+        Args:
+            step_name (str): The name of the current step in the pipeline.
+            operation_name (str): The name of the operation that was just executed.
+            data (List[Dict]): The current state of the data to be checkpointed.
+
+        Note:
+            The checkpoint is saved only if a checkpoint directory has been specified
+            when initializing the DSLRunner.
+        """
+        checkpoint_path = os.path.join(
+            self.intermediate_dir, step_name, f"{operation_name}.json"
+        )
+        if os.path.dirname(checkpoint_path):
+            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
+        with open(checkpoint_path, "w") as f:
+            json.dump(data, f)
+
+        self.console.log(
+            f"[green]✓ [italic]Intermediate saved for operation '{operation_name}' in step '{step_name}' at {checkpoint_path}[/italic][/green]"
+        )
+
+    def should_optimize(
+        self, step_name: str, op_name: str, **kwargs
+    ) -> Tuple[str, float, List[Dict[str, Any]], List[Dict[str, Any]]]:
+        self.load()
+        builder = Optimizer(self, **kwargs)
+        self.optimizer = builder
+        result = builder.should_optimize(step_name, op_name)
+        return result
+
+    def optimize(
+        self,
+        save: bool = False,
+        return_pipeline: bool = True,
+        **kwargs,
+    ) -> Tuple[Union[Dict, "DSLRunner"], float]:
+
+        if not self.last_op_container:
+            raise ValueError("No operations in pipeline. Cannot optimize.")
+
+        self.load()
+
+        builder = Optimizer(
+            self,
+            **kwargs,
+        )
+        self.optimizer = builder
+        llm_api_cost = builder.optimize()
+        self.total_cost += llm_api_cost
+
+        if save:
+            builder.save_optimized_config(f"{self.base_name}_opt.yaml")
+            self.optimized_config_path = f"{self.base_name}_opt.yaml"
+
+        if return_pipeline:
+            return (
+                DSLRunner(builder.clean_optimized_config(), self.max_threads),
+                self.total_cost,
+            )
+
+        return builder.clean_optimized_config(), self.total_cost
+
+    def _run_operation(
+        self,
+        op_config: Dict[str, Any],
+        input_data: Union[List[Dict[str, Any]], Dict[str, Any]],
+        return_instance: bool = False,
+        is_build: bool = False,
+    ) -> Union[List[Dict[str, Any]], Tuple[List[Dict[str, Any]], BaseOperation, float]]:
+        """
+        Run a single operation based on its configuration.
+
+        This method creates an instance of the appropriate operation class and executes it.
+        It also updates the total operation cost.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the operation to run.
+            input_data (List[Dict[str, Any]]): The input data for the operation.
+            return_instance (bool, optional): If True, return the operation instance along with the output data.
+
+        Returns:
+            Union[List[Dict[str, Any]], Tuple[List[Dict[str, Any]], BaseOperation, float]]:
+            If return_instance is False, returns the output data.
+            If return_instance is True, returns a tuple of the output data, the operation instance, and the cost.
+        """
+        operation_class = get_operation(op_config["type"])
+
+        oc_kwargs = {
+            "runner": self,
+            "config": op_config,
+            "default_model": self.config["default_model"],
+            "max_threads": self.max_threads,
+            "console": self.console,
+            "status": self.status,
+        }
+        operation_instance = operation_class(**oc_kwargs)
+        if op_config["type"] == "equijoin":
+            output_data, cost = operation_instance.execute(
+                input_data["left_data"], input_data["right_data"]
+            )
+        elif op_config["type"] == "filter":
+            output_data, cost = operation_instance.execute(input_data, is_build)
+        else:
+            output_data, cost = operation_instance.execute(input_data)
+
+        self.total_cost += cost
+
+        if return_instance:
+            return output_data, operation_instance
+        else:
+            return output_data
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(config, max_threads=None, **kwargs) + +

+ + +
+ +

Initialize the DSLRunner with a YAML configuration file.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ max_threads + + int + +
+

Maximum number of threads to use. Defaults to None.

+
+
+ None +
+ +
+ Source code in docetl/runner.py +
103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
def __init__(self, config: Dict, max_threads: int = None, **kwargs):
+    """
+    Initialize the DSLRunner with a YAML configuration file.
+
+    Args:
+        max_threads (int, optional): Maximum number of threads to use. Defaults to None.
+    """
+    super().__init__(
+        config,
+        base_name=kwargs.pop("base_name", None),
+        yaml_file_suffix=kwargs.pop("yaml_file_suffix", None),
+        max_threads=max_threads,
+        **kwargs,
+    )
+    self.total_cost = 0
+    self._initialize_state()
+    self._setup_parsing_tools()
+    self._build_operation_graph(config)
+    self._compute_operation_hashes()
+
+    # Run initial validation
+    self._from_df_accessors = kwargs.get("from_df_accessors", False)
+    if not self._from_df_accessors:
+        self.syntax_check()
+
+
+
+ +
+ +
+ + +

+ clear_intermediate() + +

+ + +
+ +

Clear the intermediate directory.

+ +
+ Source code in docetl/runner.py +
579
+580
+581
+582
+583
+584
+585
+586
+587
+588
def clear_intermediate(self) -> None:
+    """
+    Clear the intermediate directory.
+    """
+    # Remove the intermediate directory
+    if self.intermediate_dir:
+        shutil.rmtree(self.intermediate_dir)
+        return
+
+    raise ValueError("Intermediate directory not set. Cannot clear intermediate.")
+
+
+
+ +
+ +
+ + +

+ load() + +

+ + +
+ +

Load all datasets defined in the configuration.

+ +
+ Source code in docetl/runner.py +
467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
def load(self) -> None:
+    """
+    Load all datasets defined in the configuration.
+    """
+    datasets = {}
+    self.console.rule("[bold]Loading Datasets[/bold]")
+
+    for name, dataset_config in self.config["datasets"].items():
+        if dataset_config["type"] == "file":
+            datasets[name] = Dataset(
+                self,
+                "file",
+                dataset_config["path"],
+                source="local",
+                parsing=dataset_config.get("parsing", []),
+                user_defined_parsing_tool_map=self.parsing_tool_map,
+            )
+            self.console.log(
+                f"[green]✓[/green] Loaded dataset '{name}' from {dataset_config['path']}"
+            )
+        else:
+            raise ValueError(f"Unsupported dataset type: {dataset_config['type']}")
+
+    self.datasets = {
+        name: (
+            dataset
+            if isinstance(dataset, Dataset)
+            else Dataset(self, "memory", dataset)
+        )
+        for name, dataset in datasets.items()
+    }
+    self.console.log()
+
+
+
+ +
+ +
+ + +

+ load_run_save() + +

+ + +
+ +

Execute the entire pipeline defined in the configuration.

+ +
+ Source code in docetl/runner.py +
433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
def load_run_save(self) -> float:
+    """
+    Execute the entire pipeline defined in the configuration.
+    """
+    output_path = self.get_output_path(require=True)
+
+    # Print the query plan
+    self.print_query_plan()
+
+    start_time = time.time()
+
+    if self.last_op_container:
+        self.load()
+        self.console.rule("[bold]Pipeline Execution[/bold]")
+        output, _, _ = self.last_op_container.next()
+        self.save(output)
+
+    execution_time = time.time() - start_time
+
+    # Print execution summary
+    summary = (
+        f"Cost: [green]${self.total_cost:.2f}[/green]\n"
+        f"Time: {execution_time:.2f}s\n"
+        + (
+            f"Cache: [dim]{self.intermediate_dir}[/dim]\n"
+            if self.intermediate_dir
+            else ""
+        )
+        + f"Output: [dim]{output_path}[/dim]"
+    )
+    self.console.log(Panel(summary, title="Execution Summary"))
+
+    return self.total_cost
+
+
+
+ +
+ +
+ + +

+ print_query_plan(show_boundaries=False) + +

+ + +
+ +

Print a visual representation of the entire query plan using indentation and arrows. +Operations are color-coded by step to show the pipeline structure while maintaining +dependencies between steps.

+ +
+ Source code in docetl/runner.py +
332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
def print_query_plan(self, show_boundaries=False):
+    """
+    Print a visual representation of the entire query plan using indentation and arrows.
+    Operations are color-coded by step to show the pipeline structure while maintaining
+    dependencies between steps.
+    """
+    if not self.last_op_container:
+        self.console.log("\n[bold]Pipeline Steps:[/bold]")
+        self.console.log(
+            Panel("No operations in pipeline", title="Query Plan", width=100)
+        )
+        self.console.log()
+        return
+
+    def _print_op(
+        op: OpContainer, indent: int = 0, step_colors: Dict[str, str] = None
+    ) -> str:
+        # Handle boundary operations based on show_boundaries flag
+        if isinstance(op, StepBoundary):
+            if show_boundaries:
+                output = []
+                indent_str = "  " * indent
+                step_name = op.name.split("/")[0]
+                color = step_colors.get(step_name, "white")
+                output.append(
+                    f"{indent_str}[{color}][bold]{op.name}[/bold][/{color}]"
+                )
+                output.append(f"{indent_str}Type: step_boundary")
+                if op.children:
+                    output.append(f"{indent_str}[yellow]▼[/yellow]")
+                    for child in op.children:
+                        output.append(_print_op(child, indent + 1, step_colors))
+                return "\n".join(output)
+            elif op.children:
+                return _print_op(op.children[0], indent, step_colors)
+            return ""
+
+        # Build the string for the current operation with indentation
+        indent_str = "  " * indent
+        output = []
+
+        # Color code the operation name based on its step
+        step_name = op.name.split("/")[0]
+        color = step_colors.get(step_name, "white")
+        output.append(f"{indent_str}[{color}][bold]{op.name}[/bold][/{color}]")
+        output.append(f"{indent_str}Type: {op.config['type']}")
+
+        # Add schema if available
+        if "output" in op.config and "schema" in op.config["output"]:
+            output.append(f"{indent_str}Output Schema:")
+            for field, field_type in op.config["output"]["schema"].items():
+                escaped_type = escape(str(field_type))
+                output.append(
+                    f"{indent_str}  {field}: [bright_white]{escaped_type}[/bright_white]"
+                )
+
+        # Add children
+        if op.children:
+            if op.is_equijoin:
+                output.append(f"{indent_str}[yellow]▼ LEFT[/yellow]")
+                output.append(_print_op(op.children[0], indent + 1, step_colors))
+                output.append(f"{indent_str}[yellow]▼ RIGHT[/yellow]")
+                output.append(_print_op(op.children[1], indent + 1, step_colors))
+            else:
+                output.append(f"{indent_str}[yellow]▼[/yellow]")
+                for child in op.children:
+                    output.append(_print_op(child, indent + 1, step_colors))
+
+        return "\n".join(output)
+
+    # Get all step boundaries and extract unique step names
+    step_boundaries = [
+        op
+        for name, op in self.op_container_map.items()
+        if isinstance(op, StepBoundary)
+    ]
+    step_boundaries.sort(key=lambda x: x.name)
+
+    # Create a color map for steps - using distinct colors
+    colors = ["cyan", "magenta", "green", "yellow", "blue", "red"]
+    step_names = [b.name.split("/")[0] for b in step_boundaries]
+    step_colors = {
+        name: colors[i % len(colors)] for i, name in enumerate(step_names)
+    }
+
+    # Print the legend
+    self.console.log("\n[bold]Pipeline Steps:[/bold]")
+    for step_name, color in step_colors.items():
+        self.console.log(f"[{color}]■[/{color}] {step_name}")
+
+    # Print the full query plan starting from the last step boundary
+    query_plan = _print_op(self.last_op_container, step_colors=step_colors)
+    self.console.log(Panel(query_plan, title="Query Plan", width=100))
+    self.console.log()
+
+
+
+ +
+ +
+ + +

+ save(data) + +

+ + +
+ +

Save the final output of the pipeline.

+ +
+ Source code in docetl/runner.py +
500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
def save(self, data: List[Dict]) -> None:
+    """
+    Save the final output of the pipeline.
+    """
+    self.get_output_path(require=True)
+
+    output_config = self.config["pipeline"]["output"]
+    if output_config["type"] == "file":
+        # Create the directory if it doesn't exist
+        if os.path.dirname(output_config["path"]):
+            os.makedirs(os.path.dirname(output_config["path"]), exist_ok=True)
+        if output_config["path"].lower().endswith(".json"):
+            with open(output_config["path"], "w") as file:
+                json.dump(data, file, indent=2)
+        else:  # CSV
+            import csv
+
+            with open(output_config["path"], "w", newline="") as file:
+                writer = csv.DictWriter(file, fieldnames=data[0].keys())
+                limited_data = [
+                    {k: d.get(k, None) for k in data[0].keys()} for d in data
+                ]
+                writer.writeheader()
+                writer.writerows(limited_data)
+        self.console.log(
+            f"[green]✓[/green] Saved to [dim]{output_config['path']}[/dim]\n"
+        )
+    else:
+        raise ValueError(
+            f"Unsupported output type: {output_config['type']}. Supported types: file"
+        )
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Perform a syntax check on all operations defined in the configuration.

+ +
+ Source code in docetl/runner.py +
303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
def syntax_check(self):
+    """
+    Perform a syntax check on all operations defined in the configuration.
+    """
+    self.console.log("[yellow]Checking operations...[/yellow]")
+
+    # Just validate that it's a json file if specified
+    self.get_output_path()
+    current = self.last_op_container
+
+    try:
+        # Walk the last op container to check syntax
+        op_containers = []
+        if self.last_op_container:
+            op_containers = [self.last_op_container]
+
+        while op_containers:
+            current = op_containers.pop(0)
+            syntax_result = current.syntax_check()
+            self.console.log(syntax_result, end="")
+            # Add all children to the queue
+            op_containers.extend(current.children)
+    except Exception as e:
+        raise ValueError(
+            f"Syntax check failed for operation '{current.name}': {str(e)}"
+        )
+
+    self.console.log("[green]✓ All operations passed syntax check[/green]")
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.Optimizer + + +

+ + +
+ + +

Orchestrates the optimization of a DocETL pipeline by analyzing and potentially rewriting +operations marked for optimization. Works with the runner's pull-based execution model +to maintain lazy evaluation while improving pipeline efficiency.

+ + + + + + +
+ Source code in docetl/optimizer.py +
 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
+581
+582
+583
+584
+585
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
+600
+601
+602
+603
+604
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
+619
+620
+621
+622
+623
+624
+625
+626
+627
+628
+629
+630
+631
+632
+633
+634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648
+649
+650
+651
+652
+653
+654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666
+667
+668
+669
+670
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
+694
+695
+696
+697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
+709
+710
+711
+712
+713
+714
+715
+716
+717
+718
+719
+720
+721
+722
+723
class Optimizer:
+    """
+    Orchestrates the optimization of a DocETL pipeline by analyzing and potentially rewriting
+    operations marked for optimization. Works with the runner's pull-based execution model
+    to maintain lazy evaluation while improving pipeline efficiency.
+    """
+
+    def __init__(
+        self,
+        runner: "DSLRunner",
+        model: str = "gpt-4o",
+        resume: bool = False,
+        timeout: int = 60,
+    ):
+        """
+        Initialize the optimizer with a runner instance and configuration.
+        Sets up optimization parameters, caching, and cost tracking.
+
+        Args:
+            yaml_file (str): Path to the YAML configuration file.
+            model (str): The name of the language model to use. Defaults to "gpt-4o".
+            resume (bool): Whether to resume optimization from a previous run. Defaults to False.
+            timeout (int): Timeout in seconds for operations. Defaults to 60.
+
+        Attributes:
+            config (Dict): Stores the loaded configuration from the YAML file.
+            console (Console): Rich console for formatted output.
+            max_threads (int): Maximum number of threads for parallel processing.
+            base_name (str): Base name used for file paths.
+            yaml_file_suffix (str): Suffix for YAML configuration files.
+            runner (DSLRunner): The DSL runner instance.
+            status: Status tracking for the runner.
+            optimized_config (Dict): A copy of the original config to be optimized.
+            llm_client (LLMClient): Client for interacting with the language model.
+            timeout (int): Timeout for operations in seconds.
+            resume (bool): Whether to resume from previous optimization.
+            captured_output (CapturedOutput): Captures output during optimization.
+            sample_cache (Dict): Maps operation names to tuples of (output_data, sample_size).
+            optimized_ops_path (str): Path to store optimized operations.
+            sample_size_map (Dict): Maps operation types to sample sizes.
+
+        The method also calls print_optimizer_config() to display the initial configuration.
+        """
+        self.config = runner.config
+        self.console = runner.console
+        self.max_threads = runner.max_threads
+
+        self.base_name = runner.base_name
+        self.yaml_file_suffix = runner.yaml_file_suffix
+        self.runner = runner
+        self.status = runner.status
+
+        self.optimized_config = copy.deepcopy(self.config)
+        self.llm_client = LLMClient(model)
+        self.timeout = timeout
+        self.resume = resume
+        self.captured_output = CapturedOutput()
+
+        # Add sample cache for build operations
+        self.sample_cache = {}  # Maps operation names to (output_data, sample_size)
+
+        home_dir = os.environ.get("DOCETL_HOME_DIR", os.path.expanduser("~"))
+        cache_dir = os.path.join(home_dir, f".docetl/cache/{runner.yaml_file_suffix}")
+        os.makedirs(cache_dir, exist_ok=True)
+
+        # Hash the config to create a unique identifier
+        config_hash = hashlib.sha256(str(self.config).encode()).hexdigest()
+        self.optimized_ops_path = f"{cache_dir}/{config_hash}.yaml"
+
+        # Update sample size map
+        self.sample_size_map = SAMPLE_SIZE_MAP
+        if self.config.get("optimizer_config", {}).get("sample_sizes", {}):
+            self.sample_size_map.update(self.config["optimizer_config"]["sample_sizes"])
+
+        if not self.runner._from_df_accessors:
+            self.print_optimizer_config()
+
+    def print_optimizer_config(self):
+        """
+        Print the current configuration of the optimizer.
+
+        This method uses the Rich console to display a formatted output of the optimizer's
+        configuration. It includes details such as the YAML file path, sample sizes for
+        different operation types, maximum number of threads, the language model being used,
+        and the timeout setting.
+
+        The output is color-coded and formatted for easy readability, with a header and
+        separator lines to clearly delineate the configuration information.
+        """
+        self.console.log(
+            Panel.fit(
+                "[bold cyan]Optimizer Configuration[/bold cyan]\n"
+                f"[yellow]Sample Size:[/yellow] {self.sample_size_map}\n"
+                f"[yellow]Max Threads:[/yellow] {self.max_threads}\n"
+                f"[yellow]Model:[/yellow] {self.llm_client.model}\n"
+                f"[yellow]Timeout:[/yellow] {self.timeout} seconds",
+                title="Optimizer Configuration",
+            )
+        )
+
+    def _insert_empty_resolve_operations(self):
+        """
+        Determines whether to insert resolve operations in the pipeline.
+
+        For each reduce operation in the tree, checks if it has any map operation as a descendant
+        without a resolve operation in between. If found, inserts an empty resolve operation
+        right after the reduce operation.
+
+        The method modifies the operation container tree in-place.
+
+        Returns:
+            None
+        """
+        if not self.runner.last_op_container:
+            return
+
+        def find_map_without_resolve(container, visited=None):
+            """Helper to find first map descendant without a resolve operation in between."""
+            if visited is None:
+                visited = set()
+
+            if container.name in visited:
+                return None
+            visited.add(container.name)
+
+            if not container.children:
+                return None
+
+            for child in container.children:
+                if child.config["type"] == "map":
+                    return child
+                if child.config["type"] == "resolve":
+                    continue
+                map_desc = find_map_without_resolve(child, visited)
+                if map_desc:
+                    return map_desc
+            return None
+
+        # Walk down the operation container tree
+        containers_to_check = [self.runner.last_op_container]
+        while containers_to_check:
+            current = containers_to_check.pop(0)
+
+            # Skip if this is a boundary or has no children
+            if isinstance(current, StepBoundary) or not current.children:
+                containers_to_check.extend(current.children)
+                continue
+
+            # Get the step name from the container's name
+            step_name = current.name.split("/")[0]
+
+            # Check if current container is a reduce operation
+            if current.config["type"] == "reduce" and current.config.get(
+                "synthesize_resolve", True
+            ):
+                reduce_key = current.config.get("reduce_key", "_all")
+                if isinstance(reduce_key, str):
+                    reduce_key = [reduce_key]
+
+                if "_all" not in reduce_key:
+                    # Find map descendant without resolve
+                    map_desc = find_map_without_resolve(current)
+                    if map_desc:
+                        # Synthesize an empty resolver
+                        self.console.log(
+                            "[yellow]Synthesizing empty resolver operation:[/yellow]"
+                        )
+                        self.console.log(
+                            f"  • [cyan]Reduce operation:[/cyan] [bold]{current.name}[/bold]"
+                        )
+                        self.console.log(
+                            f"  • [cyan]Step:[/cyan] [bold]{step_name}[/bold]"
+                        )
+
+                        # Create new resolve operation config
+                        new_resolve_name = (
+                            f"synthesized_resolve_{len(self.config['operations'])}"
+                        )
+                        new_resolve_config = {
+                            "name": new_resolve_name,
+                            "type": "resolve",
+                            "empty": True,
+                            "optimize": True,
+                            "embedding_model": "text-embedding-3-small",
+                            "resolution_model": self.config.get(
+                                "default_model", "gpt-4o-mini"
+                            ),
+                            "comparison_model": self.config.get(
+                                "default_model", "gpt-4o-mini"
+                            ),
+                            "_intermediates": {
+                                "map_prompt": map_desc.config.get("prompt"),
+                                "reduce_key": reduce_key,
+                            },
+                        }
+
+                        # Add to operations list
+                        self.config["operations"].append(new_resolve_config)
+
+                        # Create new resolve container
+                        new_resolve_container = OpContainer(
+                            f"{step_name}/{new_resolve_name}",
+                            self.runner,
+                            new_resolve_config,
+                        )
+
+                        # Insert the new container between reduce and its children
+                        new_resolve_container.children = current.children
+                        for child in new_resolve_container.children:
+                            child.parent = new_resolve_container
+                        current.children = [new_resolve_container]
+                        new_resolve_container.parent = current
+
+                        # Add to container map
+                        self.runner.op_container_map[
+                            f"{step_name}/{new_resolve_name}"
+                        ] = new_resolve_container
+
+                        # Add children to the queue
+                        containers_to_check.extend(new_resolve_container.children)
+
+    def _add_map_prompts_to_reduce_operations(self):
+        """
+        Add relevant map prompts to reduce operations based on their reduce keys.
+
+        This method walks the operation container tree to find map operations and their
+        output schemas, then associates those with reduce operations that use those keys.
+        When a reduce operation is found, it looks through its descendants to find the
+        relevant map operations and adds their prompts.
+
+        The method modifies the operation container tree in-place.
+        """
+        if not self.runner.last_op_container:
+            return
+
+        def find_map_prompts_for_keys(container, keys, visited=None):
+            """Helper to find map prompts for given keys in the container's descendants."""
+            if visited is None:
+                visited = set()
+
+            if container.name in visited:
+                return []
+            visited.add(container.name)
+
+            prompts = []
+            if container.config["type"] == "map":
+                output_schema = container.config.get("output", {}).get("schema", {})
+                if any(key in output_schema for key in keys):
+                    prompts.append(container.config.get("prompt", ""))
+
+            for child in container.children:
+                prompts.extend(find_map_prompts_for_keys(child, keys, visited))
+
+            return prompts
+
+        # Walk down the operation container tree
+        containers_to_check = [self.runner.last_op_container]
+        while containers_to_check:
+            current = containers_to_check.pop(0)
+
+            # Skip if this is a boundary or has no children
+            if isinstance(current, StepBoundary) or not current.children:
+                containers_to_check.extend(current.children)
+                continue
+
+            # If this is a reduce operation, find relevant map prompts
+            if current.config["type"] == "reduce":
+                reduce_keys = current.config.get("reduce_key", [])
+                if isinstance(reduce_keys, str):
+                    reduce_keys = [reduce_keys]
+
+                # Find map prompts in descendants
+                relevant_prompts = find_map_prompts_for_keys(current, reduce_keys)
+
+                if relevant_prompts:
+                    current.config["_intermediates"] = current.config.get(
+                        "_intermediates", {}
+                    )
+                    current.config["_intermediates"]["last_map_prompt"] = (
+                        relevant_prompts[-1]
+                    )
+
+            # Add children to the queue
+            containers_to_check.extend(current.children)
+
+    def should_optimize(
+        self, step_name: str, op_name: str
+    ) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]], float]:
+        """
+        Analyzes whether an operation should be optimized by running it on a sample of input data
+        and evaluating potential optimizations. Returns the optimization suggestion and relevant data.
+        """
+        self.console.rule("[bold cyan]Beginning Pipeline Optimization[/bold cyan]")
+
+        self._insert_empty_resolve_operations()
+
+        node_of_interest = self.runner.op_container_map[f"{step_name}/{op_name}"]
+
+        # Run the node_of_interest's children
+        input_data = []
+        for child in node_of_interest.children:
+            input_data.append(
+                child.next(
+                    is_build=True,
+                    sample_size_needed=SAMPLE_SIZE_MAP.get(child.config["type"]),
+                )[0]
+            )
+
+        # Set the step
+        self.captured_output.set_step(step_name)
+
+        # Determine whether we should optimize the node_of_interest
+        if (
+            node_of_interest.config.get("type") == "map"
+            or node_of_interest.config.get("type") == "filter"
+        ):
+            # Create instance of map optimizer
+            map_optimizer = MapOptimizer(
+                self.runner,
+                self.runner._run_operation,
+                is_filter=node_of_interest.config.get("type") == "filter",
+            )
+            should_optimize_output, input_data, output_data = (
+                map_optimizer.should_optimize(node_of_interest.config, input_data[0])
+            )
+        elif node_of_interest.config.get("type") == "reduce":
+            reduce_optimizer = ReduceOptimizer(
+                self.runner,
+                self.runner._run_operation,
+            )
+            should_optimize_output, input_data, output_data = (
+                reduce_optimizer.should_optimize(node_of_interest.config, input_data[0])
+            )
+        elif node_of_interest.config.get("type") == "resolve":
+            resolve_optimizer = JoinOptimizer(
+                self.runner,
+                node_of_interest.config,
+                target_recall=self.config.get("optimizer_config", {})
+                .get("resolve", {})
+                .get("target_recall", 0.95),
+            )
+            _, should_optimize_output = resolve_optimizer.should_optimize(input_data[0])
+
+            # if should_optimize_output is empty, then we should move to the reduce operation
+            if should_optimize_output == "":
+                return "", [], [], 0.0
+        else:
+            return "", [], [], 0.0
+
+        # Return the string and operation cost
+        return (
+            should_optimize_output,
+            input_data,
+            output_data,
+            self.runner.total_cost + self.llm_client.total_cost,
+        )
+
+    def optimize(self) -> float:
+        """
+        Optimizes the entire pipeline by walking the operation DAG and applying
+        operation-specific optimizers where marked. Returns the total optimization cost.
+        """
+        self.console.rule("[bold cyan]Beginning Pipeline Optimization[/bold cyan]")
+
+        # If self.resume is True and there's a checkpoint, load it
+        if self.resume:
+            if os.path.exists(self.optimized_ops_path):
+                # Load the yaml and change the runner with it
+                with open(self.optimized_ops_path, "r") as f:
+                    partial_optimized_config = yaml.safe_load(f)
+                    self.console.log(
+                        "[yellow]Loading partially optimized pipeline from checkpoint...[/yellow]"
+                    )
+                    self.runner._build_operation_graph(partial_optimized_config)
+            else:
+                self.console.log(
+                    "[yellow]No checkpoint found, starting optimization from scratch...[/yellow]"
+                )
+
+        else:
+            self._insert_empty_resolve_operations()
+
+        # Start with the last operation container and visit each child
+        self.runner.last_op_container.optimize()
+
+        flush_cache(self.console)
+
+        # Print the query plan
+        self.console.rule("[bold cyan]Optimized Query Plan[/bold cyan]")
+        self.runner.print_query_plan()
+
+        return self.llm_client.total_cost
+
+    def _optimize_equijoin(
+        self,
+        op_config: Dict[str, Any],
+        left_name: str,
+        right_name: str,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        run_operation: Callable[
+            [Dict[str, Any], List[Dict[str, Any]]], List[Dict[str, Any]]
+        ],
+    ) -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], str, str]:
+        """
+        Optimizes an equijoin operation by analyzing join conditions and potentially inserting
+        map operations to improve join efficiency. Returns the optimized configuration and updated data.
+        """
+        max_iterations = 2
+        new_left_name = left_name
+        new_right_name = right_name
+        new_steps = []
+        for _ in range(max_iterations):
+            join_optimizer = JoinOptimizer(
+                self.runner,
+                op_config,
+                target_recall=self.runner.config.get("optimizer_config", {})
+                .get("equijoin", {})
+                .get("target_recall", 0.95),
+                estimated_selectivity=self.runner.config.get("optimizer_config", {})
+                .get("equijoin", {})
+                .get("estimated_selectivity", None),
+            )
+            optimized_config, cost, agent_results = join_optimizer.optimize_equijoin(
+                left_data, right_data
+            )
+            self.runner.total_cost += cost
+            # Update the operation config with the optimized values
+            op_config.update(optimized_config)
+
+            if not agent_results.get("optimize_map", False):
+                break  # Exit the loop if no more map optimizations are necessary
+
+            # Update the status to indicate we're optimizing a map operation
+            output_key = agent_results["output_key"]
+            if self.runner.status:
+                self.runner.status.update(
+                    f"Optimizing map operation for {output_key} extraction to help with the equijoin"
+                )
+            map_prompt = agent_results["map_prompt"]
+            dataset_to_transform = (
+                left_data
+                if agent_results["dataset_to_transform"] == "left"
+                else right_data
+            )
+
+            # Create a new step for the map operation
+            map_operation = {
+                "name": f"synthesized_{output_key}_extraction",
+                "type": "map",
+                "prompt": map_prompt,
+                "model": self.config.get("default_model", "gpt-4o-mini"),
+                "output": {"schema": {output_key: "string"}},
+                "optimize": False,
+            }
+
+            # Optimize the map operation
+            if map_operation["optimize"]:
+                dataset_to_transform_sample = (
+                    random.sample(dataset_to_transform, self.sample_size_map.get("map"))
+                    if self.config.get("optimizer_config", {}).get(
+                        "random_sample", False
+                    )
+                    else dataset_to_transform[: self.sample_size_map.get("map")]
+                )
+                optimized_map_operations = self._optimize_map(
+                    map_operation, dataset_to_transform_sample
+                )
+            else:
+                optimized_map_operations = [map_operation]
+
+            new_step = {
+                "name": f"synthesized_{output_key}_extraction",
+                "input": (
+                    left_name
+                    if agent_results["dataset_to_transform"] == "left"
+                    else right_name
+                ),
+                "operations": [mo["name"] for mo in optimized_map_operations],
+            }
+            if agent_results["dataset_to_transform"] == "left":
+                new_left_name = new_step["name"]
+            else:
+                new_right_name = new_step["name"]
+
+            new_steps.append((new_step["name"], new_step, optimized_map_operations))
+
+            # Now run the optimized map operation on the entire dataset_to_transform
+            for op in optimized_map_operations:
+                dataset_to_transform = run_operation(op, dataset_to_transform)
+
+            # Update the appropriate dataset for the next iteration
+            if agent_results["dataset_to_transform"] == "left":
+                left_data = dataset_to_transform
+            else:
+                right_data = dataset_to_transform
+
+            if self.runner.status:
+                self.runner.status.update(
+                    f"Optimizing equijoin operation with {output_key} extraction"
+                )
+
+        return op_config, new_steps, new_left_name, new_right_name
+
+    def checkpoint_optimized_ops(self) -> None:
+        """
+        Generates the clean config and saves it to the self.optimized_ops_path
+        This is used to resume optimization from a previous run
+        """
+        clean_config = self.clean_optimized_config()
+        with open(self.optimized_ops_path, "w") as f:
+            yaml.safe_dump(clean_config, f, default_flow_style=False, width=80)
+
+    # Recursively resolve all anchors and aliases
+    @staticmethod
+    def resolve_anchors(data):
+        """
+        Recursively resolve all anchors and aliases in a nested data structure.
+
+        This static method traverses through dictionaries and lists, resolving any YAML anchors and aliases.
+
+        Args:
+            data: The data structure to resolve. Can be a dictionary, list, or any other type.
+
+        Returns:
+            The resolved data structure with all anchors and aliases replaced by their actual values.
+        """
+        if isinstance(data, dict):
+            return {k: Optimizer.resolve_anchors(v) for k, v in data.items()}
+        elif isinstance(data, list):
+            return [Optimizer.resolve_anchors(item) for item in data]
+        else:
+            return data
+
+    def clean_optimized_config(self) -> Dict:
+        """
+        Creates a clean YAML configuration from the optimized operation containers,
+        removing internal fields and organizing operations into proper pipeline steps.
+        """
+        if not self.runner.last_op_container:
+            return self.config
+
+        # Create a clean copy of the config
+        clean_config = {
+            "datasets": self.config.get("datasets", {}),
+            "operations": [],
+            "pipeline": self.runner.config.get(
+                "pipeline", {}
+            ).copy(),  # Copy entire pipeline config
+        }
+
+        # Reset steps to regenerate
+        clean_config["pipeline"]["steps"] = []
+
+        # Keep track of operations we've seen to avoid duplicates
+        seen_operations = set()
+
+        def clean_operation(op_container: OpContainer) -> Dict:
+            """Remove internal fields from operation config"""
+            op_config = op_container.config
+            clean_op = op_config.copy()
+
+            clean_op.pop("_intermediates", None)
+
+            # If op has already been optimized, remove the recursively_optimize and optimize fields
+            if op_container.is_optimized:
+                for field in ["recursively_optimize", "optimize"]:
+                    clean_op.pop(field, None)
+
+            return clean_op
+
+        def process_container(container, current_step=None):
+            """Process an operation container and its dependencies"""
+            # Skip step boundaries
+            if isinstance(container, StepBoundary):
+                if container.children:
+                    return process_container(container.children[0], current_step)
+                return None, None
+
+            # Get step name from container name
+            step_name = container.name.split("/")[0]
+
+            # If this is a new step, create it
+            if not current_step or current_step["name"] != step_name:
+                current_step = {"name": step_name, "operations": []}
+                clean_config["pipeline"]["steps"].insert(0, current_step)
+
+            # Skip scan operations but process their dependencies
+            if container.config["type"] == "scan":
+                if container.children:
+                    return process_container(container.children[0], current_step)
+                return None, current_step
+
+            # Handle equijoin operations
+            if container.is_equijoin:
+                # Add operation to list if not seen
+                if container.name not in seen_operations:
+                    op_config = clean_operation(container)
+                    clean_config["operations"].append(op_config)
+                    seen_operations.add(container.name)
+
+                # Add to step operations with left and right inputs
+                current_step["operations"].insert(
+                    0,
+                    {
+                        container.config["name"]: {
+                            "left": container.kwargs["left_name"],
+                            "right": container.kwargs["right_name"],
+                        }
+                    },
+                )
+
+                # Process both children
+                if container.children:
+                    process_container(container.children[0], current_step)
+                    process_container(container.children[1], current_step)
+            else:
+                # Add operation to list if not seen
+                if container.name not in seen_operations:
+                    op_config = clean_operation(container)
+                    clean_config["operations"].append(op_config)
+                    seen_operations.add(container.name)
+
+                # Add to step operations
+                current_step["operations"].insert(0, container.config["name"])
+
+                # Process children
+                if container.children:
+                    for child in container.children:
+                        process_container(child, current_step)
+
+            return container, current_step
+
+        # Start processing from the last container
+        process_container(self.runner.last_op_container)
+
+        # Add inputs to steps based on their first operation
+        for step in clean_config["pipeline"]["steps"]:
+            first_op = step["operations"][0]
+            if isinstance(first_op, dict):  # This is an equijoin
+                continue  # Equijoin steps don't need an input field
+            elif len(step["operations"]) > 0:
+                # Find the first non-scan operation's input by looking at its dependencies
+                op_container = self.runner.op_container_map.get(
+                    f"{step['name']}/{first_op}"
+                )
+                if op_container and op_container.children:
+                    child = op_container.children[0]
+                    while (
+                        child
+                        and child.config["type"] == "step_boundary"
+                        and child.children
+                    ):
+                        child = child.children[0]
+                    if child and child.config["type"] == "scan":
+                        step["input"] = child.config["dataset_name"]
+
+        # Preserve all other config key-value pairs from original config
+        for key, value in self.config.items():
+            if key not in ["datasets", "operations", "pipeline"]:
+                clean_config[key] = value
+
+        return clean_config
+
+    def save_optimized_config(self, optimized_config_path: str):
+        """
+        Saves the optimized configuration to a YAML file after resolving all references
+        and cleaning up internal optimization artifacts.
+        """
+        resolved_config = self.clean_optimized_config()
+
+        with open(optimized_config_path, "w") as f:
+            yaml.safe_dump(resolved_config, f, default_flow_style=False, width=80)
+            self.console.log(
+                f"[green italic]💾 Optimized config saved to {optimized_config_path}[/green italic]"
+            )
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(runner, model='gpt-4o', resume=False, timeout=60) + +

+ + +
+ +

Initialize the optimizer with a runner instance and configuration. +Sets up optimization parameters, caching, and cost tracking.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ runner + + DSLRunner + +
+

The DSL runner instance whose configuration, console, and thread settings the optimizer reuses.

+
+
+ required +
+ model + + str + +
+

The name of the language model to use. Defaults to "gpt-4o".

+
+
+ 'gpt-4o' +
+ resume + + bool + +
+

Whether to resume optimization from a previous run. Defaults to False.

+
+
+ False +
+ timeout + + int + +
+

Timeout in seconds for operations. Defaults to 60.

+
+
+ 60 +
+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
config + Dict + +
+

Stores the loaded configuration from the YAML file.

+
+
console + Console + +
+

Rich console for formatted output.

+
+
max_threads + int + +
+

Maximum number of threads for parallel processing.

+
+
base_name + str + +
+

Base name used for file paths.

+
+
yaml_file_suffix + str + +
+

Suffix for YAML configuration files.

+
+
runner + DSLRunner + +
+

The DSL runner instance.

+
+
status + DSLRunner + +
+

Status tracking for the runner.

+
+
optimized_config + Dict + +
+

A copy of the original config to be optimized.

+
+
llm_client + LLMClient + +
+

Client for interacting with the language model.

+
+
timeout + int + +
+

Timeout for operations in seconds.

+
+
resume + bool + +
+

Whether to resume from previous optimization.

+
+
captured_output + CapturedOutput + +
+

Captures output during optimization.

+
+
sample_cache + Dict + +
+

Maps operation names to tuples of (output_data, sample_size).

+
+
optimized_ops_path + str + +
+

Path to store optimized operations.

+
+
sample_size_map + Dict + +
+

Maps operation types to sample sizes.

+
+
+

The method also calls print_optimizer_config() to display the initial configuration.

+ +
+ Source code in docetl/optimizer.py +
 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
def __init__(
+    self,
+    runner: "DSLRunner",
+    model: str = "gpt-4o",
+    resume: bool = False,
+    timeout: int = 60,
+):
+    """
+    Initialize the optimizer from an existing runner instance.
+
+    Reuses the runner's configuration, console, thread limit, naming fields
+    and status object, creates the LLM client, and derives the path of the
+    checkpoint file used when resuming an optimization.
+
+    Args:
+        runner (DSLRunner): Runner whose config, console, max_threads, base_name,
+            yaml_file_suffix and status are reused by the optimizer.
+        model (str): The name of the language model to use. Defaults to "gpt-4o".
+        resume (bool): Whether to resume optimization from a previous run. Defaults to False.
+        timeout (int): Timeout in seconds for operations. Defaults to 60.
+
+    Attributes:
+        config (Dict): The runner's configuration (same object, not a copy).
+        console (Console): Rich console for formatted output.
+        max_threads (int): Maximum number of threads for parallel processing.
+        base_name (str): Base name used for file paths.
+        yaml_file_suffix (str): Suffix for YAML configuration files.
+        runner (DSLRunner): The DSL runner instance.
+        status: Status object taken from the runner.
+        optimized_config (Dict): A deep copy of the original config to be optimized.
+        llm_client (LLMClient): Client for interacting with the language model.
+        timeout (int): Timeout for operations in seconds.
+        resume (bool): Whether to resume from previous optimization.
+        captured_output (CapturedOutput): Captures output during optimization.
+        sample_cache (Dict): Maps operation names to tuples of (output_data, sample_size).
+        optimized_ops_path (str): Path of the checkpoint file for optimized operations.
+        sample_size_map (Dict): Maps operation types to sample sizes.
+
+    Unless the runner's ``_from_df_accessors`` flag is set, the method also
+    calls print_optimizer_config() to display the initial configuration.
+    """
+    self.config = runner.config
+    self.console = runner.console
+    self.max_threads = runner.max_threads
+
+    self.base_name = runner.base_name
+    self.yaml_file_suffix = runner.yaml_file_suffix
+    self.runner = runner
+    self.status = runner.status
+
+    # Work on an independent deep copy; self.config itself stays shared with the runner
+    self.optimized_config = copy.deepcopy(self.config)
+    self.llm_client = LLMClient(model)
+    self.timeout = timeout
+    self.resume = resume
+    self.captured_output = CapturedOutput()
+
+    # Add sample cache for build operations
+    self.sample_cache = {}  # Maps operation names to (output_data, sample_size)
+
+    # Checkpoints live under <DOCETL_HOME_DIR or ~>/.docetl/cache/<yaml_file_suffix>
+    home_dir = os.environ.get("DOCETL_HOME_DIR", os.path.expanduser("~"))
+    cache_dir = os.path.join(home_dir, f".docetl/cache/{runner.yaml_file_suffix}")
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Hash the config to create a unique identifier; the checkpoint file is
+    # named after the SHA-256 of str(config)
+    config_hash = hashlib.sha256(str(self.config).encode()).hexdigest()
+    self.optimized_ops_path = f"{cache_dir}/{config_hash}.yaml"
+
+    # Update sample size map
+    # NOTE(review): this binds SAMPLE_SIZE_MAP itself (no copy), so the
+    # update() below also changes that shared dict for every other user of
+    # it — confirm this is intended before changing it to a copy.
+    self.sample_size_map = SAMPLE_SIZE_MAP
+    if self.config.get("optimizer_config", {}).get("sample_sizes", {}):
+        self.sample_size_map.update(self.config["optimizer_config"]["sample_sizes"])
+
+    # The banner is skipped when the runner sets _from_df_accessors
+    # (presumably runners created through the dataframe accessors — TODO confirm)
+    if not self.runner._from_df_accessors:
+        self.print_optimizer_config()
+
+
+
+ +
+ +
+ + +

+ checkpoint_optimized_ops() + +

+ + +
+ +

Generates the clean config and saves it to self.optimized_ops_path. +This checkpoint is used to resume optimization from a previous run.

+ +
+ Source code in docetl/optimizer.py +
552
+553
+554
+555
+556
+557
+558
+559
def checkpoint_optimized_ops(self) -> None:
+    """
+    Generates the clean config and saves it to the self.optimized_ops_path
+    This is used to resume optimization from a previous run
+    """
+    clean_config = self.clean_optimized_config()
+    with open(self.optimized_ops_path, "w") as f:
+        yaml.safe_dump(clean_config, f, default_flow_style=False, width=80)
+
+
+
+ +
+ +
+ + +

+ clean_optimized_config() + +

+ + +
+ +

Creates a clean YAML configuration from the optimized operation containers, +removing internal fields and organizing operations into proper pipeline steps.

+ +
+ Source code in docetl/optimizer.py +
582
+583
+584
+585
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
+600
+601
+602
+603
+604
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
+619
+620
+621
+622
+623
+624
+625
+626
+627
+628
+629
+630
+631
+632
+633
+634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648
+649
+650
+651
+652
+653
+654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666
+667
+668
+669
+670
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
+694
+695
+696
+697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
+709
+710
def clean_optimized_config(self) -> Dict:
+    """
+    Creates a clean YAML configuration from the optimized operation containers,
+    removing internal fields and organizing operations into proper pipeline steps.
+    """
+    if not self.runner.last_op_container:
+        return self.config
+
+    # Create a clean copy of the config
+    clean_config = {
+        "datasets": self.config.get("datasets", {}),
+        "operations": [],
+        "pipeline": self.runner.config.get(
+            "pipeline", {}
+        ).copy(),  # Copy entire pipeline config
+    }
+
+    # Reset steps to regenerate
+    clean_config["pipeline"]["steps"] = []
+
+    # Keep track of operations we've seen to avoid duplicates
+    seen_operations = set()
+
+    def clean_operation(op_container: OpContainer) -> Dict:
+        """Remove internal fields from operation config"""
+        op_config = op_container.config
+        clean_op = op_config.copy()
+
+        clean_op.pop("_intermediates", None)
+
+        # If op has already been optimized, remove the recursively_optimize and optimize fields
+        if op_container.is_optimized:
+            for field in ["recursively_optimize", "optimize"]:
+                clean_op.pop(field, None)
+
+        return clean_op
+
+    def process_container(container, current_step=None):
+        """Process an operation container and its dependencies"""
+        # Skip step boundaries
+        if isinstance(container, StepBoundary):
+            if container.children:
+                return process_container(container.children[0], current_step)
+            return None, None
+
+        # Get step name from container name
+        step_name = container.name.split("/")[0]
+
+        # If this is a new step, create it
+        if not current_step or current_step["name"] != step_name:
+            current_step = {"name": step_name, "operations": []}
+            clean_config["pipeline"]["steps"].insert(0, current_step)
+
+        # Skip scan operations but process their dependencies
+        if container.config["type"] == "scan":
+            if container.children:
+                return process_container(container.children[0], current_step)
+            return None, current_step
+
+        # Handle equijoin operations
+        if container.is_equijoin:
+            # Add operation to list if not seen
+            if container.name not in seen_operations:
+                op_config = clean_operation(container)
+                clean_config["operations"].append(op_config)
+                seen_operations.add(container.name)
+
+            # Add to step operations with left and right inputs
+            current_step["operations"].insert(
+                0,
+                {
+                    container.config["name"]: {
+                        "left": container.kwargs["left_name"],
+                        "right": container.kwargs["right_name"],
+                    }
+                },
+            )
+
+            # Process both children
+            if container.children:
+                process_container(container.children[0], current_step)
+                process_container(container.children[1], current_step)
+        else:
+            # Add operation to list if not seen
+            if container.name not in seen_operations:
+                op_config = clean_operation(container)
+                clean_config["operations"].append(op_config)
+                seen_operations.add(container.name)
+
+            # Add to step operations
+            current_step["operations"].insert(0, container.config["name"])
+
+            # Process children
+            if container.children:
+                for child in container.children:
+                    process_container(child, current_step)
+
+        return container, current_step
+
+    # Start processing from the last container
+    process_container(self.runner.last_op_container)
+
+    # Add inputs to steps based on their first operation
+    for step in clean_config["pipeline"]["steps"]:
+        first_op = step["operations"][0]
+        if isinstance(first_op, dict):  # This is an equijoin
+            continue  # Equijoin steps don't need an input field
+        elif len(step["operations"]) > 0:
+            # Find the first non-scan operation's input by looking at its dependencies
+            op_container = self.runner.op_container_map.get(
+                f"{step['name']}/{first_op}"
+            )
+            if op_container and op_container.children:
+                child = op_container.children[0]
+                while (
+                    child
+                    and child.config["type"] == "step_boundary"
+                    and child.children
+                ):
+                    child = child.children[0]
+                if child and child.config["type"] == "scan":
+                    step["input"] = child.config["dataset_name"]
+
+    # Preserve all other config key-value pairs from original config
+    for key, value in self.config.items():
+        if key not in ["datasets", "operations", "pipeline"]:
+            clean_config[key] = value
+
+    return clean_config
+
+
+
+ +
+ +
+ + +

+ optimize() + +

+ + +
+ +

Optimizes the entire pipeline by walking the operation DAG and applying +operation-specific optimizers where marked. Returns the total optimization cost.

+ +
+ Source code in docetl/optimizer.py +
405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
def optimize(self) -> float:
+    """
+    Optimizes the entire pipeline by walking the operation DAG and applying
+    operation-specific optimizers where marked. Returns the total optimization cost.
+    """
+    self.console.rule("[bold cyan]Beginning Pipeline Optimization[/bold cyan]")
+
+    # If self.resume is True and there's a checkpoint, load it
+    if self.resume:
+        if os.path.exists(self.optimized_ops_path):
+            # Load the yaml and change the runner with it
+            with open(self.optimized_ops_path, "r") as f:
+                partial_optimized_config = yaml.safe_load(f)
+                self.console.log(
+                    "[yellow]Loading partially optimized pipeline from checkpoint...[/yellow]"
+                )
+                self.runner._build_operation_graph(partial_optimized_config)
+        else:
+            self.console.log(
+                "[yellow]No checkpoint found, starting optimization from scratch...[/yellow]"
+            )
+
+    else:
+        self._insert_empty_resolve_operations()
+
+    # Start with the last operation container and visit each child
+    self.runner.last_op_container.optimize()
+
+    flush_cache(self.console)
+
+    # Print the query plan
+    self.console.rule("[bold cyan]Optimized Query Plan[/bold cyan]")
+    self.runner.print_query_plan()
+
+    return self.llm_client.total_cost
+
+
+
+ +
+ +
+ + +

+ print_optimizer_config() + +

+ + +
+ +

Print the current configuration of the optimizer.

+

This method uses the Rich console to display a formatted output of the optimizer's +configuration. It includes the sample sizes for different operation types, +the maximum number of threads, the language model being used, +and the timeout setting.

+

The output is color-coded and formatted for easy readability, with a header and +separator lines to clearly delineate the configuration information.

+ +
+ Source code in docetl/optimizer.py +
125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
def print_optimizer_config(self):
+    """
+    Print the current configuration of the optimizer.
+
+    This method uses the Rich console to display a formatted output of the optimizer's
+    configuration. It includes details such as the YAML file path, sample sizes for
+    different operation types, maximum number of threads, the language model being used,
+    and the timeout setting.
+
+    The output is color-coded and formatted for easy readability, with a header and
+    separator lines to clearly delineate the configuration information.
+    """
+    self.console.log(
+        Panel.fit(
+            "[bold cyan]Optimizer Configuration[/bold cyan]\n"
+            f"[yellow]Sample Size:[/yellow] {self.sample_size_map}\n"
+            f"[yellow]Max Threads:[/yellow] {self.max_threads}\n"
+            f"[yellow]Model:[/yellow] {self.llm_client.model}\n"
+            f"[yellow]Timeout:[/yellow] {self.timeout} seconds",
+            title="Optimizer Configuration",
+        )
+    )
+
+
+
+ +
+ +
+ + +

+ resolve_anchors(data) + + + staticmethod + + +

+ + +
+ +

Recursively resolve all anchors and aliases in a nested data structure.

+

This static method traverses through dictionaries and lists, resolving any YAML anchors and aliases.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ data + + +
+

The data structure to resolve. Can be a dictionary, list, or any other type.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ +
+

The resolved data structure with all anchors and aliases replaced by their actual values.

+
+
+ +
+ Source code in docetl/optimizer.py +
562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
@staticmethod
+def resolve_anchors(data):
+    """
+    Recursively resolve all anchors and aliases in a nested data structure.
+
+    This static method traverses through dictionaries and lists, resolving any YAML anchors and aliases.
+
+    Args:
+        data: The data structure to resolve. Can be a dictionary, list, or any other type.
+
+    Returns:
+        The resolved data structure with all anchors and aliases replaced by their actual values.
+    """
+    if isinstance(data, dict):
+        return {k: Optimizer.resolve_anchors(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [Optimizer.resolve_anchors(item) for item in data]
+    else:
+        return data
+
+
+
+ +
+ +
+ + +

+ save_optimized_config(optimized_config_path) + +

+ + +
+ +

Saves the optimized configuration to a YAML file after resolving all references +and cleaning up internal optimization artifacts.

+ +
+ Source code in docetl/optimizer.py +
712
+713
+714
+715
+716
+717
+718
+719
+720
+721
+722
+723
def save_optimized_config(self, optimized_config_path: str):
+    """
+    Saves the optimized configuration to a YAML file after resolving all references
+    and cleaning up internal optimization artifacts.
+    """
+    resolved_config = self.clean_optimized_config()
+
+    with open(optimized_config_path, "w") as f:
+        yaml.safe_dump(resolved_config, f, default_flow_style=False, width=80)
+        self.console.log(
+            f"[green italic]💾 Optimized config saved to {optimized_config_path}[/green italic]"
+        )
+
+
+
+ +
+ +
+ + +

+ should_optimize(step_name, op_name) + +

+ + +
+ +

Analyzes whether an operation should be optimized by running it on a sample of input data +and evaluating potential optimizations. Returns the optimization suggestion and relevant data.

+ +
+ Source code in docetl/optimizer.py +
333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
def should_optimize(
+    self, step_name: str, op_name: str
+) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]], float]:
+    """
+    Analyzes whether an operation should be optimized by running it on a sample of input data
+    and evaluating potential optimizations. Returns the optimization suggestion and relevant data.
+    """
+    self.console.rule("[bold cyan]Beginning Pipeline Optimization[/bold cyan]")
+
+    self._insert_empty_resolve_operations()
+
+    node_of_interest = self.runner.op_container_map[f"{step_name}/{op_name}"]
+
+    # Run the node_of_interest's children
+    input_data = []
+    for child in node_of_interest.children:
+        input_data.append(
+            child.next(
+                is_build=True,
+                sample_size_needed=SAMPLE_SIZE_MAP.get(child.config["type"]),
+            )[0]
+        )
+
+    # Set the step
+    self.captured_output.set_step(step_name)
+
+    # Determine whether we should optimize the node_of_interest
+    if (
+        node_of_interest.config.get("type") == "map"
+        or node_of_interest.config.get("type") == "filter"
+    ):
+        # Create instance of map optimizer
+        map_optimizer = MapOptimizer(
+            self.runner,
+            self.runner._run_operation,
+            is_filter=node_of_interest.config.get("type") == "filter",
+        )
+        should_optimize_output, input_data, output_data = (
+            map_optimizer.should_optimize(node_of_interest.config, input_data[0])
+        )
+    elif node_of_interest.config.get("type") == "reduce":
+        reduce_optimizer = ReduceOptimizer(
+            self.runner,
+            self.runner._run_operation,
+        )
+        should_optimize_output, input_data, output_data = (
+            reduce_optimizer.should_optimize(node_of_interest.config, input_data[0])
+        )
+    elif node_of_interest.config.get("type") == "resolve":
+        resolve_optimizer = JoinOptimizer(
+            self.runner,
+            node_of_interest.config,
+            target_recall=self.config.get("optimizer_config", {})
+            .get("resolve", {})
+            .get("target_recall", 0.95),
+        )
+        _, should_optimize_output = resolve_optimizer.should_optimize(input_data[0])
+
+        # if should_optimize_output is empty, then we should move to the reduce operation
+        if should_optimize_output == "":
+            return "", [], [], 0.0
+    else:
+        return "", [], [], 0.0
+
+    # Return the string and operation cost
+    return (
+        should_optimize_output,
+        input_data,
+        output_data,
+        self.runner.total_cost + self.llm_client.total_cost,
+    )
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/api-reference/operations/index.html b/api-reference/operations/index.html new file mode 100644 index 00000000..0a414086 --- /dev/null +++ b/api-reference/operations/index.html @@ -0,0 +1,17222 @@ + + + + + + + + + + + + + + + + + + + + + + + Operations - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+ +
+
+ + + +
+
+ + + + + + + +

LLM-Powered Operators

+ + +
+ + + +

+ docetl.operations.map.MapOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + + + + + + +
+ Source code in docetl/operations/map.py +
 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
class MapOperation(BaseOperation):
+    class schema(BaseOperation.schema):
+        type: str = "map"
+        output: Optional[Dict[str, Any]] = None
+        prompt: Optional[str] = None
+        model: Optional[str] = None
+        optimize: Optional[bool] = None
+        recursively_optimize: Optional[bool] = None
+        sample_size: Optional[int] = None
+        tools: Optional[List[Dict[str, Any]]] = (
+            None  # FIXME: Why isn't this using the Tool data class so validation works automatically?
+        )
+        validation_rules: Optional[List[str]] = Field(None, alias="validate")
+        num_retries_on_validate_failure: Optional[int] = None
+        gleaning: Optional[Dict[str, Any]] = None
+        drop_keys: Optional[List[str]] = None
+        timeout: Optional[int] = None
+        enable_observability: bool = False
+        batch_size: Optional[int] = None
+        clustering_method: Optional[str] = None
+        batch_prompt: Optional[str] = None
+        litellm_completion_kwargs: Dict[str, Any] = {}
+
+        @field_validator("drop_keys")
+        def validate_drop_keys(cls, v):
+            if isinstance(v, str):
+                return [v]
+            return v
+
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.max_batch_size: int = self.config.get(
+            "max_batch_size", kwargs.get("max_batch_size", None)
+        )
+        self.clustering_method = "random"
+
+    def syntax_check(self) -> None:
+        """
+            Checks the configuration of the MapOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing or invalid in the configuration.
+            TypeError: If configuration values have incorrect types.
+        """
+        config = self.schema(**self.config)
+
+        if config.drop_keys:
+            if any(not isinstance(key, str) for key in config.drop_keys):
+                raise TypeError("All items in 'drop_keys' must be strings")
+        elif not (config.prompt and config.output):
+            raise ValueError(
+                "If 'drop_keys' is not specified, both 'prompt' and 'output' must be present in the configuration"
+            )
+
+        if config.batch_prompt:
+            try:
+                template = Template(config.batch_prompt)
+                # Test render with a minimal inputs list to validate template
+                template.render(inputs=[{}])
+            except Exception as e:
+                raise ValueError(
+                    f"Invalid Jinja2 template in 'batch_prompt' or missing required 'inputs' variable: {str(e)}"
+                ) from e
+
+        if config.prompt or config.output:
+            for key in ["prompt", "output"]:
+                if not getattr(config, key):
+                    raise ValueError(
+                        f"Missing required key '{key}' in MapOperation configuration"
+                    )
+
+            if config.output and not config.output["schema"]:
+                raise ValueError("Missing 'schema' in 'output' configuration")
+
+            if config.prompt:
+                try:
+                    Template(config.prompt)
+                except Exception as e:
+                    raise ValueError(
+                        f"Invalid Jinja2 template in 'prompt': {str(e)}"
+                    ) from e
+
+            if config.model and not isinstance(config.model, str):
+                raise TypeError("'model' in configuration must be a string")
+
+            if config.tools:
+                for tool in config.tools:
+                    try:
+                        tool_obj = Tool(**tool)
+                    except Exception:
+                        raise TypeError("Tool must be a dictionary")
+
+                    if not (tool_obj.code and tool_obj.function):
+                        raise ValueError(
+                            "Tool is missing required 'code' or 'function' key"
+                        )
+
+                    if not isinstance(tool_obj.function, ToolFunction):
+                        raise TypeError("'function' in tool must be a dictionary")
+
+                    for key in ["name", "description", "parameters"]:
+                        if not getattr(tool_obj.function, key):
+                            raise ValueError(
+                                f"Tool is missing required '{key}' in 'function'"
+                            )
+
+            self.gleaning_check()
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Executes the map operation on the provided input data.
+
+        Args:
+            input_data (List[Dict]): The input data to process.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+
+        This method performs the following steps:
+        1. If a prompt is specified, it processes each input item using the specified prompt and LLM model
+        2. Applies gleaning if configured
+        3. Validates the output
+        4. If drop_keys is specified, it drops the specified keys from each document
+        5. Aggregates results and calculates total cost
+
+        The method uses parallel processing to improve performance.
+        """
+        # Check if there's no prompt and only drop_keys
+        if "prompt" not in self.config and "drop_keys" in self.config:
+            # If only drop_keys is specified, simply drop the keys and return
+            dropped_results = []
+            for item in input_data:
+                new_item = {
+                    k: v for k, v in item.items() if k not in self.config["drop_keys"]
+                }
+                dropped_results.append(new_item)
+            return dropped_results, 0.0  # Return the modified data with no cost
+
+        if self.status:
+            self.status.stop()
+
+        def _process_map_item(
+            item: Dict, initial_result: Optional[Dict] = None
+        ) -> Tuple[Optional[Dict], float]:
+
+            prompt = strict_render(self.config["prompt"], {"input": item})
+
+            def validation_fn(response: Union[Dict[str, Any], ModelResponse]):
+                output = (
+                    self.runner.api.parse_llm_response(
+                        response,
+                        schema=self.config["output"]["schema"],
+                        tools=self.config.get("tools", None),
+                        manually_fix_errors=self.manually_fix_errors,
+                    )[0]
+                    if isinstance(response, ModelResponse)
+                    else response
+                )
+
+                # Check that the output has all the keys in the schema
+                for key in self.config["output"]["schema"]:
+                    if key not in output:
+                        return output, False
+
+                for key, value in item.items():
+                    if key not in self.config["output"]["schema"]:
+                        output[key] = value
+                if self.runner.api.validate_output(self.config, output, self.console):
+                    return output, True
+                return output, False
+
+            self.runner.rate_limiter.try_acquire("call", weight=1)
+            llm_result = self.runner.api.call_llm(
+                self.config.get("model", self.default_model),
+                "map",
+                [{"role": "user", "content": prompt}],
+                self.config["output"]["schema"],
+                tools=self.config.get("tools", None),
+                scratchpad=None,
+                timeout_seconds=self.config.get("timeout", 120),
+                max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
+                validation_config=(
+                    {
+                        "num_retries": self.num_retries_on_validate_failure,
+                        "val_rule": self.config.get("validate", []),
+                        "validation_fn": validation_fn,
+                    }
+                    if self.config.get("validate", None)
+                    else None
+                ),
+                gleaning_config=self.config.get("gleaning", None),
+                verbose=self.config.get("verbose", False),
+                bypass_cache=self.config.get("bypass_cache", False),
+                initial_result=initial_result,
+                litellm_completion_kwargs=self.config.get(
+                    "litellm_completion_kwargs", {}
+                ),
+            )
+
+            if llm_result.validated:
+                # Parse the response
+                if isinstance(llm_result.response, ModelResponse):
+                    output = self.runner.api.parse_llm_response(
+                        llm_result.response,
+                        schema=self.config["output"]["schema"],
+                        tools=self.config.get("tools", None),
+                        manually_fix_errors=self.manually_fix_errors,
+                    )[0]
+                else:
+                    output = llm_result.response
+
+                # Augment the output with the original item
+                output = {**item, **output}
+                if self.config.get("enable_observability", False):
+                    output[f"_observability_{self.config['name']}"] = {"prompt": prompt}
+                return output, llm_result.total_cost
+
+            return None, llm_result.total_cost
+
+        # If there's a batch prompt, let's use that
+        def _process_map_batch(items: List[Dict]) -> Tuple[List[Dict], float]:
+            total_cost = 0
+            if len(items) > 1 and self.config.get("batch_prompt", None):
+                batch_prompt = strict_render(
+                    self.config["batch_prompt"], {"inputs": items}
+                )
+
+                # Issue the batch call
+                llm_result = self.runner.api.call_llm_batch(
+                    self.config.get("model", self.default_model),
+                    "batch map",
+                    [{"role": "user", "content": batch_prompt}],
+                    self.config["output"]["schema"],
+                    verbose=self.config.get("verbose", False),
+                    timeout_seconds=self.config.get("timeout", 120),
+                    max_retries_per_timeout=self.config.get(
+                        "max_retries_per_timeout", 2
+                    ),
+                    bypass_cache=self.config.get("bypass_cache", False),
+                    litellm_completion_kwargs=self.config.get(
+                        "litellm_completion_kwargs", {}
+                    ),
+                )
+                total_cost += llm_result.total_cost
+
+                # Parse the LLM response
+                parsed_output = self.runner.api.parse_llm_response(
+                    llm_result.response, self.config["output"]["schema"]
+                )[0].get("results", [])
+                items_and_outputs = [
+                    (item, parsed_output[idx] if idx < len(parsed_output) else None)
+                    for idx, item in enumerate(items)
+                ]
+            else:
+                items_and_outputs = [(item, None) for item in items]
+
+            # Run _process_map_item for each item
+            all_results = []
+            if len(items_and_outputs) > 1:
+                with ThreadPoolExecutor(max_workers=self.max_batch_size) as executor:
+                    futures = [
+                        executor.submit(
+                            _process_map_item,
+                            items_and_outputs[i][0],
+                            items_and_outputs[i][1],
+                        )
+                        for i in range(len(items_and_outputs))
+                    ]
+                    for i in range(len(futures)):
+                        try:
+                            result, item_cost = futures[i].result()
+                            if result is not None:
+                                all_results.append(result)
+                            total_cost += item_cost
+                        except Exception as e:
+                            if self.config.get("skip_on_error", False):
+                                self.console.log(
+                                    f"[bold red]Error in map operation {self.config['name']}, skipping item:[/bold red] {e}"
+                                )
+                                continue
+                            else:
+                                raise e
+            else:
+                try:
+                    result, item_cost = _process_map_item(
+                        items_and_outputs[0][0], items_and_outputs[0][1]
+                    )
+                    if result is not None:
+                        all_results.append(result)
+                    total_cost += item_cost
+                except Exception as e:
+                    if self.config.get("skip_on_error", False):
+                        self.console.log(
+                            f"[bold red]Error in map operation {self.config['name']}, skipping item:[/bold red] {e}"
+                        )
+                    else:
+                        raise e
+
+            # Return items and cost
+            return all_results, total_cost
+
+        with ThreadPoolExecutor(max_workers=self.max_batch_size) as executor:
+            batch_size = self.max_batch_size if self.max_batch_size is not None else 1
+            futures = []
+            for i in range(0, len(input_data), batch_size):
+                batch = input_data[i : i + batch_size]
+                futures.append(executor.submit(_process_map_batch, batch))
+            results = []
+            total_cost = 0
+            pbar = RichLoopBar(
+                range(len(futures)),
+                desc=f"Processing {self.config['name']} (map) on all documents",
+                console=self.console,
+            )
+            for i in pbar:
+                result_list, item_cost = futures[i].result()
+                if result_list:
+                    if "drop_keys" in self.config:
+                        result_list = [
+                            {
+                                k: v
+                                for k, v in result.items()
+                                if k not in self.config["drop_keys"]
+                            }
+                            for result in result_list
+                        ]
+                    results.extend(result_list)
+                total_cost += item_cost
+
+        if self.status:
+            self.status.start()
+
+        return results, total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data) + +

+ + +
+ +

Executes the map operation on the provided input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ input_data + + List[Dict] + +
+

The input data to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.

+
+
+

This method performs the following steps: +1. If a prompt is specified, it processes each input item using the specified prompt and LLM model +2. Applies gleaning if configured +3. Validates the output +4. If drop_keys is specified, it drops the specified keys from each document +5. Aggregates results and calculates total cost

+

The method uses parallel processing to improve performance.

+ +
+ Source code in docetl/operations/map.py +
130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+    """
+    Executes the map operation on the provided input data.
+
+    Args:
+        input_data (List[Dict]): The input data to process.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the processed results (in input order,
+        without the items that were dropped or skipped) and the total cost of the operation.
+
+    This method performs the following steps:
+    1. If a prompt is specified, it processes each input item using the specified prompt and LLM model
+    2. Applies gleaning if configured
+    3. Validates the output
+    4. If drop_keys is specified, it drops the specified keys from each document
+    5. Aggregates results and calculates total cost
+
+    The method uses parallel processing to improve performance: the input is split into
+    chunks of ``self.max_batch_size`` items (one item per chunk when it is None) and each
+    chunk is submitted to a thread pool.
+
+    Notes:
+        - If the config has no "prompt" but has "drop_keys", the keys are dropped and the
+          result is returned with a cost of 0.0; no LLM call is made.
+        - When a chunk holds more than one item and "batch_prompt" is configured, one batch
+          LLM call is issued first and its per-item outputs are passed to the per-item call
+          as ``initial_result``.
+        - Items whose LLM result is not validated are left out of the results; their cost
+          is still added to the total.
+        - With "skip_on_error" set, an exception raised while processing a single item is
+          logged and the item is skipped; otherwise it is re-raised. Exceptions raised by
+          the batch call itself are not caught here.
+        - ``self.status``, when set, is stopped before processing and started again after
+          all chunks have finished.
+    """
+    # Shortcut: with no prompt and only drop_keys configured, no LLM call is needed
+    if "prompt" not in self.config and "drop_keys" in self.config:
+        # Copy each item without the configured keys and return immediately
+        dropped_results = []
+        for item in input_data:
+            new_item = {
+                k: v for k, v in item.items() if k not in self.config["drop_keys"]
+            }
+            dropped_results.append(new_item)
+        return dropped_results, 0.0  # no LLM call was made, so the cost is zero
+
+    if self.status:
+        self.status.stop()  # started again at the end of this method
+
+    def _process_map_item(
+        item: Dict, initial_result: Optional[Dict] = None
+    ) -> Tuple[Optional[Dict], float]:
+
+        prompt = strict_render(self.config["prompt"], {"input": item})  # the item is exposed to the template as `input`
+
+        def validation_fn(response: Union[Dict[str, Any], ModelResponse]):
+            output = (
+                self.runner.api.parse_llm_response(
+                    response,
+                    schema=self.config["output"]["schema"],
+                    tools=self.config.get("tools", None),
+                    manually_fix_errors=self.manually_fix_errors,
+                )[0]
+                if isinstance(response, ModelResponse)
+                else response
+            )
+
+            # Fail validation as soon as any key of the output schema is missing
+            for key in self.config["output"]["schema"]:
+                if key not in output:
+                    return output, False
+
+            for key, value in item.items():  # copy input fields that are not schema keys into the output before validating
+                if key not in self.config["output"]["schema"]:
+                    output[key] = value
+            if self.runner.api.validate_output(self.config, output, self.console):
+                return output, True
+            return output, False
+
+        self.runner.rate_limiter.try_acquire("call", weight=1)  # one "call" unit per per-item request
+        llm_result = self.runner.api.call_llm(
+            self.config.get("model", self.default_model),
+            "map",
+            [{"role": "user", "content": prompt}],
+            self.config["output"]["schema"],
+            tools=self.config.get("tools", None),
+            scratchpad=None,
+            timeout_seconds=self.config.get("timeout", 120),
+            max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
+            validation_config=(
+                {
+                    "num_retries": self.num_retries_on_validate_failure,
+                    "val_rule": self.config.get("validate", []),
+                    "validation_fn": validation_fn,
+                }
+                if self.config.get("validate", None)  # validation is configured only when `validate` is set and non-empty
+                else None
+            ),
+            gleaning_config=self.config.get("gleaning", None),
+            verbose=self.config.get("verbose", False),
+            bypass_cache=self.config.get("bypass_cache", False),
+            initial_result=initial_result,  # per-item output of the batch call, or None
+            litellm_completion_kwargs=self.config.get(
+                "litellm_completion_kwargs", {}
+            ),
+        )
+
+        if llm_result.validated:
+            # A ModelResponse still has to be parsed; anything else is used as the output directly
+            if isinstance(llm_result.response, ModelResponse):
+                output = self.runner.api.parse_llm_response(
+                    llm_result.response,
+                    schema=self.config["output"]["schema"],
+                    tools=self.config.get("tools", None),
+                    manually_fix_errors=self.manually_fix_errors,
+                )[0]
+            else:
+                output = llm_result.response
+
+            # Merge the original item with the output; output values win on key collisions
+            output = {**item, **output}
+            if self.config.get("enable_observability", False):
+                output[f"_observability_{self.config['name']}"] = {"prompt": prompt}
+            return output, llm_result.total_cost
+
+        return None, llm_result.total_cost  # not validated: no result, but the cost is still reported
+
+    # Process one chunk of items: optional batch call first, then one per-item call each
+    def _process_map_batch(items: List[Dict]) -> Tuple[List[Dict], float]:
+        total_cost = 0
+        if len(items) > 1 and self.config.get("batch_prompt", None):  # batch call needs 2+ items and a configured batch_prompt
+            batch_prompt = strict_render(
+                self.config["batch_prompt"], {"inputs": items}
+            )
+
+            # Issue a single LLM call covering every item of the chunk
+            llm_result = self.runner.api.call_llm_batch(
+                self.config.get("model", self.default_model),
+                "batch map",
+                [{"role": "user", "content": batch_prompt}],
+                self.config["output"]["schema"],
+                verbose=self.config.get("verbose", False),
+                timeout_seconds=self.config.get("timeout", 120),
+                max_retries_per_timeout=self.config.get(
+                    "max_retries_per_timeout", 2
+                ),
+                bypass_cache=self.config.get("bypass_cache", False),
+                litellm_completion_kwargs=self.config.get(
+                    "litellm_completion_kwargs", {}
+                ),
+            )
+            total_cost += llm_result.total_cost
+
+            # Parse the batch response; per-item outputs are read from its "results" key
+            parsed_output = self.runner.api.parse_llm_response(
+                llm_result.response, self.config["output"]["schema"]
+            )[0].get("results", [])
+            items_and_outputs = [
+                (item, parsed_output[idx] if idx < len(parsed_output) else None)  # paired by position; None when fewer outputs than items came back
+                for idx, item in enumerate(items)
+            ]
+        else:
+            items_and_outputs = [(item, None) for item in items]
+
+        # Run _process_map_item for each (item, initial output) pair
+        all_results = []
+        if len(items_and_outputs) > 1:
+            with ThreadPoolExecutor(max_workers=self.max_batch_size) as executor:
+                futures = [
+                    executor.submit(
+                        _process_map_item,
+                        items_and_outputs[i][0],
+                        items_and_outputs[i][1],
+                    )
+                    for i in range(len(items_and_outputs))
+                ]
+                for i in range(len(futures)):  # collected in submission order, so item order is kept
+                    try:
+                        result, item_cost = futures[i].result()
+                        if result is not None:
+                            all_results.append(result)
+                        total_cost += item_cost
+                    except Exception as e:
+                        if self.config.get("skip_on_error", False):
+                            self.console.log(
+                                f"[bold red]Error in map operation {self.config['name']}, skipping item:[/bold red] {e}"
+                            )
+                            continue
+                        else:
+                            raise e
+        else:  # single-item chunk: processed inline, no inner thread pool
+            try:
+                result, item_cost = _process_map_item(
+                    items_and_outputs[0][0], items_and_outputs[0][1]
+                )
+                if result is not None:
+                    all_results.append(result)
+                total_cost += item_cost
+            except Exception as e:
+                if self.config.get("skip_on_error", False):
+                    self.console.log(
+                        f"[bold red]Error in map operation {self.config['name']}, skipping item:[/bold red] {e}"
+                    )
+                else:
+                    raise e
+
+        # Return the retained results and the accumulated cost of this chunk
+        return all_results, total_cost
+
+    with ThreadPoolExecutor(max_workers=self.max_batch_size) as executor:
+        batch_size = self.max_batch_size if self.max_batch_size is not None else 1  # None falls back to one item per chunk
+        futures = []
+        for i in range(0, len(input_data), batch_size):
+            batch = input_data[i : i + batch_size]
+            futures.append(executor.submit(_process_map_batch, batch))
+        results = []
+        total_cost = 0
+        pbar = RichLoopBar(
+            range(len(futures)),
+            desc=f"Processing {self.config['name']} (map) on all documents",
+            console=self.console,
+        )
+        for i in pbar:
+            result_list, item_cost = futures[i].result()  # waits for chunk i; re-raises any exception from the worker
+            if result_list:
+                if "drop_keys" in self.config:
+                    result_list = [
+                        {
+                            k: v
+                            for k, v in result.items()
+                            if k not in self.config["drop_keys"]
+                        }
+                        for result in result_list
+                    ]
+                results.extend(result_list)
+            total_cost += item_cost
+
+    if self.status:
+        self.status.start()  # not reached if an exception propagated above
+
+    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +
Checks the configuration of the MapOperation for required keys and valid structure.
+
+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or invalid in the configuration.

+
+
+ TypeError + +
+

If configuration values have incorrect types.

+
+
+ +
+ Source code in docetl/operations/map.py +
 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
def syntax_check(self) -> None:
+    """
+        Checks the configuration of the MapOperation for required keys and valid structure.
+
+    Raises:
+        ValueError: If required keys are missing or invalid in the configuration.
+        TypeError: If configuration values have incorrect types.
+    """
+    config = self.schema(**self.config)
+
+    if config.drop_keys:
+        if any(not isinstance(key, str) for key in config.drop_keys):
+            raise TypeError("All items in 'drop_keys' must be strings")
+    elif not (config.prompt and config.output):
+        raise ValueError(
+            "If 'drop_keys' is not specified, both 'prompt' and 'output' must be present in the configuration"
+        )
+
+    if config.batch_prompt:
+        try:
+            template = Template(config.batch_prompt)
+            # Test render with a minimal inputs list to validate template
+            template.render(inputs=[{}])
+        except Exception as e:
+            raise ValueError(
+                f"Invalid Jinja2 template in 'batch_prompt' or missing required 'inputs' variable: {str(e)}"
+            ) from e
+
+    if config.prompt or config.output:
+        for key in ["prompt", "output"]:
+            if not getattr(config, key):
+                raise ValueError(
+                    f"Missing required key '{key}' in MapOperation configuration"
+                )
+
+        if config.output and not config.output["schema"]:
+            raise ValueError("Missing 'schema' in 'output' configuration")
+
+        if config.prompt:
+            try:
+                Template(config.prompt)
+            except Exception as e:
+                raise ValueError(
+                    f"Invalid Jinja2 template in 'prompt': {str(e)}"
+                ) from e
+
+        if config.model and not isinstance(config.model, str):
+            raise TypeError("'model' in configuration must be a string")
+
+        if config.tools:
+            for tool in config.tools:
+                try:
+                    tool_obj = Tool(**tool)
+                except Exception:
+                    raise TypeError("Tool must be a dictionary")
+
+                if not (tool_obj.code and tool_obj.function):
+                    raise ValueError(
+                        "Tool is missing required 'code' or 'function' key"
+                    )
+
+                if not isinstance(tool_obj.function, ToolFunction):
+                    raise TypeError("'function' in tool must be a dictionary")
+
+                for key in ["name", "description", "parameters"]:
+                    if not getattr(tool_obj.function, key):
+                        raise ValueError(
+                            f"Tool is missing required '{key}' in 'function'"
+                        )
+
+        self.gleaning_check()
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.resolve.ResolveOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + + + + + + +
+ Source code in docetl/operations/resolve.py +
 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
+581
+582
+583
+584
+585
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
+600
+601
+602
+603
+604
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
+619
+620
+621
+622
+623
+624
+625
+626
+627
+628
+629
+630
+631
+632
+633
+634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648
+649
+650
+651
+652
+653
+654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666
+667
+668
+669
+670
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
+694
+695
+696
+697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
+709
+710
+711
+712
+713
+714
+715
+716
+717
+718
+719
+720
+721
+722
+723
+724
class ResolveOperation(BaseOperation):
+    class schema(BaseOperation.schema):
+        # Declares the configuration keys of a "resolve" operation.
+        # `comparison_prompt` is the only field without a default.
+        type: str = "resolve"
+        # Template rendered per candidate pair; syntax_check requires it to
+        # reference both `input1` and `input2`.
+        comparison_prompt: str
+        # Template rendered per multi-item cluster; when given, syntax_check
+        # requires it to reference `inputs`. When absent/empty, execute() skips
+        # the LLM resolution step.
+        resolution_prompt: Optional[str] = None
+        # syntax_check requires this key and (unless `runner._from_df_accessors`
+        # is set) a non-empty `schema` dict inside it.
+        output: Optional[Dict[str, Any]] = None
+        # Model used for blocking embeddings; execute() falls back to
+        # "text-embedding-3-small" when unset.
+        embedding_model: Optional[str] = None
+        # Models for the resolution and comparison LLM calls; execute() falls
+        # back to `self.default_model` when unset.
+        resolution_model: Optional[str] = None
+        comparison_model: Optional[str] = None
+        # Item keys used to group identical records and to build embedding text;
+        # execute() uses all keys of the first item when unset.
+        blocking_keys: Optional[List[str]] = None
+        # Cosine-similarity cutoff; syntax_check requires a number in [0, 1].
+        blocking_threshold: Optional[float] = None
+        # Python expressions over `input1`/`input2`, evaluated with eval() in
+        # execute(); a pair is kept if any expression is truthy.
+        blocking_conditions: Optional[List[str]] = None
+        # Optional; when present, syntax_check requires a `schema` dict inside.
+        input: Optional[Dict[str, Any]] = None
+        # Items per embedding request (execute() defaults to 1000).
+        embedding_batch_size: Optional[int] = None
+        # Pairs per comparison batch (execute() computes a size when unset).
+        compare_batch_size: Optional[int] = None
+        # Upper bound on compared pairs; syntax_check requires a positive int.
+        limit_comparisons: Optional[int] = None
+        # Not read by any method visible in this class.
+        optimize: Optional[bool] = None
+        # Passed to the LLM calls as `timeout_seconds` (default 120 in execute()).
+        timeout: Optional[int] = None
+        # Extra keyword arguments forwarded to every LLM call.
+        litellm_completion_kwargs: Dict[str, Any] = Field(default_factory=dict)
+        # When true, rendered prompts are recorded on each item under
+        # `_observability_<operation name>`.
+        enable_observability: bool = False
+
+    def compare_pair(
+        self,
+        comparison_prompt: str,
+        model: str,
+        item1: Dict,
+        item2: Dict,
+        blocking_keys: List[str] = [],
+        timeout_seconds: int = 120,
+        max_retries_per_timeout: int = 2,
+    ) -> Tuple[bool, float, str]:
+        """
+        Compares two items using an LLM model to determine if they match.
+
+        Args:
+            comparison_prompt (str): The prompt template for comparison.
+            model (str): The LLM model to use for comparison.
+            item1 (Dict): The first item to compare.
+            item2 (Dict): The second item to compare.
+
+        Returns:
+            Tuple[bool, float, str]: A tuple containing a boolean indicating whether the items match, the cost of the comparison, and the prompt.
+        """
+        if blocking_keys:
+            if all(
+                key in item1
+                and key in item2
+                and str(item1[key]).lower() == str(item2[key]).lower()
+                for key in blocking_keys
+            ):
+                return True, 0, ""
+
+        prompt = strict_render(comparison_prompt, {"input1": item1, "input2": item2})
+        response = self.runner.api.call_llm(
+            model,
+            "compare",
+            [{"role": "user", "content": prompt}],
+            {"is_match": "bool"},
+            timeout_seconds=timeout_seconds,
+            max_retries_per_timeout=max_retries_per_timeout,
+            bypass_cache=self.config.get("bypass_cache", False),
+            litellm_completion_kwargs=self.config.get("litellm_completion_kwargs", {}),
+        )
+        output = self.runner.api.parse_llm_response(
+            response.response,
+            {"is_match": "bool"},
+        )[0]
+
+        return output["is_match"], response.total_cost, prompt
+
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the ResolveOperation for required keys and valid structure.
+
+        This method performs the following checks:
+        1. Verifies the presence of required keys: 'comparison_prompt' and 'output'.
+        2. Ensures 'output' contains a 'schema' key.
+        3. Validates that 'schema' in 'output' is a non-empty dictionary.
+        4. Checks if 'comparison_prompt' is a valid Jinja2 template with 'input1' and 'input2' variables.
+        5. If 'resolution_prompt' is present, verifies it as a valid Jinja2 template with 'inputs' variable.
+        6. Optionally checks if 'model' is a string (if present).
+        7. Optionally checks 'blocking_keys' (if present, further checks are performed).
+
+        Raises:
+            ValueError: If required keys are missing, if templates are invalid or missing required variables,
+                        or if any other configuration aspect is incorrect or inconsistent.
+            TypeError: If the types of configuration values are incorrect, such as 'schema' not being a dict
+                       or 'model' not being a string.
+        """
+        required_keys = ["comparison_prompt", "output"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in ResolveOperation configuration"
+                )
+
+        if "schema" not in self.config["output"] and not self.runner._from_df_accessors:
+            raise ValueError("Missing 'schema' in 'output' configuration")
+        elif not self.runner._from_df_accessors:
+            if not isinstance(self.config["output"]["schema"], dict):
+                raise TypeError(
+                    "'schema' in 'output' configuration must be a dictionary"
+                )
+
+            if not self.config["output"]["schema"]:
+                raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+        # Check if the comparison_prompt is a valid Jinja2 template
+        try:
+            comparison_template = Template(self.config["comparison_prompt"])
+            comparison_vars = comparison_template.environment.parse(
+                self.config["comparison_prompt"]
+            ).find_all(jinja2.nodes.Name)
+            comparison_var_names = {var.name for var in comparison_vars}
+            if (
+                "input1" not in comparison_var_names
+                or "input2" not in comparison_var_names
+            ):
+                raise ValueError(
+                    f"'comparison_prompt' must contain both 'input1' and 'input2' variables. {self.config['comparison_prompt']}"
+                )
+
+            if "resolution_prompt" in self.config:
+                reduction_template = Template(self.config["resolution_prompt"])
+                reduction_vars = reduction_template.environment.parse(
+                    self.config["resolution_prompt"]
+                ).find_all(jinja2.nodes.Name)
+                reduction_var_names = {var.name for var in reduction_vars}
+                if "inputs" not in reduction_var_names:
+                    raise ValueError(
+                        "'resolution_prompt' must contain 'inputs' variable"
+                    )
+        except Exception as e:
+            raise ValueError(f"Invalid Jinja2 template: {str(e)}")
+
+        # Check if the model is specified (optional)
+        if "model" in self.config and not isinstance(self.config["model"], str):
+            raise TypeError("'model' in configuration must be a string")
+
+        # Check blocking_keys (optional)
+        if "blocking_keys" in self.config:
+            if not isinstance(self.config["blocking_keys"], list):
+                raise TypeError("'blocking_keys' must be a list")
+            if not all(isinstance(key, str) for key in self.config["blocking_keys"]):
+                raise TypeError("All items in 'blocking_keys' must be strings")
+
+        # Check blocking_threshold (optional)
+        if "blocking_threshold" in self.config:
+            if not isinstance(self.config["blocking_threshold"], (int, float)):
+                raise TypeError("'blocking_threshold' must be a number")
+            if not 0 <= self.config["blocking_threshold"] <= 1:
+                raise ValueError("'blocking_threshold' must be between 0 and 1")
+
+        # Check blocking_conditions (optional)
+        if "blocking_conditions" in self.config:
+            if not isinstance(self.config["blocking_conditions"], list):
+                raise TypeError("'blocking_conditions' must be a list")
+            if not all(
+                isinstance(cond, str) for cond in self.config["blocking_conditions"]
+            ):
+                raise TypeError("All items in 'blocking_conditions' must be strings")
+
+        # Check if input schema is provided and valid (optional)
+        if "input" in self.config:
+            if "schema" not in self.config["input"]:
+                raise ValueError("Missing 'schema' in 'input' configuration")
+            if not isinstance(self.config["input"]["schema"], dict):
+                raise TypeError(
+                    "'schema' in 'input' configuration must be a dictionary"
+                )
+
+        # Check limit_comparisons (optional)
+        if "limit_comparisons" in self.config:
+            if not isinstance(self.config["limit_comparisons"], int):
+                raise TypeError("'limit_comparisons' must be an integer")
+            if self.config["limit_comparisons"] <= 0:
+                raise ValueError("'limit_comparisons' must be a positive integer")
+
+    def validation_fn(self, response: Dict[str, Any]):
+        output = self.runner.api.parse_llm_response(
+            response,
+            schema=self.config["output"]["schema"],
+        )[0]
+        if self.runner.api.validate_output(self.config, output, self.console):
+            return output, True
+        return output, False
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Executes the resolve operation on the provided dataset.
+
+        Args:
+            input_data (List[Dict]): The dataset to resolve.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the resolved results and the total cost of the operation.
+
+        This method performs the following steps:
+        1. Initial blocking based on specified conditions and/or embedding similarity
+        2. Pairwise comparison of potentially matching entries using LLM
+        3. Clustering of matched entries
+        4. Resolution of each cluster into a single entry (if applicable)
+        5. Result aggregation and validation
+
+        The method also calculates and logs statistics such as comparisons saved by blocking and self-join selectivity.
+        """
+        if len(input_data) == 0:
+            return [], 0
+
+        # Initialize observability data for all items at the start
+        if self.config.get("enable_observability", False):
+            observability_key = f"_observability_{self.config['name']}"
+            for item in input_data:
+                if observability_key not in item:
+                    item[observability_key] = {
+                        "comparison_prompts": [],
+                        "resolution_prompt": None,
+                    }
+
+        blocking_keys = self.config.get("blocking_keys", [])
+        blocking_threshold = self.config.get("blocking_threshold")
+        blocking_conditions = self.config.get("blocking_conditions", [])
+        if self.status:
+            self.status.stop()
+
+        if not blocking_threshold and not blocking_conditions:
+            # Prompt the user for confirmation
+            if not Confirm.ask(
+                "[yellow]Warning: No blocking keys or conditions specified. "
+                "This may result in a large number of comparisons. "
+                "We recommend specifying at least one blocking key or condition, or using the optimizer to automatically come up with these. "
+                "Do you want to continue without blocking?[/yellow]",
+                console=self.runner.console,
+            ):
+                raise ValueError("Operation cancelled by user.")
+
+        input_schema = self.config.get("input", {}).get("schema", {})
+        if not blocking_keys:
+            # Set them to all keys in the input data
+            blocking_keys = list(input_data[0].keys())
+        limit_comparisons = self.config.get("limit_comparisons")
+        total_cost = 0
+
+        def is_match(item1: Dict[str, Any], item2: Dict[str, Any]) -> bool:
+            return any(
+                eval(condition, {"input1": item1, "input2": item2})
+                for condition in blocking_conditions
+            )
+
+        # Calculate embeddings if blocking_threshold is set
+        embeddings = None
+        if blocking_threshold is not None:
+            embedding_model = self.config.get(
+                "embedding_model", "text-embedding-3-small"
+            )
+
+            def get_embeddings_batch(
+                items: List[Dict[str, Any]]
+            ) -> List[Tuple[List[float], float]]:
+                texts = [
+                    " ".join(str(item[key]) for key in blocking_keys if key in item)
+                    for item in items
+                ]
+                response = self.runner.api.gen_embedding(
+                    model=embedding_model, input=texts
+                )
+                return [
+                    (data["embedding"], completion_cost(response))
+                    for data in response["data"]
+                ]
+
+            embeddings = []
+            costs = []
+            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+                for i in range(
+                    0, len(input_data), self.config.get("embedding_batch_size", 1000)
+                ):
+                    batch = input_data[
+                        i : i + self.config.get("embedding_batch_size", 1000)
+                    ]
+                    batch_results = list(executor.map(get_embeddings_batch, [batch]))
+
+                    for result in batch_results:
+                        embeddings.extend([r[0] for r in result])
+                        costs.extend([r[1] for r in result])
+
+                total_cost += sum(costs)
+
+        # Generate all pairs to compare, ensuring no duplicate comparisons
+        def get_unique_comparison_pairs() -> (
+            Tuple[List[Tuple[int, int]], Dict[Tuple[str, ...], List[int]]]
+        ):
+            # Create a mapping of values to their indices
+            value_to_indices: Dict[Tuple[str, ...], List[int]] = {}
+            for i, item in enumerate(input_data):
+                # Create a hashable key from the blocking keys
+                key = tuple(str(item.get(k, "")) for k in blocking_keys)
+                if key not in value_to_indices:
+                    value_to_indices[key] = []
+                value_to_indices[key].append(i)
+
+            # Generate pairs for comparison, comparing each unique value combination only once
+            comparison_pairs = []
+            keys = list(value_to_indices.keys())
+
+            # First, handle comparisons between different values
+            for i in range(len(keys)):
+                for j in range(i + 1, len(keys)):
+                    # Only need one comparison between different values
+                    idx1 = value_to_indices[keys[i]][0]
+                    idx2 = value_to_indices[keys[j]][0]
+                    if idx1 < idx2:  # Maintain ordering to avoid duplicates
+                        comparison_pairs.append((idx1, idx2))
+
+            return comparison_pairs, value_to_indices
+
+        comparison_pairs, value_to_indices = get_unique_comparison_pairs()
+
+        # Filter pairs based on blocking conditions
+        def meets_blocking_conditions(pair: Tuple[int, int]) -> bool:
+            i, j = pair
+            return (
+                is_match(input_data[i], input_data[j]) if blocking_conditions else False
+            )
+
+        blocked_pairs = (
+            list(filter(meets_blocking_conditions, comparison_pairs))
+            if blocking_conditions
+            else comparison_pairs
+        )
+
+        # Apply limit_comparisons to blocked pairs
+        if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+            self.console.log(
+                f"Randomly sampling {limit_comparisons} pairs out of {len(blocked_pairs)} blocked pairs."
+            )
+            blocked_pairs = random.sample(blocked_pairs, limit_comparisons)
+
+        # Initialize clusters with all indices
+        clusters = [{i} for i in range(len(input_data))]
+        cluster_map = {i: i for i in range(len(input_data))}
+
+        # If there are remaining comparisons, fill with highest cosine similarities
+        remaining_comparisons = (
+            limit_comparisons - len(blocked_pairs)
+            if limit_comparisons is not None
+            else float("inf")
+        )
+        if remaining_comparisons > 0 and blocking_threshold is not None:
+            # Compute cosine similarity for all pairs efficiently
+            from sklearn.metrics.pairwise import cosine_similarity
+
+            similarity_matrix = cosine_similarity(embeddings)
+
+            cosine_pairs = []
+            for i, j in comparison_pairs:
+                if (i, j) not in blocked_pairs and find_cluster(
+                    i, cluster_map
+                ) != find_cluster(j, cluster_map):
+                    similarity = similarity_matrix[i, j]
+                    if similarity >= blocking_threshold:
+                        cosine_pairs.append((i, j, similarity))
+
+            if remaining_comparisons != float("inf"):
+                cosine_pairs.sort(key=lambda x: x[2], reverse=True)
+                additional_pairs = [
+                    (i, j) for i, j, _ in cosine_pairs[: int(remaining_comparisons)]
+                ]
+                blocked_pairs.extend(additional_pairs)
+            else:
+                blocked_pairs.extend((i, j) for i, j, _ in cosine_pairs)
+
+        # Modified merge_clusters to handle all indices with the same value
+
+        def merge_clusters(item1: int, item2: int) -> None:
+            root1, root2 = find_cluster(item1, cluster_map), find_cluster(
+                item2, cluster_map
+            )
+            if root1 != root2:
+                if len(clusters[root1]) < len(clusters[root2]):
+                    root1, root2 = root2, root1
+                clusters[root1] |= clusters[root2]
+                cluster_map[root2] = root1
+                clusters[root2] = set()
+
+                # Also merge all other indices that share the same values
+                key1 = tuple(str(input_data[item1].get(k, "")) for k in blocking_keys)
+                key2 = tuple(str(input_data[item2].get(k, "")) for k in blocking_keys)
+
+                # Merge all indices with the same values
+                for idx in value_to_indices.get(key1, []):
+                    if idx != item1:
+                        root_idx = find_cluster(idx, cluster_map)
+                        if root_idx != root1:
+                            clusters[root1] |= clusters[root_idx]
+                            cluster_map[root_idx] = root1
+                            clusters[root_idx] = set()
+
+                for idx in value_to_indices.get(key2, []):
+                    if idx != item2:
+                        root_idx = find_cluster(idx, cluster_map)
+                        if root_idx != root1:
+                            clusters[root1] |= clusters[root_idx]
+                            cluster_map[root_idx] = root1
+                            clusters[root_idx] = set()
+
+        # Calculate and print statistics
+        total_possible_comparisons = len(input_data) * (len(input_data) - 1) // 2
+        comparisons_made = len(blocked_pairs)
+        comparisons_saved = total_possible_comparisons - comparisons_made
+        self.console.log(
+            f"[green]Comparisons saved by blocking: {comparisons_saved} "
+            f"({(comparisons_saved / total_possible_comparisons) * 100:.2f}%)[/green]"
+        )
+        self.console.log(
+            f"[blue]Number of pairs to compare: {len(blocked_pairs)}[/blue]"
+        )
+
+        # Compute an auto-batch size based on the number of comparisons
+        def auto_batch() -> int:
+            # Maximum batch size limit for 4o-mini model
+            M = 500
+
+            n = len(input_data)
+            m = len(blocked_pairs)
+
+            # https://www.wolframalpha.com/input?i=k%28k-1%29%2F2+%2B+%28n-k%29%28k-1%29+%3D+m%2C+solve+for+k
+            # Two possible solutions for k:
+            # k = -1/2 sqrt((1 - 2n)^2 - 8m) + n + 1/2
+            # k = 1/2 (sqrt((1 - 2n)^2 - 8m) + 2n + 1)
+
+            discriminant = (1 - 2 * n) ** 2 - 8 * m
+            sqrt_discriminant = discriminant**0.5
+
+            k1 = -0.5 * sqrt_discriminant + n + 0.5
+            k2 = 0.5 * (sqrt_discriminant + 2 * n + 1)
+
+            # Take the maximum viable solution
+            k = max(k1, k2)
+            return M if k < 0 else min(int(k), M)
+
+        # Compare pairs and update clusters in real-time
+        batch_size = self.config.get("compare_batch_size", auto_batch())
+        self.console.log(f"Using compare batch size: {batch_size}")
+        pair_costs = 0
+
+        pbar = RichLoopBar(
+            range(0, len(blocked_pairs), batch_size),
+            desc=f"Processing batches of {batch_size} LLM comparisons",
+            console=self.console,
+        )
+        last_processed = 0
+        for i in pbar:
+            batch_end = last_processed + batch_size
+            batch = blocked_pairs[last_processed:batch_end]
+            # Filter pairs for the initial batch
+            better_batch = [
+                pair
+                for pair in batch
+                if find_cluster(pair[0], cluster_map) == pair[0]
+                and find_cluster(pair[1], cluster_map) == pair[1]
+            ]
+
+            # Expand better_batch if it doesn’t reach batch_size
+            while len(better_batch) < batch_size and batch_end < len(blocked_pairs):
+                # Move batch_end forward by batch_size to get more pairs
+                next_end = batch_end + batch_size
+                next_batch = blocked_pairs[batch_end:next_end]
+
+                better_batch.extend(
+                    pair
+                    for pair in next_batch
+                    if find_cluster(pair[0], cluster_map) == pair[0]
+                    and find_cluster(pair[1], cluster_map) == pair[1]
+                )
+
+                # Update batch_end to prevent overlapping in the next loop
+                batch_end = next_end
+            better_batch = better_batch[:batch_size]
+            last_processed = batch_end
+            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+                future_to_pair = {
+                    executor.submit(
+                        self.compare_pair,
+                        self.config["comparison_prompt"],
+                        self.config.get("comparison_model", self.default_model),
+                        input_data[pair[0]],
+                        input_data[pair[1]],
+                        blocking_keys,
+                        timeout_seconds=self.config.get("timeout", 120),
+                        max_retries_per_timeout=self.config.get(
+                            "max_retries_per_timeout", 2
+                        ),
+                    ): pair
+                    for pair in better_batch
+                }
+
+                for future in as_completed(future_to_pair):
+                    pair = future_to_pair[future]
+                    is_match_result, cost, prompt = future.result()
+                    pair_costs += cost
+                    if is_match_result:
+                        merge_clusters(pair[0], pair[1])
+
+                    if self.config.get("enable_observability", False):
+                        observability_key = f"_observability_{self.config['name']}"
+                        for idx in (pair[0], pair[1]):
+                            if observability_key not in input_data[idx]:
+                                input_data[idx][observability_key] = {
+                                    "comparison_prompts": [],
+                                    "resolution_prompt": None,
+                                }
+                            input_data[idx][observability_key][
+                                "comparison_prompts"
+                            ].append(prompt)
+
+        total_cost += pair_costs
+
+        # Collect final clusters
+        final_clusters = [cluster for cluster in clusters if cluster]
+
+        # Process each cluster
+        results = []
+
+        def process_cluster(cluster):
+            if len(cluster) > 1:
+                cluster_items = [input_data[i] for i in cluster]
+                if input_schema:
+                    cluster_items = [
+                        {k: item[k] for k in input_schema.keys() if k in item}
+                        for item in cluster_items
+                    ]
+
+                resolution_prompt = strict_render(
+                    self.config["resolution_prompt"], {"inputs": cluster_items}
+                )
+                reduction_response = self.runner.api.call_llm(
+                    self.config.get("resolution_model", self.default_model),
+                    "reduce",
+                    [{"role": "user", "content": resolution_prompt}],
+                    self.config["output"]["schema"],
+                    timeout_seconds=self.config.get("timeout", 120),
+                    max_retries_per_timeout=self.config.get(
+                        "max_retries_per_timeout", 2
+                    ),
+                    bypass_cache=self.config.get("bypass_cache", False),
+                    validation_config=(
+                        {
+                            "val_rule": self.config.get("validate", []),
+                            "validation_fn": self.validation_fn,
+                        }
+                        if self.config.get("validate", None)
+                        else None
+                    ),
+                    litellm_completion_kwargs=self.config.get(
+                        "litellm_completion_kwargs", {}
+                    ),
+                )
+                reduction_cost = reduction_response.total_cost
+
+                if self.config.get("enable_observability", False):
+                    for item in [input_data[i] for i in cluster]:
+                        observability_key = f"_observability_{self.config['name']}"
+                        if observability_key not in item:
+                            item[observability_key] = {
+                                "comparison_prompts": [],
+                                "resolution_prompt": None,
+                            }
+                        item[observability_key]["resolution_prompt"] = resolution_prompt
+
+                if reduction_response.validated:
+                    reduction_output = self.runner.api.parse_llm_response(
+                        reduction_response.response,
+                        self.config["output"]["schema"],
+                        manually_fix_errors=self.manually_fix_errors,
+                    )[0]
+
+                    # If the output is overwriting an existing key, we want to save the kv pairs
+                    keys_in_output = [
+                        k
+                        for k in set(reduction_output.keys())
+                        if k in cluster_items[0].keys()
+                    ]
+
+                    return (
+                        [
+                            {
+                                **item,
+                                f"_kv_pairs_preresolve_{self.config['name']}": {
+                                    k: item[k] for k in keys_in_output
+                                },
+                                **{
+                                    k: reduction_output[k]
+                                    for k in self.config["output"]["schema"]
+                                },
+                            }
+                            for item in [input_data[i] for i in cluster]
+                        ],
+                        reduction_cost,
+                    )
+                return [], reduction_cost
+            else:
+                # Set the output schema to be the keys found in the compare_prompt
+                compare_prompt_keys = extract_jinja_variables(
+                    self.config["comparison_prompt"]
+                )
+                # Get the set of keys in the compare_prompt
+                compare_prompt_keys = set(
+                    [
+                        k.replace("input1.", "")
+                        for k in compare_prompt_keys
+                        if "input1" in k
+                    ]
+                )
+
+                # For each key in the output schema, find the most similar key in the compare_prompt
+                output_keys = set(self.config["output"]["schema"].keys())
+                key_mapping = {}
+                for output_key in output_keys:
+                    best_match = None
+                    best_score = 0
+                    for compare_key in compare_prompt_keys:
+                        score = sum(
+                            c1 == c2 for c1, c2 in zip(output_key, compare_key)
+                        ) / max(len(output_key), len(compare_key))
+                        if score > best_score:
+                            best_score = score
+                            best_match = compare_key
+                    key_mapping[output_key] = best_match
+
+                # Create the result dictionary using the key mapping
+                result = input_data[list(cluster)[0]].copy()
+                result[f"_kv_pairs_preresolve_{self.config['name']}"] = {
+                    ok: result[ck] for ok, ck in key_mapping.items() if ck in result
+                }
+                for output_key, compare_key in key_mapping.items():
+                    if compare_key in input_data[list(cluster)[0]]:
+                        result[output_key] = input_data[list(cluster)[0]][compare_key]
+                    elif output_key in input_data[list(cluster)[0]]:
+                        result[output_key] = input_data[list(cluster)[0]][output_key]
+                    else:
+                        result[output_key] = None  # or some default value
+
+                return [result], 0
+
+        # Calculate the number of records before and clusters after
+        num_records_before = len(input_data)
+        num_clusters_after = len(final_clusters)
+        self.console.log(f"Number of keys before resolution: {num_records_before}")
+        self.console.log(
+            f"Number of distinct keys after resolution: {num_clusters_after}"
+        )
+
+        # If no resolution prompt is provided, we can skip the resolution phase
+        # And simply select the most common value for each key
+        if not self.config.get("resolution_prompt", None):
+            for cluster in final_clusters:
+                if len(cluster) > 1:
+                    for key in self.config["output"]["keys"]:
+                        most_common_value = max(
+                            set(input_data[i][key] for i in cluster),
+                            key=lambda x: sum(
+                                1 for i in cluster if input_data[i][key] == x
+                            ),
+                        )
+                        for i in cluster:
+                            input_data[i][key] = most_common_value
+            results = input_data
+        else:
+            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+                futures = [
+                    executor.submit(process_cluster, cluster)
+                    for cluster in final_clusters
+                ]
+                for future in rich_as_completed(
+                    futures,
+                    total=len(futures),
+                    desc="Determining resolved key for each group of equivalent keys",
+                    console=self.console,
+                ):
+                    cluster_results, cluster_cost = future.result()
+                    results.extend(cluster_results)
+                    total_cost += cluster_cost
+
+        total_pairs = len(input_data) * (len(input_data) - 1) // 2
+        true_match_count = sum(
+            len(cluster) * (len(cluster) - 1) // 2
+            for cluster in final_clusters
+            if len(cluster) > 1
+        )
+        true_match_selectivity = (
+            true_match_count / total_pairs if total_pairs > 0 else 0
+        )
+        self.console.log(f"Self-join selectivity: {true_match_selectivity:.4f}")
+
+        if self.status:
+            self.status.start()
+
+        return results, total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ compare_pair(comparison_prompt, model, item1, item2, blocking_keys=[], timeout_seconds=120, max_retries_per_timeout=2) + +

+ + +
+ +

Compares two items using an LLM model to determine if they match.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ comparison_prompt + + str + +
+

The prompt template for comparison.

+
+
+ required +
+ model + + str + +
+

The LLM model to use for comparison.

+
+
+ required +
+ item1 + + Dict + +
+

The first item to compare.

+
+
+ required +
+ item2 + + Dict + +
+

The second item to compare.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[bool, float, str] + +
+

Tuple[bool, float, str]: A tuple containing a boolean indicating whether the items match, the cost of the comparison, and the prompt.

+
+
+ +
+ Source code in docetl/operations/resolve.py +
47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
def compare_pair(
+    self,
+    comparison_prompt: str,
+    model: str,
+    item1: Dict,
+    item2: Dict,
+    blocking_keys: List[str] = [],
+    timeout_seconds: int = 120,
+    max_retries_per_timeout: int = 2,
+) -> Tuple[bool, float, str]:
+    """
+    Compares two items using an LLM model to determine if they match.
+
+    Args:
+        comparison_prompt (str): The prompt template for comparison.
+        model (str): The LLM model to use for comparison.
+        item1 (Dict): The first item to compare.
+        item2 (Dict): The second item to compare.
+
+    Returns:
+        Tuple[bool, float, str]: A tuple containing a boolean indicating whether the items match, the cost of the comparison, and the prompt.
+    """
+    if blocking_keys:
+        if all(
+            key in item1
+            and key in item2
+            and str(item1[key]).lower() == str(item2[key]).lower()
+            for key in blocking_keys
+        ):
+            return True, 0, ""
+
+    prompt = strict_render(comparison_prompt, {"input1": item1, "input2": item2})
+    response = self.runner.api.call_llm(
+        model,
+        "compare",
+        [{"role": "user", "content": prompt}],
+        {"is_match": "bool"},
+        timeout_seconds=timeout_seconds,
+        max_retries_per_timeout=max_retries_per_timeout,
+        bypass_cache=self.config.get("bypass_cache", False),
+        litellm_completion_kwargs=self.config.get("litellm_completion_kwargs", {}),
+    )
+    output = self.runner.api.parse_llm_response(
+        response.response,
+        {"is_match": "bool"},
+    )[0]
+
+    return output["is_match"], response.total_cost, prompt
+
+
+
+ +
+ +
+ + +

+ execute(input_data) + +

+ + +
+ +

Executes the resolve operation on the provided dataset.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
Name | Type | Description | Default
+ input_data + + List[Dict] + +
+

The dataset to resolve.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
Type | Description
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the resolved results and the total cost of the operation.

+
+
+

This method performs the following steps:
+1. Initial blocking based on specified conditions and/or embedding similarity
+2. Pairwise comparison of potentially matching entries using LLM
+3. Clustering of matched entries
+4. Resolution of each cluster into a single entry (if applicable)
+5. Result aggregation and validation

+

The method also calculates and logs statistics such as comparisons saved by blocking and self-join selectivity.

+ +
+ Source code in docetl/operations/resolve.py +
213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
+576
+577
+578
+579
+580
+581
+582
+583
+584
+585
+586
+587
+588
+589
+590
+591
+592
+593
+594
+595
+596
+597
+598
+599
+600
+601
+602
+603
+604
+605
+606
+607
+608
+609
+610
+611
+612
+613
+614
+615
+616
+617
+618
+619
+620
+621
+622
+623
+624
+625
+626
+627
+628
+629
+630
+631
+632
+633
+634
+635
+636
+637
+638
+639
+640
+641
+642
+643
+644
+645
+646
+647
+648
+649
+650
+651
+652
+653
+654
+655
+656
+657
+658
+659
+660
+661
+662
+663
+664
+665
+666
+667
+668
+669
+670
+671
+672
+673
+674
+675
+676
+677
+678
+679
+680
+681
+682
+683
+684
+685
+686
+687
+688
+689
+690
+691
+692
+693
+694
+695
+696
+697
+698
+699
+700
+701
+702
+703
+704
+705
+706
+707
+708
+709
+710
+711
+712
+713
+714
+715
+716
+717
+718
+719
+720
+721
+722
+723
+724
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+    """
+    Executes the resolve operation on the provided dataset.
+
+    Args:
+        input_data (List[Dict]): The dataset to resolve.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the resolved results and the total cost of the operation.
+
+    This method performs the following steps:
+    1. Initial blocking based on specified conditions and/or embedding similarity
+    2. Pairwise comparison of potentially matching entries using LLM
+    3. Clustering of matched entries
+    4. Resolution of each cluster into a single entry (if applicable)
+    5. Result aggregation and validation
+
+    The method also calculates and logs statistics such as comparisons saved by blocking and self-join selectivity.
+    """
+    if len(input_data) == 0:
+        return [], 0
+
+    # Initialize observability data for all items at the start
+    if self.config.get("enable_observability", False):
+        observability_key = f"_observability_{self.config['name']}"
+        for item in input_data:
+            if observability_key not in item:
+                item[observability_key] = {
+                    "comparison_prompts": [],
+                    "resolution_prompt": None,
+                }
+
+    blocking_keys = self.config.get("blocking_keys", [])
+    blocking_threshold = self.config.get("blocking_threshold")
+    blocking_conditions = self.config.get("blocking_conditions", [])
+    if self.status:
+        self.status.stop()
+
+    if not blocking_threshold and not blocking_conditions:
+        # Prompt the user for confirmation
+        if not Confirm.ask(
+            "[yellow]Warning: No blocking keys or conditions specified. "
+            "This may result in a large number of comparisons. "
+            "We recommend specifying at least one blocking key or condition, or using the optimizer to automatically come up with these. "
+            "Do you want to continue without blocking?[/yellow]",
+            console=self.runner.console,
+        ):
+            raise ValueError("Operation cancelled by user.")
+
+    input_schema = self.config.get("input", {}).get("schema", {})
+    if not blocking_keys:
+        # Set them to all keys in the input data
+        blocking_keys = list(input_data[0].keys())
+    limit_comparisons = self.config.get("limit_comparisons")
+    total_cost = 0
+
+    def is_match(item1: Dict[str, Any], item2: Dict[str, Any]) -> bool:
+        return any(
+            eval(condition, {"input1": item1, "input2": item2})
+            for condition in blocking_conditions
+        )
+
+    # Calculate embeddings if blocking_threshold is set
+    embeddings = None
+    if blocking_threshold is not None:
+        embedding_model = self.config.get(
+            "embedding_model", "text-embedding-3-small"
+        )
+
+        def get_embeddings_batch(
+            items: List[Dict[str, Any]]
+        ) -> List[Tuple[List[float], float]]:
+            texts = [
+                " ".join(str(item[key]) for key in blocking_keys if key in item)
+                for item in items
+            ]
+            response = self.runner.api.gen_embedding(
+                model=embedding_model, input=texts
+            )
+            return [
+                (data["embedding"], completion_cost(response))
+                for data in response["data"]
+            ]
+
+        embeddings = []
+        costs = []
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            for i in range(
+                0, len(input_data), self.config.get("embedding_batch_size", 1000)
+            ):
+                batch = input_data[
+                    i : i + self.config.get("embedding_batch_size", 1000)
+                ]
+                batch_results = list(executor.map(get_embeddings_batch, [batch]))
+
+                for result in batch_results:
+                    embeddings.extend([r[0] for r in result])
+                    costs.extend([r[1] for r in result])
+
+            total_cost += sum(costs)
+
+    # Generate all pairs to compare, ensuring no duplicate comparisons
+    def get_unique_comparison_pairs() -> (
+        Tuple[List[Tuple[int, int]], Dict[Tuple[str, ...], List[int]]]
+    ):
+        # Create a mapping of values to their indices
+        value_to_indices: Dict[Tuple[str, ...], List[int]] = {}
+        for i, item in enumerate(input_data):
+            # Create a hashable key from the blocking keys
+            key = tuple(str(item.get(k, "")) for k in blocking_keys)
+            if key not in value_to_indices:
+                value_to_indices[key] = []
+            value_to_indices[key].append(i)
+
+        # Generate pairs for comparison, comparing each unique value combination only once
+        comparison_pairs = []
+        keys = list(value_to_indices.keys())
+
+        # First, handle comparisons between different values
+        for i in range(len(keys)):
+            for j in range(i + 1, len(keys)):
+                # Only need one comparison between different values
+                idx1 = value_to_indices[keys[i]][0]
+                idx2 = value_to_indices[keys[j]][0]
+                if idx1 < idx2:  # Maintain ordering to avoid duplicates
+                    comparison_pairs.append((idx1, idx2))
+
+        return comparison_pairs, value_to_indices
+
+    comparison_pairs, value_to_indices = get_unique_comparison_pairs()
+
+    # Filter pairs based on blocking conditions
+    def meets_blocking_conditions(pair: Tuple[int, int]) -> bool:
+        i, j = pair
+        return (
+            is_match(input_data[i], input_data[j]) if blocking_conditions else False
+        )
+
+    blocked_pairs = (
+        list(filter(meets_blocking_conditions, comparison_pairs))
+        if blocking_conditions
+        else comparison_pairs
+    )
+
+    # Apply limit_comparisons to blocked pairs
+    if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+        self.console.log(
+            f"Randomly sampling {limit_comparisons} pairs out of {len(blocked_pairs)} blocked pairs."
+        )
+        blocked_pairs = random.sample(blocked_pairs, limit_comparisons)
+
+    # Initialize clusters with all indices
+    clusters = [{i} for i in range(len(input_data))]
+    cluster_map = {i: i for i in range(len(input_data))}
+
+    # If there are remaining comparisons, fill with highest cosine similarities
+    remaining_comparisons = (
+        limit_comparisons - len(blocked_pairs)
+        if limit_comparisons is not None
+        else float("inf")
+    )
+    if remaining_comparisons > 0 and blocking_threshold is not None:
+        # Compute cosine similarity for all pairs efficiently
+        from sklearn.metrics.pairwise import cosine_similarity
+
+        similarity_matrix = cosine_similarity(embeddings)
+
+        cosine_pairs = []
+        for i, j in comparison_pairs:
+            if (i, j) not in blocked_pairs and find_cluster(
+                i, cluster_map
+            ) != find_cluster(j, cluster_map):
+                similarity = similarity_matrix[i, j]
+                if similarity >= blocking_threshold:
+                    cosine_pairs.append((i, j, similarity))
+
+        if remaining_comparisons != float("inf"):
+            cosine_pairs.sort(key=lambda x: x[2], reverse=True)
+            additional_pairs = [
+                (i, j) for i, j, _ in cosine_pairs[: int(remaining_comparisons)]
+            ]
+            blocked_pairs.extend(additional_pairs)
+        else:
+            blocked_pairs.extend((i, j) for i, j, _ in cosine_pairs)
+
+    # Modified merge_clusters to handle all indices with the same value
+
+    def merge_clusters(item1: int, item2: int) -> None:
+        root1, root2 = find_cluster(item1, cluster_map), find_cluster(
+            item2, cluster_map
+        )
+        if root1 != root2:
+            if len(clusters[root1]) < len(clusters[root2]):
+                root1, root2 = root2, root1
+            clusters[root1] |= clusters[root2]
+            cluster_map[root2] = root1
+            clusters[root2] = set()
+
+            # Also merge all other indices that share the same values
+            key1 = tuple(str(input_data[item1].get(k, "")) for k in blocking_keys)
+            key2 = tuple(str(input_data[item2].get(k, "")) for k in blocking_keys)
+
+            # Merge all indices with the same values
+            for idx in value_to_indices.get(key1, []):
+                if idx != item1:
+                    root_idx = find_cluster(idx, cluster_map)
+                    if root_idx != root1:
+                        clusters[root1] |= clusters[root_idx]
+                        cluster_map[root_idx] = root1
+                        clusters[root_idx] = set()
+
+            for idx in value_to_indices.get(key2, []):
+                if idx != item2:
+                    root_idx = find_cluster(idx, cluster_map)
+                    if root_idx != root1:
+                        clusters[root1] |= clusters[root_idx]
+                        cluster_map[root_idx] = root1
+                        clusters[root_idx] = set()
+
+    # Calculate and print statistics
+    total_possible_comparisons = len(input_data) * (len(input_data) - 1) // 2
+    comparisons_made = len(blocked_pairs)
+    comparisons_saved = total_possible_comparisons - comparisons_made
+    self.console.log(
+        f"[green]Comparisons saved by blocking: {comparisons_saved} "
+        f"({(comparisons_saved / total_possible_comparisons) * 100:.2f}%)[/green]"
+    )
+    self.console.log(
+        f"[blue]Number of pairs to compare: {len(blocked_pairs)}[/blue]"
+    )
+
+    # Compute an auto-batch size based on the number of comparisons
+    def auto_batch() -> int:
+        # Maximum batch size limit for 4o-mini model
+        M = 500
+
+        n = len(input_data)
+        m = len(blocked_pairs)
+
+        # https://www.wolframalpha.com/input?i=k%28k-1%29%2F2+%2B+%28n-k%29%28k-1%29+%3D+m%2C+solve+for+k
+        # Two possible solutions for k:
+        # k = -1/2 sqrt((1 - 2n)^2 - 8m) + n + 1/2
+        # k = 1/2 (sqrt((1 - 2n)^2 - 8m) + 2n + 1)
+
+        discriminant = (1 - 2 * n) ** 2 - 8 * m
+        sqrt_discriminant = discriminant**0.5
+
+        k1 = -0.5 * sqrt_discriminant + n + 0.5
+        k2 = 0.5 * (sqrt_discriminant + 2 * n + 1)
+
+        # Take the maximum viable solution
+        k = max(k1, k2)
+        return M if k < 0 else min(int(k), M)
+
+    # Compare pairs and update clusters in real-time
+    batch_size = self.config.get("compare_batch_size", auto_batch())
+    self.console.log(f"Using compare batch size: {batch_size}")
+    pair_costs = 0
+
+    pbar = RichLoopBar(
+        range(0, len(blocked_pairs), batch_size),
+        desc=f"Processing batches of {batch_size} LLM comparisons",
+        console=self.console,
+    )
+    last_processed = 0
+    for i in pbar:
+        batch_end = last_processed + batch_size
+        batch = blocked_pairs[last_processed:batch_end]
+        # Filter pairs for the initial batch
+        better_batch = [
+            pair
+            for pair in batch
+            if find_cluster(pair[0], cluster_map) == pair[0]
+            and find_cluster(pair[1], cluster_map) == pair[1]
+        ]
+
+        # Expand better_batch if it doesn’t reach batch_size
+        while len(better_batch) < batch_size and batch_end < len(blocked_pairs):
+            # Move batch_end forward by batch_size to get more pairs
+            next_end = batch_end + batch_size
+            next_batch = blocked_pairs[batch_end:next_end]
+
+            better_batch.extend(
+                pair
+                for pair in next_batch
+                if find_cluster(pair[0], cluster_map) == pair[0]
+                and find_cluster(pair[1], cluster_map) == pair[1]
+            )
+
+            # Update batch_end to prevent overlapping in the next loop
+            batch_end = next_end
+        better_batch = better_batch[:batch_size]
+        last_processed = batch_end
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            future_to_pair = {
+                executor.submit(
+                    self.compare_pair,
+                    self.config["comparison_prompt"],
+                    self.config.get("comparison_model", self.default_model),
+                    input_data[pair[0]],
+                    input_data[pair[1]],
+                    blocking_keys,
+                    timeout_seconds=self.config.get("timeout", 120),
+                    max_retries_per_timeout=self.config.get(
+                        "max_retries_per_timeout", 2
+                    ),
+                ): pair
+                for pair in better_batch
+            }
+
+            for future in as_completed(future_to_pair):
+                pair = future_to_pair[future]
+                is_match_result, cost, prompt = future.result()
+                pair_costs += cost
+                if is_match_result:
+                    merge_clusters(pair[0], pair[1])
+
+                if self.config.get("enable_observability", False):
+                    observability_key = f"_observability_{self.config['name']}"
+                    for idx in (pair[0], pair[1]):
+                        if observability_key not in input_data[idx]:
+                            input_data[idx][observability_key] = {
+                                "comparison_prompts": [],
+                                "resolution_prompt": None,
+                            }
+                        input_data[idx][observability_key][
+                            "comparison_prompts"
+                        ].append(prompt)
+
+    total_cost += pair_costs
+
+    # Collect final clusters
+    final_clusters = [cluster for cluster in clusters if cluster]
+
+    # Process each cluster
+    results = []
+
+    def process_cluster(cluster):
+        if len(cluster) > 1:
+            cluster_items = [input_data[i] for i in cluster]
+            if input_schema:
+                cluster_items = [
+                    {k: item[k] for k in input_schema.keys() if k in item}
+                    for item in cluster_items
+                ]
+
+            resolution_prompt = strict_render(
+                self.config["resolution_prompt"], {"inputs": cluster_items}
+            )
+            reduction_response = self.runner.api.call_llm(
+                self.config.get("resolution_model", self.default_model),
+                "reduce",
+                [{"role": "user", "content": resolution_prompt}],
+                self.config["output"]["schema"],
+                timeout_seconds=self.config.get("timeout", 120),
+                max_retries_per_timeout=self.config.get(
+                    "max_retries_per_timeout", 2
+                ),
+                bypass_cache=self.config.get("bypass_cache", False),
+                validation_config=(
+                    {
+                        "val_rule": self.config.get("validate", []),
+                        "validation_fn": self.validation_fn,
+                    }
+                    if self.config.get("validate", None)
+                    else None
+                ),
+                litellm_completion_kwargs=self.config.get(
+                    "litellm_completion_kwargs", {}
+                ),
+            )
+            reduction_cost = reduction_response.total_cost
+
+            if self.config.get("enable_observability", False):
+                for item in [input_data[i] for i in cluster]:
+                    observability_key = f"_observability_{self.config['name']}"
+                    if observability_key not in item:
+                        item[observability_key] = {
+                            "comparison_prompts": [],
+                            "resolution_prompt": None,
+                        }
+                    item[observability_key]["resolution_prompt"] = resolution_prompt
+
+            if reduction_response.validated:
+                reduction_output = self.runner.api.parse_llm_response(
+                    reduction_response.response,
+                    self.config["output"]["schema"],
+                    manually_fix_errors=self.manually_fix_errors,
+                )[0]
+
+                # If the output is overwriting an existing key, we want to save the kv pairs
+                keys_in_output = [
+                    k
+                    for k in set(reduction_output.keys())
+                    if k in cluster_items[0].keys()
+                ]
+
+                return (
+                    [
+                        {
+                            **item,
+                            f"_kv_pairs_preresolve_{self.config['name']}": {
+                                k: item[k] for k in keys_in_output
+                            },
+                            **{
+                                k: reduction_output[k]
+                                for k in self.config["output"]["schema"]
+                            },
+                        }
+                        for item in [input_data[i] for i in cluster]
+                    ],
+                    reduction_cost,
+                )
+            return [], reduction_cost
+        else:
+            # Set the output schema to be the keys found in the compare_prompt
+            compare_prompt_keys = extract_jinja_variables(
+                self.config["comparison_prompt"]
+            )
+            # Get the set of keys in the compare_prompt
+            compare_prompt_keys = set(
+                [
+                    k.replace("input1.", "")
+                    for k in compare_prompt_keys
+                    if "input1" in k
+                ]
+            )
+
+            # For each key in the output schema, find the most similar key in the compare_prompt
+            output_keys = set(self.config["output"]["schema"].keys())
+            key_mapping = {}
+            for output_key in output_keys:
+                best_match = None
+                best_score = 0
+                for compare_key in compare_prompt_keys:
+                    score = sum(
+                        c1 == c2 for c1, c2 in zip(output_key, compare_key)
+                    ) / max(len(output_key), len(compare_key))
+                    if score > best_score:
+                        best_score = score
+                        best_match = compare_key
+                key_mapping[output_key] = best_match
+
+            # Create the result dictionary using the key mapping
+            result = input_data[list(cluster)[0]].copy()
+            result[f"_kv_pairs_preresolve_{self.config['name']}"] = {
+                ok: result[ck] for ok, ck in key_mapping.items() if ck in result
+            }
+            for output_key, compare_key in key_mapping.items():
+                if compare_key in input_data[list(cluster)[0]]:
+                    result[output_key] = input_data[list(cluster)[0]][compare_key]
+                elif output_key in input_data[list(cluster)[0]]:
+                    result[output_key] = input_data[list(cluster)[0]][output_key]
+                else:
+                    result[output_key] = None  # or some default value
+
+            return [result], 0
+
+    # Calculate the number of records before and clusters after
+    num_records_before = len(input_data)
+    num_clusters_after = len(final_clusters)
+    self.console.log(f"Number of keys before resolution: {num_records_before}")
+    self.console.log(
+        f"Number of distinct keys after resolution: {num_clusters_after}"
+    )
+
+    # If no resolution prompt is provided, we can skip the resolution phase
+    # And simply select the most common value for each key
+    if not self.config.get("resolution_prompt", None):
+        for cluster in final_clusters:
+            if len(cluster) > 1:
+                for key in self.config["output"]["keys"]:
+                    most_common_value = max(
+                        set(input_data[i][key] for i in cluster),
+                        key=lambda x: sum(
+                            1 for i in cluster if input_data[i][key] == x
+                        ),
+                    )
+                    for i in cluster:
+                        input_data[i][key] = most_common_value
+        results = input_data
+    else:
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(process_cluster, cluster)
+                for cluster in final_clusters
+            ]
+            for future in rich_as_completed(
+                futures,
+                total=len(futures),
+                desc="Determining resolved key for each group of equivalent keys",
+                console=self.console,
+            ):
+                cluster_results, cluster_cost = future.result()
+                results.extend(cluster_results)
+                total_cost += cluster_cost
+
+    total_pairs = len(input_data) * (len(input_data) - 1) // 2
+    true_match_count = sum(
+        len(cluster) * (len(cluster) - 1) // 2
+        for cluster in final_clusters
+        if len(cluster) > 1
+    )
+    true_match_selectivity = (
+        true_match_count / total_pairs if total_pairs > 0 else 0
+    )
+    self.console.log(f"Self-join selectivity: {true_match_selectivity:.4f}")
+
+    if self.status:
+        self.status.start()
+
+    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the ResolveOperation for required keys and valid structure.

+

This method performs the following checks:
+1. Verifies the presence of required keys: 'comparison_prompt' and 'output'.
+2. Ensures 'output' contains a 'schema' key.
+3. Validates that 'schema' in 'output' is a non-empty dictionary.
+4. Checks if 'comparison_prompt' is a valid Jinja2 template with 'input1' and 'input2' variables.
+5. If 'resolution_prompt' is present, verifies it as a valid Jinja2 template with 'inputs' variable.
+6. Optionally checks if 'model' is a string (if present).
+7. Optionally checks 'blocking_keys' (if present, further checks are performed).

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
Type | Description
+ ValueError + +
+

If required keys are missing, if templates are invalid or missing required variables, or if any other configuration aspect is incorrect or inconsistent.

+
+
+ TypeError + +
+

If the types of configuration values are incorrect, such as 'schema' not being a dict or 'model' not being a string.

+
+
+ +
+ Source code in docetl/operations/resolve.py +
 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
def syntax_check(self) -> None:
+    """
+    Checks the configuration of the ResolveOperation for required keys and valid structure.
+
+    This method performs the following checks:
+    1. Verifies the presence of required keys: 'comparison_prompt' and 'output'.
+    2. Ensures 'output' contains a 'schema' key.
+    3. Validates that 'schema' in 'output' is a non-empty dictionary.
+    4. Checks if 'comparison_prompt' is a valid Jinja2 template with 'input1' and 'input2' variables.
+    5. If 'resolution_prompt' is present, verifies it as a valid Jinja2 template with 'inputs' variable.
+    6. Optionally checks if 'model' is a string (if present).
+    7. Optionally checks 'blocking_keys' (if present, further checks are performed).
+
+    Raises:
+        ValueError: If required keys are missing, if templates are invalid or missing required variables,
+                    or if any other configuration aspect is incorrect or inconsistent.
+        TypeError: If the types of configuration values are incorrect, such as 'schema' not being a dict
+                   or 'model' not being a string.
+    """
+    required_keys = ["comparison_prompt", "output"]
+    for key in required_keys:
+        if key not in self.config:
+            raise ValueError(
+                f"Missing required key '{key}' in ResolveOperation configuration"
+            )
+
+    if "schema" not in self.config["output"] and not self.runner._from_df_accessors:
+        raise ValueError("Missing 'schema' in 'output' configuration")
+    elif not self.runner._from_df_accessors:
+        if not isinstance(self.config["output"]["schema"], dict):
+            raise TypeError(
+                "'schema' in 'output' configuration must be a dictionary"
+            )
+
+        if not self.config["output"]["schema"]:
+            raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+    # Check if the comparison_prompt is a valid Jinja2 template
+    try:
+        comparison_template = Template(self.config["comparison_prompt"])
+        comparison_vars = comparison_template.environment.parse(
+            self.config["comparison_prompt"]
+        ).find_all(jinja2.nodes.Name)
+        comparison_var_names = {var.name for var in comparison_vars}
+        if (
+            "input1" not in comparison_var_names
+            or "input2" not in comparison_var_names
+        ):
+            raise ValueError(
+                f"'comparison_prompt' must contain both 'input1' and 'input2' variables. {self.config['comparison_prompt']}"
+            )
+
+        if "resolution_prompt" in self.config:
+            reduction_template = Template(self.config["resolution_prompt"])
+            reduction_vars = reduction_template.environment.parse(
+                self.config["resolution_prompt"]
+            ).find_all(jinja2.nodes.Name)
+            reduction_var_names = {var.name for var in reduction_vars}
+            if "inputs" not in reduction_var_names:
+                raise ValueError(
+                    "'resolution_prompt' must contain 'inputs' variable"
+                )
+    except Exception as e:
+        raise ValueError(f"Invalid Jinja2 template: {str(e)}")
+
+    # Check if the model is specified (optional)
+    if "model" in self.config and not isinstance(self.config["model"], str):
+        raise TypeError("'model' in configuration must be a string")
+
+    # Check blocking_keys (optional)
+    if "blocking_keys" in self.config:
+        if not isinstance(self.config["blocking_keys"], list):
+            raise TypeError("'blocking_keys' must be a list")
+        if not all(isinstance(key, str) for key in self.config["blocking_keys"]):
+            raise TypeError("All items in 'blocking_keys' must be strings")
+
+    # Check blocking_threshold (optional)
+    if "blocking_threshold" in self.config:
+        if not isinstance(self.config["blocking_threshold"], (int, float)):
+            raise TypeError("'blocking_threshold' must be a number")
+        if not 0 <= self.config["blocking_threshold"] <= 1:
+            raise ValueError("'blocking_threshold' must be between 0 and 1")
+
+    # Check blocking_conditions (optional)
+    if "blocking_conditions" in self.config:
+        if not isinstance(self.config["blocking_conditions"], list):
+            raise TypeError("'blocking_conditions' must be a list")
+        if not all(
+            isinstance(cond, str) for cond in self.config["blocking_conditions"]
+        ):
+            raise TypeError("All items in 'blocking_conditions' must be strings")
+
+    # Check if input schema is provided and valid (optional)
+    if "input" in self.config:
+        if "schema" not in self.config["input"]:
+            raise ValueError("Missing 'schema' in 'input' configuration")
+        if not isinstance(self.config["input"]["schema"], dict):
+            raise TypeError(
+                "'schema' in 'input' configuration must be a dictionary"
+            )
+
+    # Check limit_comparisons (optional)
+    if "limit_comparisons" in self.config:
+        if not isinstance(self.config["limit_comparisons"], int):
+            raise TypeError("'limit_comparisons' must be an integer")
+        if self.config["limit_comparisons"] <= 0:
+            raise ValueError("'limit_comparisons' must be a positive integer")
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.reduce.ReduceOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +

A class that implements a reduce operation on input data using language models.

+

This class extends BaseOperation to provide functionality for reducing grouped data
+using various strategies including batch reduce, incremental reduce, and parallel fold and merge.

+ + + + + + +
+ Source code in docetl/operations/reduce.py +
  31
+  32
+  33
+  34
+  35
+  36
+  37
+  38
+  39
+  40
+  41
+  42
+  43
+  44
+  45
+  46
+  47
+  48
+  49
+  50
+  51
+  52
+  53
+  54
+  55
+  56
+  57
+  58
+  59
+  60
+  61
+  62
+  63
+  64
+  65
+  66
+  67
+  68
+  69
+  70
+  71
+  72
+  73
+  74
+  75
+  76
+  77
+  78
+  79
+  80
+  81
+  82
+  83
+  84
+  85
+  86
+  87
+  88
+  89
+  90
+  91
+  92
+  93
+  94
+  95
+  96
+  97
+  98
+  99
+ 100
+ 101
+ 102
+ 103
+ 104
+ 105
+ 106
+ 107
+ 108
+ 109
+ 110
+ 111
+ 112
+ 113
+ 114
+ 115
+ 116
+ 117
+ 118
+ 119
+ 120
+ 121
+ 122
+ 123
+ 124
+ 125
+ 126
+ 127
+ 128
+ 129
+ 130
+ 131
+ 132
+ 133
+ 134
+ 135
+ 136
+ 137
+ 138
+ 139
+ 140
+ 141
+ 142
+ 143
+ 144
+ 145
+ 146
+ 147
+ 148
+ 149
+ 150
+ 151
+ 152
+ 153
+ 154
+ 155
+ 156
+ 157
+ 158
+ 159
+ 160
+ 161
+ 162
+ 163
+ 164
+ 165
+ 166
+ 167
+ 168
+ 169
+ 170
+ 171
+ 172
+ 173
+ 174
+ 175
+ 176
+ 177
+ 178
+ 179
+ 180
+ 181
+ 182
+ 183
+ 184
+ 185
+ 186
+ 187
+ 188
+ 189
+ 190
+ 191
+ 192
+ 193
+ 194
+ 195
+ 196
+ 197
+ 198
+ 199
+ 200
+ 201
+ 202
+ 203
+ 204
+ 205
+ 206
+ 207
+ 208
+ 209
+ 210
+ 211
+ 212
+ 213
+ 214
+ 215
+ 216
+ 217
+ 218
+ 219
+ 220
+ 221
+ 222
+ 223
+ 224
+ 225
+ 226
+ 227
+ 228
+ 229
+ 230
+ 231
+ 232
+ 233
+ 234
+ 235
+ 236
+ 237
+ 238
+ 239
+ 240
+ 241
+ 242
+ 243
+ 244
+ 245
+ 246
+ 247
+ 248
+ 249
+ 250
+ 251
+ 252
+ 253
+ 254
+ 255
+ 256
+ 257
+ 258
+ 259
+ 260
+ 261
+ 262
+ 263
+ 264
+ 265
+ 266
+ 267
+ 268
+ 269
+ 270
+ 271
+ 272
+ 273
+ 274
+ 275
+ 276
+ 277
+ 278
+ 279
+ 280
+ 281
+ 282
+ 283
+ 284
+ 285
+ 286
+ 287
+ 288
+ 289
+ 290
+ 291
+ 292
+ 293
+ 294
+ 295
+ 296
+ 297
+ 298
+ 299
+ 300
+ 301
+ 302
+ 303
+ 304
+ 305
+ 306
+ 307
+ 308
+ 309
+ 310
+ 311
+ 312
+ 313
+ 314
+ 315
+ 316
+ 317
+ 318
+ 319
+ 320
+ 321
+ 322
+ 323
+ 324
+ 325
+ 326
+ 327
+ 328
+ 329
+ 330
+ 331
+ 332
+ 333
+ 334
+ 335
+ 336
+ 337
+ 338
+ 339
+ 340
+ 341
+ 342
+ 343
+ 344
+ 345
+ 346
+ 347
+ 348
+ 349
+ 350
+ 351
+ 352
+ 353
+ 354
+ 355
+ 356
+ 357
+ 358
+ 359
+ 360
+ 361
+ 362
+ 363
+ 364
+ 365
+ 366
+ 367
+ 368
+ 369
+ 370
+ 371
+ 372
+ 373
+ 374
+ 375
+ 376
+ 377
+ 378
+ 379
+ 380
+ 381
+ 382
+ 383
+ 384
+ 385
+ 386
+ 387
+ 388
+ 389
+ 390
+ 391
+ 392
+ 393
+ 394
+ 395
+ 396
+ 397
+ 398
+ 399
+ 400
+ 401
+ 402
+ 403
+ 404
+ 405
+ 406
+ 407
+ 408
+ 409
+ 410
+ 411
+ 412
+ 413
+ 414
+ 415
+ 416
+ 417
+ 418
+ 419
+ 420
+ 421
+ 422
+ 423
+ 424
+ 425
+ 426
+ 427
+ 428
+ 429
+ 430
+ 431
+ 432
+ 433
+ 434
+ 435
+ 436
+ 437
+ 438
+ 439
+ 440
+ 441
+ 442
+ 443
+ 444
+ 445
+ 446
+ 447
+ 448
+ 449
+ 450
+ 451
+ 452
+ 453
+ 454
+ 455
+ 456
+ 457
+ 458
+ 459
+ 460
+ 461
+ 462
+ 463
+ 464
+ 465
+ 466
+ 467
+ 468
+ 469
+ 470
+ 471
+ 472
+ 473
+ 474
+ 475
+ 476
+ 477
+ 478
+ 479
+ 480
+ 481
+ 482
+ 483
+ 484
+ 485
+ 486
+ 487
+ 488
+ 489
+ 490
+ 491
+ 492
+ 493
+ 494
+ 495
+ 496
+ 497
+ 498
+ 499
+ 500
+ 501
+ 502
+ 503
+ 504
+ 505
+ 506
+ 507
+ 508
+ 509
+ 510
+ 511
+ 512
+ 513
+ 514
+ 515
+ 516
+ 517
+ 518
+ 519
+ 520
+ 521
+ 522
+ 523
+ 524
+ 525
+ 526
+ 527
+ 528
+ 529
+ 530
+ 531
+ 532
+ 533
+ 534
+ 535
+ 536
+ 537
+ 538
+ 539
+ 540
+ 541
+ 542
+ 543
+ 544
+ 545
+ 546
+ 547
+ 548
+ 549
+ 550
+ 551
+ 552
+ 553
+ 554
+ 555
+ 556
+ 557
+ 558
+ 559
+ 560
+ 561
+ 562
+ 563
+ 564
+ 565
+ 566
+ 567
+ 568
+ 569
+ 570
+ 571
+ 572
+ 573
+ 574
+ 575
+ 576
+ 577
+ 578
+ 579
+ 580
+ 581
+ 582
+ 583
+ 584
+ 585
+ 586
+ 587
+ 588
+ 589
+ 590
+ 591
+ 592
+ 593
+ 594
+ 595
+ 596
+ 597
+ 598
+ 599
+ 600
+ 601
+ 602
+ 603
+ 604
+ 605
+ 606
+ 607
+ 608
+ 609
+ 610
+ 611
+ 612
+ 613
+ 614
+ 615
+ 616
+ 617
+ 618
+ 619
+ 620
+ 621
+ 622
+ 623
+ 624
+ 625
+ 626
+ 627
+ 628
+ 629
+ 630
+ 631
+ 632
+ 633
+ 634
+ 635
+ 636
+ 637
+ 638
+ 639
+ 640
+ 641
+ 642
+ 643
+ 644
+ 645
+ 646
+ 647
+ 648
+ 649
+ 650
+ 651
+ 652
+ 653
+ 654
+ 655
+ 656
+ 657
+ 658
+ 659
+ 660
+ 661
+ 662
+ 663
+ 664
+ 665
+ 666
+ 667
+ 668
+ 669
+ 670
+ 671
+ 672
+ 673
+ 674
+ 675
+ 676
+ 677
+ 678
+ 679
+ 680
+ 681
+ 682
+ 683
+ 684
+ 685
+ 686
+ 687
+ 688
+ 689
+ 690
+ 691
+ 692
+ 693
+ 694
+ 695
+ 696
+ 697
+ 698
+ 699
+ 700
+ 701
+ 702
+ 703
+ 704
+ 705
+ 706
+ 707
+ 708
+ 709
+ 710
+ 711
+ 712
+ 713
+ 714
+ 715
+ 716
+ 717
+ 718
+ 719
+ 720
+ 721
+ 722
+ 723
+ 724
+ 725
+ 726
+ 727
+ 728
+ 729
+ 730
+ 731
+ 732
+ 733
+ 734
+ 735
+ 736
+ 737
+ 738
+ 739
+ 740
+ 741
+ 742
+ 743
+ 744
+ 745
+ 746
+ 747
+ 748
+ 749
+ 750
+ 751
+ 752
+ 753
+ 754
+ 755
+ 756
+ 757
+ 758
+ 759
+ 760
+ 761
+ 762
+ 763
+ 764
+ 765
+ 766
+ 767
+ 768
+ 769
+ 770
+ 771
+ 772
+ 773
+ 774
+ 775
+ 776
+ 777
+ 778
+ 779
+ 780
+ 781
+ 782
+ 783
+ 784
+ 785
+ 786
+ 787
+ 788
+ 789
+ 790
+ 791
+ 792
+ 793
+ 794
+ 795
+ 796
+ 797
+ 798
+ 799
+ 800
+ 801
+ 802
+ 803
+ 804
+ 805
+ 806
+ 807
+ 808
+ 809
+ 810
+ 811
+ 812
+ 813
+ 814
+ 815
+ 816
+ 817
+ 818
+ 819
+ 820
+ 821
+ 822
+ 823
+ 824
+ 825
+ 826
+ 827
+ 828
+ 829
+ 830
+ 831
+ 832
+ 833
+ 834
+ 835
+ 836
+ 837
+ 838
+ 839
+ 840
+ 841
+ 842
+ 843
+ 844
+ 845
+ 846
+ 847
+ 848
+ 849
+ 850
+ 851
+ 852
+ 853
+ 854
+ 855
+ 856
+ 857
+ 858
+ 859
+ 860
+ 861
+ 862
+ 863
+ 864
+ 865
+ 866
+ 867
+ 868
+ 869
+ 870
+ 871
+ 872
+ 873
+ 874
+ 875
+ 876
+ 877
+ 878
+ 879
+ 880
+ 881
+ 882
+ 883
+ 884
+ 885
+ 886
+ 887
+ 888
+ 889
+ 890
+ 891
+ 892
+ 893
+ 894
+ 895
+ 896
+ 897
+ 898
+ 899
+ 900
+ 901
+ 902
+ 903
+ 904
+ 905
+ 906
+ 907
+ 908
+ 909
+ 910
+ 911
+ 912
+ 913
+ 914
+ 915
+ 916
+ 917
+ 918
+ 919
+ 920
+ 921
+ 922
+ 923
+ 924
+ 925
+ 926
+ 927
+ 928
+ 929
+ 930
+ 931
+ 932
+ 933
+ 934
+ 935
+ 936
+ 937
+ 938
+ 939
+ 940
+ 941
+ 942
+ 943
+ 944
+ 945
+ 946
+ 947
+ 948
+ 949
+ 950
+ 951
+ 952
+ 953
+ 954
+ 955
+ 956
+ 957
+ 958
+ 959
+ 960
+ 961
+ 962
+ 963
+ 964
+ 965
+ 966
+ 967
+ 968
+ 969
+ 970
+ 971
+ 972
+ 973
+ 974
+ 975
+ 976
+ 977
+ 978
+ 979
+ 980
+ 981
+ 982
+ 983
+ 984
+ 985
+ 986
+ 987
+ 988
+ 989
+ 990
+ 991
+ 992
+ 993
+ 994
+ 995
+ 996
+ 997
+ 998
+ 999
+1000
+1001
+1002
+1003
+1004
+1005
+1006
+1007
+1008
+1009
+1010
+1011
+1012
+1013
+1014
+1015
+1016
+1017
+1018
class ReduceOperation(BaseOperation):
+    """
+    A class that implements a reduce operation on input data using language models.
+
+    This class extends BaseOperation to provide functionality for reducing grouped data
+    using various strategies including batch reduce, incremental reduce, and parallel fold and merge.
+    """
+
+    class schema(BaseOperation.schema):
+        # Declarative description of the configuration keys accepted by a reduce
+        # operation. Only `reduce_key` has no default.
+        type: str = "reduce"
+        # Key(s) to group by. __init__ wraps a bare string in a list, and execute()
+        # treats ["_all"] as "put every item in one group".
+        reduce_key: Union[str, List[str]]
+        # syntax_check() requires a non-empty dict under "schema"; an optional
+        # "lineage" entry must be a list of strings.
+        output: Optional[Dict[str, Any]] = None
+        # Jinja2 template; syntax_check() requires it to reference `inputs`.
+        prompt: Optional[str] = None
+        optimize: Optional[bool] = None
+        synthesize_resolve: Optional[bool] = None
+        model: Optional[str] = None
+        # If present, syntax_check() requires a dict under "schema"; execute()
+        # projects each group item onto that schema's keys.
+        input: Optional[Dict[str, Any]] = None
+        # When truthy, execute() copies fields of the first group element that are
+        # neither in the output schema nor already in the result.
+        pass_through: Optional[bool] = None
+        # execute() only takes the merge-based plan when this is truthy
+        # (it is treated as True when the key is absent).
+        associative: Optional[bool] = None
+        # Jinja2 template; syntax_check() requires `inputs` and `output` variables
+        # and a positive integer `fold_batch_size` alongside it.
+        fold_prompt: Optional[str] = None
+        fold_batch_size: Optional[int] = None
+        # Dict with "enabled", and when enabled "sample_size" and "method".
+        value_sampling: Optional[Dict[str, Any]] = None
+        verbose: Optional[bool] = None
+        timeout: Optional[int] = None
+        litellm_completion_kwargs: Dict[str, Any] = Field(default_factory=dict)
+        # When truthy, execute() attaches the prompts used for a group under
+        # `_observability_<name>` in that group's result.
+        enable_observability: bool = False
+
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the ReduceOperation.
+
+        Args:
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+        """
+        super().__init__(*args, **kwargs)
+        self.min_samples = 5
+        self.max_samples = 1000
+        self.fold_times = deque(maxlen=self.max_samples)
+        self.merge_times = deque(maxlen=self.max_samples)
+        self.lock = Lock()
+        self.config["reduce_key"] = (
+            [self.config["reduce_key"]]
+            if isinstance(self.config["reduce_key"], str)
+            else self.config["reduce_key"]
+        )
+        self.intermediates = {}
+        self.lineage_keys = self.config.get("output", {}).get("lineage", [])
+
+    def syntax_check(self) -> None:
+        """
+        Perform comprehensive syntax checks on the configuration of the ReduceOperation.
+
+        This method validates the presence and correctness of all required configuration keys, Jinja2 templates, and ensures the correct
+        structure and types of the entire configuration.
+
+        The method performs the following checks:
+        1. Verifies the presence of all required keys in the configuration.
+        2. Validates the structure and content of the 'output' configuration, including its 'schema'.
+        3. Checks if the main 'prompt' is a valid Jinja2 template and contains the required 'inputs' variable.
+        4. If 'merge_prompt' is specified, ensures that 'fold_prompt' is also present.
+        5. If 'fold_prompt' is present, verifies the existence of 'fold_batch_size'.
+        6. Validates the 'fold_prompt' as a Jinja2 template with required variables 'inputs' and 'output'.
+        7. If present, checks 'merge_prompt' as a valid Jinja2 template with required 'outputs' variable.
+        8. Verifies types of various configuration inputs (e.g., 'fold_batch_size' as int).
+        9. Checks for the presence and validity of optional configurations like 'model'.
+
+        Raises:
+            ValueError: If any required configuration is missing, if templates are invalid or missing required
+                        variables, or if any other configuration aspect is incorrect or inconsistent.
+            TypeError: If any configuration value has an incorrect type, such as 'schema' not being a dict
+                       or 'fold_batch_size' not being an integer.
+        """
+        required_keys = ["reduce_key", "prompt", "output"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in {self.config['name']} configuration"
+                )
+
+        if "schema" not in self.config["output"]:
+            raise ValueError(
+                f"Missing 'schema' in {self.config['name']} 'output' configuration"
+            )
+
+        if not isinstance(self.config["output"]["schema"], dict):
+            raise TypeError(
+                f"'schema' in {self.config['name']} 'output' configuration must be a dictionary"
+            )
+
+        if not self.config["output"]["schema"]:
+            raise ValueError(
+                f"'schema' in {self.config['name']} 'output' configuration cannot be empty"
+            )
+
+        # Check if the prompt is a valid Jinja2 template
+        try:
+            template = Template(self.config["prompt"])
+            template_vars = template.environment.parse(self.config["prompt"]).find_all(
+                jinja2.nodes.Name
+            )
+            template_var_names = {var.name for var in template_vars}
+            if "inputs" not in template_var_names:
+                raise ValueError(
+                    f"Prompt template for {self.config['name']} must include the 'inputs' variable"
+                )
+        except Exception as e:
+            raise ValueError(
+                f"Invalid Jinja2 template in {self.config['name']} 'prompt': {str(e)}"
+            )
+
+        # Check if fold_prompt is a valid Jinja2 template (now required if merge exists)
+        if "merge_prompt" in self.config:
+            if "fold_prompt" not in self.config:
+                raise ValueError(
+                    f"'fold_prompt' is required when 'merge_prompt' is specified in {self.config['name']}"
+                )
+
+        if "fold_prompt" in self.config:
+            if "fold_batch_size" not in self.config:
+                raise ValueError(
+                    f"'fold_batch_size' is required when 'fold_prompt' is specified in {self.config['name']}"
+                )
+
+            try:
+                fold_template = Template(self.config["fold_prompt"])
+                fold_template_vars = fold_template.environment.parse(
+                    self.config["fold_prompt"]
+                ).find_all(jinja2.nodes.Name)
+                fold_template_var_names = {var.name for var in fold_template_vars}
+                required_vars = {"inputs", "output"}
+                if not required_vars.issubset(fold_template_var_names):
+                    raise ValueError(
+                        f"Fold template in {self.config['name']} must include variables: {required_vars}. Current template includes: {fold_template_var_names}"
+                    )
+            except Exception as e:
+                raise ValueError(
+                    f"Invalid Jinja2 template in {self.config['name']} 'fold_prompt': {str(e)}"
+                )
+
+        # Check merge_prompt and merge_batch_size
+        if "merge_prompt" in self.config:
+            if "merge_batch_size" not in self.config:
+                raise ValueError(
+                    f"'merge_batch_size' is required when 'merge_prompt' is specified in {self.config['name']}"
+                )
+
+            try:
+                merge_template = Template(self.config["merge_prompt"])
+                merge_template_vars = merge_template.environment.parse(
+                    self.config["merge_prompt"]
+                ).find_all(jinja2.nodes.Name)
+                merge_template_var_names = {var.name for var in merge_template_vars}
+                if "outputs" not in merge_template_var_names:
+                    raise ValueError(
+                        f"Merge template in {self.config['name']} must include the 'outputs' variable"
+                    )
+            except Exception as e:
+                raise ValueError(
+                    f"Invalid Jinja2 template in {self.config['name']} 'merge_prompt': {str(e)}"
+                )
+
+        # Check if the model is specified (optional)
+        if "model" in self.config and not isinstance(self.config["model"], str):
+            raise TypeError(
+                f"'model' in {self.config['name']} configuration must be a string"
+            )
+
+        # Check if reduce_key is a string or a list of strings
+        if not isinstance(self.config["reduce_key"], (str, list)):
+            raise TypeError(
+                f"'reduce_key' in {self.config['name']} configuration must be a string or a list of strings"
+            )
+        if isinstance(self.config["reduce_key"], list):
+            if not all(isinstance(key, str) for key in self.config["reduce_key"]):
+                raise TypeError(
+                    f"All elements in 'reduce_key' list in {self.config['name']} configuration must be strings"
+                )
+
+        # Check if input schema is provided and valid (optional)
+        if "input" in self.config:
+            if "schema" not in self.config["input"]:
+                raise ValueError(
+                    f"Missing 'schema' in {self.config['name']} 'input' configuration"
+                )
+            if not isinstance(self.config["input"]["schema"], dict):
+                raise TypeError(
+                    f"'schema' in {self.config['name']} 'input' configuration must be a dictionary"
+                )
+
+        # Check if fold_batch_size and merge_batch_size are positive integers
+        for key in ["fold_batch_size", "merge_batch_size"]:
+            if key in self.config:
+                if not isinstance(self.config[key], int) or self.config[key] <= 0:
+                    raise ValueError(
+                        f"'{key}' in {self.config['name']} configuration must be a positive integer"
+                    )
+
+        if "value_sampling" in self.config:
+            sampling = self.config["value_sampling"]
+            if not isinstance(sampling, dict):
+                raise TypeError(
+                    f"'value_sampling' in {self.config['name']} configuration must be a dictionary"
+                )
+
+            if "enabled" not in sampling:
+                raise ValueError(
+                    f"'enabled' is required in {self.config['name']} 'value_sampling' configuration"
+                )
+            if not isinstance(sampling["enabled"], bool):
+                raise TypeError(
+                    f"'enabled' in {self.config['name']} 'value_sampling' configuration must be a boolean"
+                )
+
+            if sampling["enabled"]:
+                if "sample_size" not in sampling:
+                    raise ValueError(
+                        f"'sample_size' is required when value_sampling is enabled in {self.config['name']}"
+                    )
+                if (
+                    not isinstance(sampling["sample_size"], int)
+                    or sampling["sample_size"] <= 0
+                ):
+                    raise ValueError(
+                        f"'sample_size' in {self.config['name']} configuration must be a positive integer"
+                    )
+
+                if "method" not in sampling:
+                    raise ValueError(
+                        f"'method' is required when value_sampling is enabled in {self.config['name']}"
+                    )
+                if sampling["method"] not in [
+                    "random",
+                    "first_n",
+                    "cluster",
+                    "sem_sim",
+                ]:
+                    raise ValueError(
+                        f"Invalid 'method'. Must be 'random', 'first_n', or 'embedding' in {self.config['name']}"
+                    )
+
+                if sampling["method"] == "embedding":
+                    if "embedding_model" not in sampling:
+                        raise ValueError(
+                            f"'embedding_model' is required when using embedding-based sampling in {self.config['name']}"
+                        )
+                    if "embedding_keys" not in sampling:
+                        raise ValueError(
+                            f"'embedding_keys' is required when using embedding-based sampling in {self.config['name']}"
+                        )
+
+        # Check if lineage is a list of strings
+        if "lineage" in self.config.get("output", {}):
+            if not isinstance(self.config["output"]["lineage"], list):
+                raise TypeError(
+                    f"'lineage' in {self.config['name']} 'output' configuration must be a list"
+                )
+            if not all(
+                isinstance(key, str) for key in self.config["output"]["lineage"]
+            ):
+                raise TypeError(
+                    f"All elements in 'lineage' list in {self.config['name']} 'output' configuration must be strings"
+                )
+
+        self.gleaning_check()
+
+    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+        """
+        Execute the reduce operation on the provided input data.
+
+        This method sorts and groups the input data by the reduce key(s), then processes each group
+        using either parallel fold and merge, incremental reduce, or batch reduce strategies.
+
+        Args:
+            input_data (List[Dict]): The input data to process.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+        """
+        if self.config.get("gleaning", {}).get("validation_prompt", None):
+            self.console.log(
+                f"Using gleaning with validation prompt: {self.config.get('gleaning', {}).get('validation_prompt', '')}"
+            )
+
+        reduce_keys = self.config["reduce_key"]
+        if isinstance(reduce_keys, str):
+            reduce_keys = [reduce_keys]
+        input_schema = self.config.get("input", {}).get("schema", {})
+
+        if self.status:
+            self.status.stop()
+
+        # Check if we need to group everything into one group
+        if reduce_keys == ["_all"] or reduce_keys == "_all":
+            grouped_data = [("_all", input_data)]
+        else:
+            # Group the input data by the reduce key(s) while maintaining original order
+            def get_group_key(item):
+                key_values = []
+                for key in reduce_keys:
+                    value = item[key]
+                    # Special handling for list-type values
+                    if isinstance(value, list):
+                        key_values.append(
+                            tuple(sorted(value))
+                        )  # Convert list to sorted tuple
+                    else:
+                        key_values.append(value)
+                return tuple(key_values)
+
+            grouped_data = {}
+            for item in input_data:
+                key = get_group_key(item)
+                if key not in grouped_data:
+                    grouped_data[key] = []
+                grouped_data[key].append(item)
+
+            # Convert the grouped data to a list of tuples
+            grouped_data = list(grouped_data.items())
+
+        def process_group(
+            key: Tuple, group_elems: List[Dict]
+        ) -> Tuple[Optional[Dict], float]:
+            if input_schema:
+                group_list = [
+                    {k: item[k] for k in input_schema.keys() if k in item}
+                    for item in group_elems
+                ]
+            else:
+                group_list = group_elems
+
+            total_cost = 0.0
+
+            # Apply value sampling if enabled
+            value_sampling = self.config.get("value_sampling", {})
+            if value_sampling.get("enabled", False):
+                sample_size = min(value_sampling["sample_size"], len(group_list))
+                method = value_sampling["method"]
+
+                if method == "random":
+                    group_sample = random.sample(group_list, sample_size)
+                    group_sample.sort(key=lambda x: group_list.index(x))
+                elif method == "first_n":
+                    group_sample = group_list[:sample_size]
+                elif method == "cluster":
+                    group_sample, embedding_cost = self._cluster_based_sampling(
+                        group_list, value_sampling, sample_size
+                    )
+                    group_sample.sort(key=lambda x: group_list.index(x))
+                    total_cost += embedding_cost
+                elif method == "sem_sim":
+                    group_sample, embedding_cost = self._semantic_similarity_sampling(
+                        key, group_list, value_sampling, sample_size
+                    )
+                    group_sample.sort(key=lambda x: group_list.index(x))
+                    total_cost += embedding_cost
+
+                group_list = group_sample
+
+            # Only execute merge-based plans if associative = True
+            if "merge_prompt" in self.config and self.config.get("associative", True):
+                result, prompts, cost = self._parallel_fold_and_merge(key, group_list)
+            elif self.config.get("fold_batch_size", None) and self.config.get(
+                "fold_batch_size"
+            ) >= len(group_list):
+                # If the fold batch size is greater than or equal to the number of items in the group,
+                # we can just run a single fold operation
+                result, prompt, cost = self._batch_reduce(key, group_list)
+                prompts = [prompt]
+            elif "fold_prompt" in self.config:
+                result, prompts, cost = self._incremental_reduce(key, group_list)
+            else:
+                result, prompt, cost = self._batch_reduce(key, group_list)
+                prompts = [prompt]
+
+            total_cost += cost
+
+            # Add the counts of items in the group to the result
+            result[f"_counts_prereduce_{self.config['name']}"] = len(group_elems)
+
+            if self.config.get("enable_observability", False):
+                # Add the _observability_{self.config['name']} key to the result
+                result[f"_observability_{self.config['name']}"] = {"prompts": prompts}
+
+            # Apply pass-through at the group level
+            if (
+                result is not None
+                and self.config.get("pass_through", False)
+                and group_elems
+            ):
+                for k, v in group_elems[0].items():
+                    if k not in self.config["output"]["schema"] and k not in result:
+                        result[k] = v
+
+            # Add lineage information
+            if result is not None and self.lineage_keys:
+                lineage = []
+                for item in group_elems:
+                    lineage_item = {
+                        k: item.get(k) for k in self.lineage_keys if k in item
+                    }
+                    if lineage_item:
+                        lineage.append(lineage_item)
+                result[f"{self.config['name']}_lineage"] = lineage
+
+            return result, total_cost
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(process_group, key, group)
+                for key, group in grouped_data
+            ]
+            results = []
+            total_cost = 0
+            for future in rich_as_completed(
+                futures,
+                total=len(futures),
+                desc=f"Processing {self.config['name']} (reduce) on all documents",
+                leave=True,
+                console=self.console,
+            ):
+                output, item_cost = future.result()
+                total_cost += item_cost
+                if output is not None:
+                    results.append(output)
+
+        if self.config.get("persist_intermediates", False):
+            for result in results:
+                key = tuple(result[k] for k in self.config["reduce_key"])
+                if key in self.intermediates:
+                    result[f"_{self.config['name']}_intermediates"] = (
+                        self.intermediates[key]
+                    )
+
+        if self.status:
+            self.status.start()
+
+        return results, total_cost
+
+    def _cluster_based_sampling(
+        self, group_list: List[Dict], value_sampling: Dict, sample_size: int
+    ) -> Tuple[List[Dict], float]:
+        if sample_size >= len(group_list):
+            return group_list, 0
+
+        clusters, cost = cluster_documents(
+            group_list, value_sampling, sample_size, self.runner.api
+        )
+
+        sampled_items = []
+        idx_added_already = set()
+        num_clusters = len(clusters)
+        for i in range(sample_size):
+            # Add a random item from the cluster
+            idx = i % num_clusters
+
+            # Skip if there are no items in the cluster
+            if len(clusters[idx]) == 0:
+                continue
+
+            if len(clusters[idx]) == 1:
+                # If there's only one item in the cluster, add it directly if we haven't already
+                if idx not in idx_added_already:
+                    sampled_items.append(clusters[idx][0])
+                continue
+
+            random_choice_idx = random.randint(0, len(clusters[idx]) - 1)
+            max_attempts = 10
+            while random_choice_idx in idx_added_already and max_attempts > 0:
+                random_choice_idx = random.randint(0, len(clusters[idx]) - 1)
+                max_attempts -= 1
+            idx_added_already.add(random_choice_idx)
+            sampled_items.append(clusters[idx][random_choice_idx])
+
+        return sampled_items, cost
+
+    def _semantic_similarity_sampling(
+        self, key: Tuple, group_list: List[Dict], value_sampling: Dict, sample_size: int
+    ) -> Tuple[List[Dict], float]:
+        embedding_model = value_sampling["embedding_model"]
+        query_text = strict_render(
+            value_sampling["query_text"],
+            {"reduce_key": dict(zip(self.config["reduce_key"], key))},
+        )
+
+        embeddings, cost = get_embeddings_for_clustering(
+            group_list, value_sampling, self.runner.api
+        )
+
+        query_response = self.runner.api.gen_embedding(embedding_model, [query_text])
+        query_embedding = query_response["data"][0]["embedding"]
+        cost += completion_cost(query_response)
+
+        from sklearn.metrics.pairwise import cosine_similarity
+
+        similarities = cosine_similarity([query_embedding], embeddings)[0]
+
+        top_k_indices = np.argsort(similarities)[-sample_size:]
+
+        return [group_list[i] for i in top_k_indices], cost
+
+    def _parallel_fold_and_merge(
+        self, key: Tuple, group_list: List[Dict]
+    ) -> Tuple[Optional[Dict], float]:
+        """
+        Perform parallel folding and merging on a group of items.
+
+        This method implements a strategy that combines parallel folding of input items
+        and merging of intermediate results to efficiently process large groups. It works as follows:
+        1. The input group is initially divided into smaller batches for efficient processing.
+        2. The method performs an initial round of folding operations on these batches.
+        3. After the first round of folds, a few merges are performed to estimate the merge runtime.
+        4. Based on the estimated merge runtime and observed fold runtime, it calculates the optimal number of parallel folds. Subsequent rounds of folding are then performed concurrently, with the number of parallel folds determined by the runtime estimates.
+        5. The folding process repeats in rounds, progressively reducing the number of items to be processed.
+        6. Once all folding operations are complete, the method recursively performs final merges on the fold results to combine them into a final result.
+        7. Throughout this process, the method may adjust the number of parallel folds based on updated performance metrics (i.e., fold and merge runtimes) to maintain efficiency.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            group_list (List[Dict]): The list of items in the group to be processed.
+
+        Returns:
+            Tuple[Optional[Dict], float]: A tuple containing the final merged result (or None if processing failed)
+            and the total cost of the operation.
+        """
+        fold_batch_size = self.config["fold_batch_size"]
+        merge_batch_size = self.config["merge_batch_size"]
+        total_cost = 0
+        prompts = []
+
+        def calculate_num_parallel_folds():
+            fold_time, fold_default = self.get_fold_time()
+            merge_time, merge_default = self.get_merge_time()
+            num_group_items = len(group_list)
+            return (
+                max(
+                    1,
+                    int(
+                        (fold_time * num_group_items * math.log(merge_batch_size))
+                        / (fold_batch_size * merge_time)
+                    ),
+                ),
+                fold_default or merge_default,
+            )
+
+        num_parallel_folds, used_default_times = calculate_num_parallel_folds()
+        fold_results = []
+        remaining_items = group_list
+
+        if self.config.get("persist_intermediates", False):
+            self.intermediates[key] = []
+            iter_count = 0
+
+        # Parallel folding and merging
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            while remaining_items:
+                # Folding phase
+                fold_futures = []
+                for i in range(min(num_parallel_folds, len(remaining_items))):
+                    batch = remaining_items[:fold_batch_size]
+                    remaining_items = remaining_items[fold_batch_size:]
+                    current_output = fold_results[i] if i < len(fold_results) else None
+                    fold_futures.append(
+                        executor.submit(
+                            self._increment_fold, key, batch, current_output
+                        )
+                    )
+
+                new_fold_results = []
+                for future in as_completed(fold_futures):
+                    result, prompt, cost = future.result()
+                    total_cost += cost
+                    prompts.append(prompt)
+                    if result is not None:
+                        new_fold_results.append(result)
+                        if self.config.get("persist_intermediates", False):
+                            self.intermediates[key].append(
+                                {
+                                    "iter": iter_count,
+                                    "intermediate": result,
+                                    "scratchpad": result["updated_scratchpad"],
+                                }
+                            )
+                            iter_count += 1
+
+                # Update fold_results with new results
+                fold_results = new_fold_results + fold_results[len(new_fold_results) :]
+
+                # Single pass merging phase
+                if (
+                    len(self.merge_times) < self.min_samples
+                    and len(fold_results) >= merge_batch_size
+                ):
+                    merge_futures = []
+                    for i in range(0, len(fold_results), merge_batch_size):
+                        batch = fold_results[i : i + merge_batch_size]
+                        merge_futures.append(
+                            executor.submit(self._merge_results, key, batch)
+                        )
+
+                    new_results = []
+                    for future in as_completed(merge_futures):
+                        result, prompt, cost = future.result()
+                        total_cost += cost
+                        prompts.append(prompt)
+                        if result is not None:
+                            new_results.append(result)
+                            if self.config.get("persist_intermediates", False):
+                                self.intermediates[key].append(
+                                    {
+                                        "iter": iter_count,
+                                        "intermediate": result,
+                                        "scratchpad": None,
+                                    }
+                                )
+                                iter_count += 1
+
+                    fold_results = new_results
+
+                # Recalculate num_parallel_folds if we used default times
+                if used_default_times:
+                    new_num_parallel_folds, used_default_times = (
+                        calculate_num_parallel_folds()
+                    )
+                    if not used_default_times:
+                        self.console.log(
+                            f"Recalculated num_parallel_folds from {num_parallel_folds} to {new_num_parallel_folds}"
+                        )
+                        num_parallel_folds = new_num_parallel_folds
+
+        # Final merging if needed
+        while len(fold_results) > 1:
+            self.console.log(f"Finished folding! Merging {len(fold_results)} items.")
+            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+                merge_futures = []
+                for i in range(0, len(fold_results), merge_batch_size):
+                    batch = fold_results[i : i + merge_batch_size]
+                    merge_futures.append(
+                        executor.submit(self._merge_results, key, batch)
+                    )
+
+                new_results = []
+                for future in as_completed(merge_futures):
+                    result, prompt, cost = future.result()
+                    total_cost += cost
+                    prompts.append(prompt)
+                    if result is not None:
+                        new_results.append(result)
+                        if self.config.get("persist_intermediates", False):
+                            self.intermediates[key].append(
+                                {
+                                    "iter": iter_count,
+                                    "intermediate": result,
+                                    "scratchpad": None,
+                                }
+                            )
+                            iter_count += 1
+
+                fold_results = new_results
+
+        return (
+            (fold_results[0], prompts, total_cost)
+            if fold_results
+            else (None, prompts, total_cost)
+        )
+
+    def _incremental_reduce(
+        self, key: Tuple, group_list: List[Dict]
+    ) -> Tuple[Optional[Dict], List[str], float]:
+        """
+        Perform an incremental reduce operation on a group of items.
+
+        This method processes the group in batches, incrementally folding the results.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            group_list (List[Dict]): The list of items in the group to be processed.
+
+        Returns:
+            Tuple[Optional[Dict], List[str], float]: A tuple containing the final reduced result (or None if processing failed),
+            the list of prompts used, and the total cost of the operation.
+        """
+        fold_batch_size = self.config["fold_batch_size"]
+        total_cost = 0
+        current_output = None
+        prompts = []
+
+        # Calculate and log the number of folds to be performed
+        num_folds = (len(group_list) + fold_batch_size - 1) // fold_batch_size
+
+        scratchpad = ""
+        if self.config.get("persist_intermediates", False):
+            self.intermediates[key] = []
+            iter_count = 0
+
+        for i in range(0, len(group_list), fold_batch_size):
+            # Log the current iteration and total number of folds
+            current_fold = i // fold_batch_size + 1
+            if self.config.get("verbose", False):
+                self.console.log(
+                    f"Processing fold {current_fold} of {num_folds} for group with key {key}"
+                )
+            batch = group_list[i : i + fold_batch_size]
+
+            folded_output, prompt, fold_cost = self._increment_fold(
+                key, batch, current_output, scratchpad
+            )
+            total_cost += fold_cost
+            prompts.append(prompt)
+
+            if folded_output is None:
+                continue
+
+            if self.config.get("persist_intermediates", False):
+                self.intermediates[key].append(
+                    {
+                        "iter": iter_count,
+                        "intermediate": folded_output,
+                        "scratchpad": folded_output["updated_scratchpad"],
+                    }
+                )
+                iter_count += 1
+
+            # Pop off updated_scratchpad
+            if "updated_scratchpad" in folded_output:
+                scratchpad = folded_output["updated_scratchpad"]
+                if self.config.get("verbose", False):
+                    self.console.log(
+                        f"Updated scratchpad for fold {current_fold}: {scratchpad}"
+                    )
+                del folded_output["updated_scratchpad"]
+
+            current_output = folded_output
+
+        return current_output, prompts, total_cost
+
+    def validation_fn(self, response: Dict[str, Any]):
+        output = self.runner.api.parse_llm_response(
+            response,
+            schema=self.config["output"]["schema"],
+        )[0]
+        if self.runner.api.validate_output(self.config, output, self.console):
+            return output, True
+        return output, False
+
+    def _increment_fold(
+        self,
+        key: Tuple,
+        batch: List[Dict],
+        current_output: Optional[Dict],
+        scratchpad: Optional[str] = None,
+    ) -> Tuple[Optional[Dict], str, float]:
+        """
+        Perform an incremental fold operation on a batch of items.
+
+        This method folds a batch of items into the current output using the fold prompt.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            batch (List[Dict]): The batch of items to be folded.
+            current_output (Optional[Dict]): The current accumulated output, if any.
+            scratchpad (Optional[str]): The scratchpad to use for the fold operation.
+        Returns:
+            Tuple[Optional[Dict], str, float]: A tuple containing the folded output (or None if processing failed),
+            the prompt used, and the cost of the fold operation.
+        """
+        if current_output is None:
+            return self._batch_reduce(key, batch, scratchpad)
+
+        start_time = time.time()
+        fold_prompt = strict_render(
+            self.config["fold_prompt"],
+            {
+                "inputs": batch,
+                "output": current_output,
+                "reduce_key": dict(zip(self.config["reduce_key"], key)),
+            },
+        )
+
+        response = self.runner.api.call_llm(
+            self.config.get("model", self.default_model),
+            "reduce",
+            [{"role": "user", "content": fold_prompt}],
+            self.config["output"]["schema"],
+            scratchpad=scratchpad,
+            timeout_seconds=self.config.get("timeout", 120),
+            max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
+            validation_config=(
+                {
+                    "num_retries": self.num_retries_on_validate_failure,
+                    "val_rule": self.config.get("validate", []),
+                    "validation_fn": self.validation_fn,
+                }
+                if self.config.get("validate", None)
+                else None
+            ),
+            bypass_cache=self.config.get("bypass_cache", False),
+            verbose=self.config.get("verbose", False),
+            litellm_completion_kwargs=self.config.get("litellm_completion_kwargs", {}),
+        )
+
+        end_time = time.time()
+        self._update_fold_time(end_time - start_time)
+
+        if response.validated:
+            folded_output = self.runner.api.parse_llm_response(
+                response.response,
+                schema=self.config["output"]["schema"],
+                manually_fix_errors=self.manually_fix_errors,
+            )[0]
+
+            folded_output.update(dict(zip(self.config["reduce_key"], key)))
+            fold_cost = response.total_cost
+
+            return folded_output, fold_prompt, fold_cost
+
+        return None, fold_prompt, fold_cost
+
+    def _merge_results(
+        self, key: Tuple, outputs: List[Dict]
+    ) -> Tuple[Optional[Dict], str, float]:
+        """
+        Merge multiple outputs into a single result.
+
+        This method merges a list of outputs using the merge prompt.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            outputs (List[Dict]): The list of outputs to be merged.
+
+        Returns:
+            Tuple[Optional[Dict], str, float]: A tuple containing the merged output (or None if processing failed),
+            the prompt used, and the cost of the merge operation.
+        """
+        start_time = time.time()
+        merge_prompt = strict_render(
+            self.config["merge_prompt"],
+            {
+                "outputs": outputs,
+                "reduce_key": dict(zip(self.config["reduce_key"], key)),
+            },
+        )
+        response = self.runner.api.call_llm(
+            self.config.get("model", self.default_model),
+            "merge",
+            [{"role": "user", "content": merge_prompt}],
+            self.config["output"]["schema"],
+            timeout_seconds=self.config.get("timeout", 120),
+            max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
+            validation_config=(
+                {
+                    "num_retries": self.num_retries_on_validate_failure,
+                    "val_rule": self.config.get("validate", []),
+                    "validation_fn": self.validation_fn,
+                }
+                if self.config.get("validate", None)
+                else None
+            ),
+            bypass_cache=self.config.get("bypass_cache", False),
+            verbose=self.config.get("verbose", False),
+            litellm_completion_kwargs=self.config.get("litellm_completion_kwargs", {}),
+        )
+
+        end_time = time.time()
+        self._update_merge_time(end_time - start_time)
+
+        if response.validated:
+            merged_output = self.runner.api.parse_llm_response(
+                response.response,
+                schema=self.config["output"]["schema"],
+                manually_fix_errors=self.manually_fix_errors,
+            )[0]
+            merged_output.update(dict(zip(self.config["reduce_key"], key)))
+            merge_cost = response.total_cost
+            return merged_output, merge_prompt, merge_cost
+
+        return None, merge_prompt, merge_cost
+
+    def get_fold_time(self) -> Tuple[float, bool]:
+        """
+        Get the average fold time or a default value.
+
+        Returns:
+            Tuple[float, bool]: A tuple containing the average fold time (or default) and a boolean
+            indicating whether the default value was used.
+        """
+        if "fold_time" in self.config:
+            return self.config["fold_time"], False
+        with self.lock:
+            if len(self.fold_times) >= self.min_samples:
+                return sum(self.fold_times) / len(self.fold_times), False
+        return 1.0, True  # Default to 1 second if no data is available
+
+    def get_merge_time(self) -> Tuple[float, bool]:
+        """
+        Get the average merge time or a default value.
+
+        Returns:
+            Tuple[float, bool]: A tuple containing the average merge time (or default) and a boolean
+            indicating whether the default value was used.
+        """
+        if "merge_time" in self.config:
+            return self.config["merge_time"], False
+        with self.lock:
+            if len(self.merge_times) >= self.min_samples:
+                return sum(self.merge_times) / len(self.merge_times), False
+        return 1.0, True  # Default to 1 second if no data is available
+
+    def _update_fold_time(self, time: float) -> None:
+        """
+        Update the fold time statistics.
+
+        Args:
+            time (float): The time taken for a fold operation.
+        """
+        with self.lock:
+            self.fold_times.append(time)
+
+    def _update_merge_time(self, time: float) -> None:
+        """
+        Update the merge time statistics.
+
+        Args:
+            time (float): The time taken for a merge operation.
+        """
+        with self.lock:
+            self.merge_times.append(time)
+
+    def _batch_reduce(
+        self, key: Tuple, group_list: List[Dict], scratchpad: Optional[str] = None
+    ) -> Tuple[Optional[Dict], str, float]:
+        """
+        Perform a batch reduce operation on a group of items.
+
+        This method reduces a group of items into a single output using the reduce prompt.
+
+        Args:
+            key (Tuple): The reduce key tuple for the group.
+            group_list (List[Dict]): The list of items to be reduced.
+            scratchpad (Optional[str]): The scratchpad to use for the reduce operation.
+        Returns:
+            Tuple[Optional[Dict], str, float]: A tuple containing the reduced output (or None if processing failed),
+            the prompt used, and the cost of the reduce operation.
+        """
+        prompt = strict_render(
+            self.config["prompt"],
+            {
+                "reduce_key": dict(zip(self.config["reduce_key"], key)),
+                "inputs": group_list,
+            },
+        )
+        item_cost = 0
+
+        response = self.runner.api.call_llm(
+            self.config.get("model", self.default_model),
+            "reduce",
+            [{"role": "user", "content": prompt}],
+            self.config["output"]["schema"],
+            scratchpad=scratchpad,
+            timeout_seconds=self.config.get("timeout", 120),
+            max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
+            bypass_cache=self.config.get("bypass_cache", False),
+            validation_config=(
+                {
+                    "num_retries": self.num_retries_on_validate_failure,
+                    "val_rule": self.config.get("validate", []),
+                    "validation_fn": self.validation_fn,
+                }
+                if self.config.get("validate", None)
+                else None
+            ),
+            gleaning_config=self.config.get("gleaning", None),
+            verbose=self.config.get("verbose", False),
+            litellm_completion_kwargs=self.config.get("litellm_completion_kwargs", {}),
+        )
+
+        item_cost += response.total_cost
+
+        if response.validated:
+            output = self.runner.api.parse_llm_response(
+                response.response,
+                schema=self.config["output"]["schema"],
+                manually_fix_errors=self.manually_fix_errors,
+            )[0]
+            output.update(dict(zip(self.config["reduce_key"], key)))
+
+            return output, prompt, item_cost
+        return None, prompt, item_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(*args, **kwargs) + +

+ + +
+ +

Initialize the ReduceOperation.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ *args + + +
+

Variable length argument list.

+
+
+ () +
+ **kwargs + + +
+

Arbitrary keyword arguments.

+
+
+ {} +
+ +
+ Source code in docetl/operations/reduce.py +
58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
def __init__(self, *args, **kwargs):
+    """
+    Initialize the ReduceOperation.
+
+    Args:
+        *args: Variable length argument list.
+        **kwargs: Arbitrary keyword arguments.
+    """
+    super().__init__(*args, **kwargs)
+    self.min_samples = 5
+    self.max_samples = 1000
+    self.fold_times = deque(maxlen=self.max_samples)
+    self.merge_times = deque(maxlen=self.max_samples)
+    self.lock = Lock()
+    self.config["reduce_key"] = (
+        [self.config["reduce_key"]]
+        if isinstance(self.config["reduce_key"], str)
+        else self.config["reduce_key"]
+    )
+    self.intermediates = {}
+    self.lineage_keys = self.config.get("output", {}).get("lineage", [])
+
+
+
+ +
+ +
+ + +

+ execute(input_data) + +

+ + +
+ +

Execute the reduce operation on the provided input data.

+

This method sorts and groups the input data by the reduce key(s), then processes each group +using either parallel fold and merge, incremental reduce, or batch reduce strategies.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ input_data + + List[Dict] + +
+

The input data to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.

+
+
+ +
+ Source code in docetl/operations/reduce.py +
297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
+    """
+    Execute the reduce operation on the provided input data.
+
+    This method sorts and groups the input data by the reduce key(s), then processes each group
+    using either parallel fold and merge, incremental reduce, or batch reduce strategies.
+
+    Args:
+        input_data (List[Dict]): The input data to process.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the processed results and the total cost of the operation.
+    """
+    if self.config.get("gleaning", {}).get("validation_prompt", None):
+        self.console.log(
+            f"Using gleaning with validation prompt: {self.config.get('gleaning', {}).get('validation_prompt', '')}"
+        )
+
+    reduce_keys = self.config["reduce_key"]
+    if isinstance(reduce_keys, str):
+        reduce_keys = [reduce_keys]
+    input_schema = self.config.get("input", {}).get("schema", {})
+
+    if self.status:
+        self.status.stop()
+
+    # Check if we need to group everything into one group
+    if reduce_keys == ["_all"] or reduce_keys == "_all":
+        grouped_data = [("_all", input_data)]
+    else:
+        # Group the input data by the reduce key(s) while maintaining original order
+        def get_group_key(item):
+            key_values = []
+            for key in reduce_keys:
+                value = item[key]
+                # Special handling for list-type values
+                if isinstance(value, list):
+                    key_values.append(
+                        tuple(sorted(value))
+                    )  # Convert list to sorted tuple
+                else:
+                    key_values.append(value)
+            return tuple(key_values)
+
+        grouped_data = {}
+        for item in input_data:
+            key = get_group_key(item)
+            if key not in grouped_data:
+                grouped_data[key] = []
+            grouped_data[key].append(item)
+
+        # Convert the grouped data to a list of tuples
+        grouped_data = list(grouped_data.items())
+
+    def process_group(
+        key: Tuple, group_elems: List[Dict]
+    ) -> Tuple[Optional[Dict], float]:
+        if input_schema:
+            group_list = [
+                {k: item[k] for k in input_schema.keys() if k in item}
+                for item in group_elems
+            ]
+        else:
+            group_list = group_elems
+
+        total_cost = 0.0
+
+        # Apply value sampling if enabled
+        value_sampling = self.config.get("value_sampling", {})
+        if value_sampling.get("enabled", False):
+            sample_size = min(value_sampling["sample_size"], len(group_list))
+            method = value_sampling["method"]
+
+            if method == "random":
+                group_sample = random.sample(group_list, sample_size)
+                group_sample.sort(key=lambda x: group_list.index(x))
+            elif method == "first_n":
+                group_sample = group_list[:sample_size]
+            elif method == "cluster":
+                group_sample, embedding_cost = self._cluster_based_sampling(
+                    group_list, value_sampling, sample_size
+                )
+                group_sample.sort(key=lambda x: group_list.index(x))
+                total_cost += embedding_cost
+            elif method == "sem_sim":
+                group_sample, embedding_cost = self._semantic_similarity_sampling(
+                    key, group_list, value_sampling, sample_size
+                )
+                group_sample.sort(key=lambda x: group_list.index(x))
+                total_cost += embedding_cost
+
+            group_list = group_sample
+
+        # Only execute merge-based plans if associative = True
+        if "merge_prompt" in self.config and self.config.get("associative", True):
+            result, prompts, cost = self._parallel_fold_and_merge(key, group_list)
+        elif self.config.get("fold_batch_size", None) and self.config.get(
+            "fold_batch_size"
+        ) >= len(group_list):
+            # If the fold batch size is greater than or equal to the number of items in the group,
+            # we can just run a single fold operation
+            result, prompt, cost = self._batch_reduce(key, group_list)
+            prompts = [prompt]
+        elif "fold_prompt" in self.config:
+            result, prompts, cost = self._incremental_reduce(key, group_list)
+        else:
+            result, prompt, cost = self._batch_reduce(key, group_list)
+            prompts = [prompt]
+
+        total_cost += cost
+
+        # Add the counts of items in the group to the result
+        result[f"_counts_prereduce_{self.config['name']}"] = len(group_elems)
+
+        if self.config.get("enable_observability", False):
+            # Add the _observability_{self.config['name']} key to the result
+            result[f"_observability_{self.config['name']}"] = {"prompts": prompts}
+
+        # Apply pass-through at the group level
+        if (
+            result is not None
+            and self.config.get("pass_through", False)
+            and group_elems
+        ):
+            for k, v in group_elems[0].items():
+                if k not in self.config["output"]["schema"] and k not in result:
+                    result[k] = v
+
+        # Add lineage information
+        if result is not None and self.lineage_keys:
+            lineage = []
+            for item in group_elems:
+                lineage_item = {
+                    k: item.get(k) for k in self.lineage_keys if k in item
+                }
+                if lineage_item:
+                    lineage.append(lineage_item)
+            result[f"{self.config['name']}_lineage"] = lineage
+
+        return result, total_cost
+
+    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+        futures = [
+            executor.submit(process_group, key, group)
+            for key, group in grouped_data
+        ]
+        results = []
+        total_cost = 0
+        for future in rich_as_completed(
+            futures,
+            total=len(futures),
+            desc=f"Processing {self.config['name']} (reduce) on all documents",
+            leave=True,
+            console=self.console,
+        ):
+            output, item_cost = future.result()
+            total_cost += item_cost
+            if output is not None:
+                results.append(output)
+
+    if self.config.get("persist_intermediates", False):
+        for result in results:
+            key = tuple(result[k] for k in self.config["reduce_key"])
+            if key in self.intermediates:
+                result[f"_{self.config['name']}_intermediates"] = (
+                    self.intermediates[key]
+                )
+
+    if self.status:
+        self.status.start()
+
+    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ get_fold_time() + +

+ + +
+ +

Get the average fold time or a default value.

+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ float + +
+

Tuple[float, bool]: A tuple containing the average fold time (or default) and a boolean

+
+
+ bool + +
+

indicating whether the default value was used.

+
+
+ +
+ Source code in docetl/operations/reduce.py +
909
+910
+911
+912
+913
+914
+915
+916
+917
+918
+919
+920
+921
+922
def get_fold_time(self) -> Tuple[float, bool]:
    """
    Get the fold time to plan with, and whether it is only a fallback.

    Resolution order:
    1. An explicit ``fold_time`` entry in ``self.config`` is returned as-is.
    2. Otherwise the mean of ``self.fold_times`` is returned once at least
       ``self.min_samples`` samples have been recorded.
    3. Otherwise a default of 1 second is returned.

    Returns:
        Tuple[float, bool]: The fold time, and a flag that is True only when
        the default value was used.
    """
    if "fold_time" in self.config:
        return self.config["fold_time"], False

    # The recorded samples are read while holding self.lock.
    with self.lock:
        sample_count = len(self.fold_times)
        # An empty list must never be averaged: with min_samples <= 0 the
        # threshold alone would let it through and divide by zero.
        if sample_count and sample_count >= self.min_samples:
            return sum(self.fold_times) / sample_count, False

    return 1.0, True  # Default to 1 second if no data is available
+
+
+
+ +
+ +
+ + +

+ get_merge_time() + +

+ + +
+ +

Get the average merge time or a default value.

+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ float + +
+

The configured merge time or the average of the recorded merge times, or the 1-second default when too few samples have been recorded.

+
+
+ bool + +
+

True if the default value was used, False otherwise.

+
+
+ +
+ Source code in docetl/operations/reduce.py +
924
+925
+926
+927
+928
+929
+930
+931
+932
+933
+934
+935
+936
+937
def get_merge_time(self) -> Tuple[float, bool]:
    """
    Get the merge time to plan with, and whether it is only a fallback.

    Resolution order:
    1. An explicit ``merge_time`` entry in ``self.config`` is returned as-is.
    2. Otherwise the mean of ``self.merge_times`` is returned once at least
       ``self.min_samples`` samples have been recorded.
    3. Otherwise a default of 1 second is returned.

    Returns:
        Tuple[float, bool]: The merge time, and a flag that is True only when
        the default value was used.
    """
    if "merge_time" in self.config:
        return self.config["merge_time"], False

    # The recorded samples are read while holding self.lock.
    with self.lock:
        sample_count = len(self.merge_times)
        # An empty list must never be averaged: with min_samples <= 0 the
        # threshold alone would let it through and divide by zero.
        if sample_count and sample_count >= self.min_samples:
            return sum(self.merge_times) / sample_count, False

    return 1.0, True  # Default to 1 second if no data is available
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Perform comprehensive syntax checks on the configuration of the ReduceOperation.

+

This method validates the presence and correctness of all required configuration keys, Jinja2 templates, and ensures the correct +structure and types of the entire configuration.

+

The method performs the following checks: +1. Verifies the presence of all required keys in the configuration. +2. Validates the structure and content of the 'output' configuration, including its 'schema'. +3. Checks if the main 'prompt' is a valid Jinja2 template and contains the required 'inputs' variable. +4. If 'merge_prompt' is specified, ensures that 'fold_prompt' is also present. +5. If 'fold_prompt' is present, verifies the existence of 'fold_batch_size'. +6. Validates the 'fold_prompt' as a Jinja2 template with required variables 'inputs' and 'output'. +7. If present, checks 'merge_prompt' as a valid Jinja2 template with required 'outputs' variable. +8. Verifies types of various configuration inputs (e.g., 'fold_batch_size' as int). +9. Checks for the presence and validity of optional configurations like 'model'.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If any required configuration is missing, if templates are invalid or missing required + variables, or if any other configuration aspect is incorrect or inconsistent.

+
+
+ TypeError + +
+

If any configuration value has an incorrect type, such as 'schema' not being a dict + or 'fold_batch_size' not being an integer.

+
+
+ +
+ Source code in docetl/operations/reduce.py +
 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
def syntax_check(self) -> None:
    """
    Validate the configuration of the ReduceOperation.

    The checks run in the order below and stop at the first failure:
    1. 'reduce_key', 'prompt' and 'output' are present.
    2. 'output' contains a non-empty dictionary 'schema'.
    3. 'prompt' parses as a Jinja2 template and references 'inputs'.
    4. 'merge_prompt' is only given together with 'fold_prompt'.
    5. 'fold_prompt' comes with 'fold_batch_size', parses as a Jinja2 template
       and references both 'inputs' and 'output'.
    6. 'merge_prompt' comes with 'merge_batch_size', parses as a Jinja2
       template and references 'outputs'.
    7. 'model', if given, is a string.
    8. 'reduce_key' is a string or a list of strings.
    9. 'input', if given, contains a dictionary 'schema'.
    10. 'fold_batch_size' and 'merge_batch_size', if given, are positive integers.
    11. 'value_sampling', if given, is a dictionary with a boolean 'enabled';
        when enabled it also needs a positive integer 'sample_size' and a
        supported 'method'.
    12. 'lineage' inside 'output', if given, is a list of strings.
    13. Finally, self.gleaning_check() is called.

    Raises:
        ValueError: If a required key is missing, a template cannot be parsed or
                    lacks a required variable, a batch size or sample size is not
                    a positive integer, or the sampling method is not supported.
        TypeError: If 'schema', 'model', 'reduce_key', 'value_sampling',
                   'enabled' or 'lineage' has the wrong type.
    """

    def template_variable_names(field: str) -> set:
        """Parse self.config[field] as a Jinja2 template and return the names it references."""
        try:
            template = Template(self.config[field])
            name_nodes = template.environment.parse(self.config[field]).find_all(
                jinja2.nodes.Name
            )
            return {node.name for node in name_nodes}
        except Exception as e:
            # Only genuine parse failures are reported as an invalid template;
            # the original error is chained for debugging.
            raise ValueError(
                f"Invalid Jinja2 template in {self.config['name']} '{field}': {str(e)}"
            ) from e

    for key in ("reduce_key", "prompt", "output"):
        if key not in self.config:
            raise ValueError(
                f"Missing required key '{key}' in {self.config['name']} configuration"
            )

    if "schema" not in self.config["output"]:
        raise ValueError(
            f"Missing 'schema' in {self.config['name']} 'output' configuration"
        )

    if not isinstance(self.config["output"]["schema"], dict):
        raise TypeError(
            f"'schema' in {self.config['name']} 'output' configuration must be a dictionary"
        )

    if not self.config["output"]["schema"]:
        raise ValueError(
            f"'schema' in {self.config['name']} 'output' configuration cannot be empty"
        )

    # The variable checks live outside the parsing try-block so that a missing
    # variable is not misreported as an "Invalid Jinja2 template" error.
    if "inputs" not in template_variable_names("prompt"):
        raise ValueError(
            f"Prompt template for {self.config['name']} must include the 'inputs' variable"
        )

    # A merge step is only meaningful on top of a fold step.
    if "merge_prompt" in self.config and "fold_prompt" not in self.config:
        raise ValueError(
            f"'fold_prompt' is required when 'merge_prompt' is specified in {self.config['name']}"
        )

    if "fold_prompt" in self.config:
        if "fold_batch_size" not in self.config:
            raise ValueError(
                f"'fold_batch_size' is required when 'fold_prompt' is specified in {self.config['name']}"
            )

        fold_template_var_names = template_variable_names("fold_prompt")
        required_vars = {"inputs", "output"}
        if not required_vars.issubset(fold_template_var_names):
            raise ValueError(
                f"Fold template in {self.config['name']} must include variables: {required_vars}. Current template includes: {fold_template_var_names}"
            )

    if "merge_prompt" in self.config:
        if "merge_batch_size" not in self.config:
            raise ValueError(
                f"'merge_batch_size' is required when 'merge_prompt' is specified in {self.config['name']}"
            )

        if "outputs" not in template_variable_names("merge_prompt"):
            raise ValueError(
                f"Merge template in {self.config['name']} must include the 'outputs' variable"
            )

    # Check if the model is specified (optional)
    if "model" in self.config and not isinstance(self.config["model"], str):
        raise TypeError(
            f"'model' in {self.config['name']} configuration must be a string"
        )

    # Check if reduce_key is a string or a list of strings
    if not isinstance(self.config["reduce_key"], (str, list)):
        raise TypeError(
            f"'reduce_key' in {self.config['name']} configuration must be a string or a list of strings"
        )
    if isinstance(self.config["reduce_key"], list):
        if not all(isinstance(key, str) for key in self.config["reduce_key"]):
            raise TypeError(
                f"All elements in 'reduce_key' list in {self.config['name']} configuration must be strings"
            )

    # Check if input schema is provided and valid (optional)
    if "input" in self.config:
        if "schema" not in self.config["input"]:
            raise ValueError(
                f"Missing 'schema' in {self.config['name']} 'input' configuration"
            )
        if not isinstance(self.config["input"]["schema"], dict):
            raise TypeError(
                f"'schema' in {self.config['name']} 'input' configuration must be a dictionary"
            )

    # Check if fold_batch_size and merge_batch_size are positive integers
    for key in ("fold_batch_size", "merge_batch_size"):
        if key in self.config:
            if not isinstance(self.config[key], int) or self.config[key] <= 0:
                raise ValueError(
                    f"'{key}' in {self.config['name']} configuration must be a positive integer"
                )

    if "value_sampling" in self.config:
        sampling = self.config["value_sampling"]
        if not isinstance(sampling, dict):
            raise TypeError(
                f"'value_sampling' in {self.config['name']} configuration must be a dictionary"
            )

        if "enabled" not in sampling:
            raise ValueError(
                f"'enabled' is required in {self.config['name']} 'value_sampling' configuration"
            )
        if not isinstance(sampling["enabled"], bool):
            raise TypeError(
                f"'enabled' in {self.config['name']} 'value_sampling' configuration must be a boolean"
            )

        if sampling["enabled"]:
            if "sample_size" not in sampling:
                raise ValueError(
                    f"'sample_size' is required when value_sampling is enabled in {self.config['name']}"
                )
            if (
                not isinstance(sampling["sample_size"], int)
                or sampling["sample_size"] <= 0
            ):
                raise ValueError(
                    f"'sample_size' in {self.config['name']} configuration must be a positive integer"
                )

            if "method" not in sampling:
                raise ValueError(
                    f"'method' is required when value_sampling is enabled in {self.config['name']}"
                )
            valid_methods = ("random", "first_n", "cluster", "sem_sim")
            if sampling["method"] not in valid_methods:
                # The message names exactly the values accepted above.
                raise ValueError(
                    f"Invalid 'method'. Must be 'random', 'first_n', 'cluster', or 'sem_sim' in {self.config['name']}"
                )

            # NOTE(review): "embedding" is not an accepted method, so this
            # branch cannot currently run. Presumably these requirements are
            # meant for the embedding-based methods ('cluster' / 'sem_sim');
            # confirm against the sampling implementation before widening it.
            if sampling["method"] == "embedding":
                if "embedding_model" not in sampling:
                    raise ValueError(
                        f"'embedding_model' is required when using embedding-based sampling in {self.config['name']}"
                    )
                if "embedding_keys" not in sampling:
                    raise ValueError(
                        f"'embedding_keys' is required when using embedding-based sampling in {self.config['name']}"
                    )

    # Check if lineage is a list of strings
    if "lineage" in self.config.get("output", {}):
        if not isinstance(self.config["output"]["lineage"], list):
            raise TypeError(
                f"'lineage' in {self.config['name']} 'output' configuration must be a list"
            )
        if not all(
            isinstance(key, str) for key in self.config["output"]["lineage"]
        ):
            raise TypeError(
                f"All elements in 'lineage' list in {self.config['name']} 'output' configuration must be strings"
            )

    self.gleaning_check()
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.map.ParallelMapOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + + + + + + +
+ Source code in docetl/operations/map.py +
357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
class ParallelMapOperation(BaseOperation):
    """
    Operation that runs several prompts against every input item and merges
    the outputs of all prompts into a single result per item.

    It can also be configured with only 'drop_keys', in which case it just
    removes those keys from each item without calling a model.
    """

    class schema(BaseOperation.schema):
        type: str = "parallel_map"
        # One entry per prompt; syntax_check requires 'prompt' and 'output_keys' in each.
        prompts: List[Dict[str, Any]]
        output: Dict[str, Any]
        # When enabled, execute() stores the rendered prompts on each result item.
        enable_observability: bool = False

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

    def syntax_check(self) -> None:
        """
        Checks the configuration of the ParallelMapOperation for required keys and valid structure.

        Either 'drop_keys' or 'prompts' must be configured. When 'prompts' is
        given, every prompt entry is validated and every key of the output
        schema must be produced by at least one prompt.

        Raises:
            ValueError: If required keys are missing or if the configuration structure is invalid.
            TypeError: If the configuration values have incorrect types.
        """
        if "drop_keys" in self.config:
            if not isinstance(self.config["drop_keys"], list):
                raise TypeError(
                    "'drop_keys' in configuration must be a list of strings"
                )
            if not all(isinstance(key, str) for key in self.config["drop_keys"]):
                raise TypeError("All items in 'drop_keys' must be strings")
        elif "prompts" not in self.config:
            raise ValueError(
                "If 'drop_keys' is not specified, 'prompts' must be present in the configuration"
            )

        if "prompts" not in self.config:
            return

        prompts = self.config["prompts"]
        if not isinstance(prompts, list):
            raise ValueError(
                "ParallelMapOperation requires a 'prompts' list in the configuration"
            )

        if not prompts:
            raise ValueError("The 'prompts' list cannot be empty")

        for i, prompt_config in enumerate(prompts):
            if not isinstance(prompt_config, dict):
                raise TypeError(f"Prompt configuration {i} must be a dictionary")

            for key in ("prompt", "output_keys"):
                if key not in prompt_config:
                    raise ValueError(
                        f"Missing required key '{key}' in prompt configuration {i}"
                    )
            if not isinstance(prompt_config["prompt"], str):
                raise TypeError(
                    f"'prompt' in prompt configuration {i} must be a string"
                )

            if not isinstance(prompt_config["output_keys"], list):
                raise TypeError(
                    f"'output_keys' in prompt configuration {i} must be a list"
                )

            if not prompt_config["output_keys"]:
                raise ValueError(
                    f"'output_keys' list in prompt configuration {i} cannot be empty"
                )

            # Check if the prompt is a valid Jinja2 template
            try:
                Template(prompt_config["prompt"])
            except Exception as e:
                raise ValueError(
                    f"Invalid Jinja2 template in prompt configuration {i}: {str(e)}"
                ) from e

            # Check if the model is specified (optional)
            if "model" in prompt_config and not isinstance(
                prompt_config["model"], str
            ):
                raise TypeError(
                    f"'model' in prompt configuration {i} must be a string"
                )

        # Guard the schema lookup so a missing or malformed 'output' section is
        # reported as a configuration error instead of a bare KeyError or
        # AttributeError.
        output_config = self.config.get("output")
        if not isinstance(output_config, dict) or "schema" not in output_config:
            raise ValueError(
                "Missing 'schema' in 'output' configuration; it is required when 'prompts' is specified"
            )
        output_schema = output_config["schema"]
        if not isinstance(output_schema, dict):
            raise TypeError("'schema' in 'output' configuration must be a dictionary")

        # Check if all output schema keys are covered by the prompts
        output_keys_covered = {
            key for prompt_config in prompts for key in prompt_config["output_keys"]
        }
        missing_keys = set(output_schema.keys()) - output_keys_covered
        if missing_keys:
            raise ValueError(
                f"The following output schema keys are not covered by any prompt: {missing_keys}"
            )

    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
        """
        Executes the parallel map operation on the provided input data.

        Args:
            input_data (List[Dict]): The input data to process.

        Returns:
            Tuple[List[Dict], float]: The processed items, in input order, and the
            summed cost of all LLM calls.

        This method performs the following steps:
        1. If only 'drop_keys' is configured (no 'prompts'), it returns copies of
           the items without those keys, at zero cost.
        2. Otherwise it submits one task per (item, prompt) pair to a thread pool.
        3. It merges the outputs of all prompts into a copy of the input item;
           on a key collision the later prompt's value wins.
        4. If 'drop_keys' is specified, it drops those keys from each result.
        5. The status display is stopped while the work runs and restarted
           afterwards, even if a prompt fails.
        """
        results = {}
        total_cost = 0
        output_schema = self.config.get("output", {}).get("schema", {})

        # If only drop_keys is specified, simply drop the keys and return
        if "prompts" not in self.config and "drop_keys" in self.config:
            keys_to_drop = self.config["drop_keys"]
            dropped_results = [
                {k: v for k, v in item.items() if k not in keys_to_drop}
                for item in input_data
            ]
            return dropped_results, 0.0  # Return the modified data with no cost

        def process_prompt(item, prompt_config):
            """Render one prompt for one item, call the model and parse its output."""
            prompt = strict_render(prompt_config["prompt"], {"input": item})
            # Each prompt is only asked for the schema keys it is responsible for.
            local_output_schema = {
                key: output_schema[key] for key in prompt_config["output_keys"]
            }
            # A missing or empty per-prompt model falls back to the default model.
            model = prompt_config.get("model") or self.default_model

            # If there are tools, we need to pass in the tools
            response = self.runner.api.call_llm(
                model,
                "parallel_map",
                [{"role": "user", "content": prompt}],
                local_output_schema,
                tools=prompt_config.get("tools", None),
                timeout_seconds=self.config.get("timeout", 120),
                max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
                bypass_cache=self.config.get("bypass_cache", False),
                litellm_completion_kwargs=self.config.get(
                    "litellm_completion_kwargs", {}
                ),
            )
            output = self.runner.api.parse_llm_response(
                response.response,
                schema=local_output_schema,
                tools=prompt_config.get("tools", None),
                manually_fix_errors=self.manually_fix_errors,
            )[0]
            return output, prompt, response.total_cost

        if self.status:
            self.status.stop()

        try:
            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
                if "prompts" in self.config:
                    prompts = self.config["prompts"]
                    # Create all futures at once, item-major: the future at
                    # position i belongs to item i // len(prompts).
                    all_futures = [
                        executor.submit(process_prompt, item, prompt_config)
                        for item in input_data
                        for prompt_config in prompts
                    ]

                    # Consume in submission order so prompt 0 of an item is
                    # always seen first and can create that item's result.
                    for i, future in tqdm(
                        enumerate(all_futures),
                        total=len(all_futures),
                        desc="Processing parallel map items",
                    ):
                        output, prompt, cost = future.result()
                        total_cost += cost

                        item_index, prompt_index = divmod(i, len(prompts))
                        if prompt_index == 0:
                            results[item_index] = input_data[item_index].copy()
                        item_result = results[item_index]

                        if self.config.get("enable_observability", False):
                            observability_key = f"_observability_{self.config['name']}"
                            item_result.setdefault(observability_key, {}).update(
                                {f"prompt_{prompt_index}": prompt}
                            )

                        # Update the item_result with the output
                        item_result.update(output)
                else:
                    results = {i: item.copy() for i, item in enumerate(input_data)}

            # Apply drop_keys if specified
            if "drop_keys" in self.config:
                for item in results.values():
                    for key in self.config["drop_keys"]:
                        item.pop(key, None)
        finally:
            # Restart the status display even when a prompt raised; otherwise
            # it would stay stopped for the rest of the run.
            if self.status:
                self.status.start()

        # Return the results in order
        return [results[i] for i in range(len(input_data)) if i in results], total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data) + +

+ + +
+ +

Executes the parallel map operation on the provided input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ input_data + + List[Dict] + +
+

The input data to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

A tuple containing the processed results and the total cost of the operation.

+
+
+

This method performs the following steps: +1. If prompts are specified, it processes each input item using multiple prompts in parallel +2. Aggregates results from different prompts for each input item +3. Validates the combined output for each item +4. If drop_keys is specified, it drops the specified keys from each document +5. Calculates total cost of the operation

+ +
+ Source code in docetl/operations/map.py +
454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
+526
+527
+528
+529
+530
+531
+532
+533
+534
+535
+536
+537
+538
+539
+540
+541
+542
+543
+544
+545
+546
+547
+548
+549
+550
+551
+552
+553
+554
+555
+556
+557
+558
+559
+560
+561
+562
+563
+564
+565
+566
+567
+568
+569
+570
+571
+572
+573
+574
+575
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
    """
    Executes the parallel map operation on the provided input data.

    Args:
        input_data (List[Dict]): The input data to process.

    Returns:
        Tuple[List[Dict], float]: The processed items, in input order, and the
        summed cost of all LLM calls.

    This method performs the following steps:
    1. If only 'drop_keys' is configured (no 'prompts'), it returns copies of
       the items without those keys, at zero cost.
    2. Otherwise it submits one task per (item, prompt) pair to a thread pool.
    3. It merges the outputs of all prompts into a copy of the input item;
       on a key collision the later prompt's value wins.
    4. If 'drop_keys' is specified, it drops those keys from each result.
    5. The status display is stopped while the work runs and restarted
       afterwards, even if a prompt fails.
    """
    results = {}
    total_cost = 0
    output_schema = self.config.get("output", {}).get("schema", {})

    # If only drop_keys is specified, simply drop the keys and return
    if "prompts" not in self.config and "drop_keys" in self.config:
        keys_to_drop = self.config["drop_keys"]
        dropped_results = [
            {k: v for k, v in item.items() if k not in keys_to_drop}
            for item in input_data
        ]
        return dropped_results, 0.0  # Return the modified data with no cost

    def process_prompt(item, prompt_config):
        """Render one prompt for one item, call the model and parse its output."""
        prompt = strict_render(prompt_config["prompt"], {"input": item})
        # Each prompt is only asked for the schema keys it is responsible for.
        local_output_schema = {
            key: output_schema[key] for key in prompt_config["output_keys"]
        }
        # A missing or empty per-prompt model falls back to the default model.
        model = prompt_config.get("model") or self.default_model

        # If there are tools, we need to pass in the tools
        response = self.runner.api.call_llm(
            model,
            "parallel_map",
            [{"role": "user", "content": prompt}],
            local_output_schema,
            tools=prompt_config.get("tools", None),
            timeout_seconds=self.config.get("timeout", 120),
            max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
            bypass_cache=self.config.get("bypass_cache", False),
            litellm_completion_kwargs=self.config.get(
                "litellm_completion_kwargs", {}
            ),
        )
        output = self.runner.api.parse_llm_response(
            response.response,
            schema=local_output_schema,
            tools=prompt_config.get("tools", None),
            manually_fix_errors=self.manually_fix_errors,
        )[0]
        return output, prompt, response.total_cost

    if self.status:
        self.status.stop()

    try:
        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            if "prompts" in self.config:
                prompts = self.config["prompts"]
                # Create all futures at once, item-major: the future at
                # position i belongs to item i // len(prompts).
                all_futures = [
                    executor.submit(process_prompt, item, prompt_config)
                    for item in input_data
                    for prompt_config in prompts
                ]

                # Consume in submission order so prompt 0 of an item is
                # always seen first and can create that item's result.
                for i, future in tqdm(
                    enumerate(all_futures),
                    total=len(all_futures),
                    desc="Processing parallel map items",
                ):
                    output, prompt, cost = future.result()
                    total_cost += cost

                    item_index, prompt_index = divmod(i, len(prompts))
                    if prompt_index == 0:
                        results[item_index] = input_data[item_index].copy()
                    item_result = results[item_index]

                    if self.config.get("enable_observability", False):
                        observability_key = f"_observability_{self.config['name']}"
                        item_result.setdefault(observability_key, {}).update(
                            {f"prompt_{prompt_index}": prompt}
                        )

                    # Update the item_result with the output
                    item_result.update(output)
            else:
                results = {i: item.copy() for i, item in enumerate(input_data)}

        # Apply drop_keys if specified
        if "drop_keys" in self.config:
            for item in results.values():
                for key in self.config["drop_keys"]:
                    item.pop(key, None)
    finally:
        # Restart the status display even when a prompt raised; otherwise
        # it would stay stopped for the rest of the run.
        if self.status:
            self.status.start()

    # Return the results in order
    return [results[i] for i in range(len(input_data)) if i in results], total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the ParallelMapOperation for required keys and valid structure.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or if the configuration structure is invalid.

+
+
+ TypeError + +
+

If the configuration values have incorrect types.

+
+
+ +
+ Source code in docetl/operations/map.py +
371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
def syntax_check(self) -> None:
+    """
+    Checks the configuration of the ParallelMapOperation for required keys and valid structure.
+
+    Raises:
+        ValueError: If required keys are missing or if the configuration structure is invalid.
+        TypeError: If the configuration values have incorrect types.
+    """
+    if "drop_keys" in self.config:
+        if not isinstance(self.config["drop_keys"], list):
+            raise TypeError(
+                "'drop_keys' in configuration must be a list of strings"
+            )
+        for key in self.config["drop_keys"]:
+            if not isinstance(key, str):
+                raise TypeError("All items in 'drop_keys' must be strings")
+    elif "prompts" not in self.config:
+        raise ValueError(
+            "If 'drop_keys' is not specified, 'prompts' must be present in the configuration"
+        )
+
+    if "prompts" in self.config:
+        if not isinstance(self.config["prompts"], list):
+            raise ValueError(
+                "ParallelMapOperation requires a 'prompts' list in the configuration"
+            )
+
+        if not self.config["prompts"]:
+            raise ValueError("The 'prompts' list cannot be empty")
+
+        for i, prompt_config in enumerate(self.config["prompts"]):
+            if not isinstance(prompt_config, dict):
+                raise TypeError(f"Prompt configuration {i} must be a dictionary")
+
+            required_keys = ["prompt", "output_keys"]
+            for key in required_keys:
+                if key not in prompt_config:
+                    raise ValueError(
+                        f"Missing required key '{key}' in prompt configuration {i}"
+                    )
+            if not isinstance(prompt_config["prompt"], str):
+                raise TypeError(
+                    f"'prompt' in prompt configuration {i} must be a string"
+                )
+
+            if not isinstance(prompt_config["output_keys"], list):
+                raise TypeError(
+                    f"'output_keys' in prompt configuration {i} must be a list"
+                )
+
+            if not prompt_config["output_keys"]:
+                raise ValueError(
+                    f"'output_keys' list in prompt configuration {i} cannot be empty"
+                )
+
+            # Check if the prompt is a valid Jinja2 template
+            try:
+                Template(prompt_config["prompt"])
+            except Exception as e:
+                raise ValueError(
+                    f"Invalid Jinja2 template in prompt configuration {i}: {str(e)}"
+                ) from e
+
+            # Check if the model is specified (optional)
+            if "model" in prompt_config and not isinstance(
+                prompt_config["model"], str
+            ):
+                raise TypeError(
+                    f"'model' in prompt configuration {i} must be a string"
+                )
+
+        # Check if all output schema keys are covered by the prompts
+        output_schema = self.config["output"]["schema"]
+        output_keys_covered = set()
+        for prompt_config in self.config["prompts"]:
+            output_keys_covered.update(prompt_config["output_keys"])
+
+        missing_keys = set(output_schema.keys()) - output_keys_covered
+        if missing_keys:
+            raise ValueError(
+                f"The following output schema keys are not covered by any prompt: {missing_keys}"
+            )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.filter.FilterOperation + + +

+ + +
+

+ Bases: MapOperation

+ + + + + + + +
+ Source code in docetl/operations/filter.py +
  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
class FilterOperation(MapOperation):
+    class schema(MapOperation.schema):
+        type: str = "filter"
+
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the FilterOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing or if the output schema structure is invalid.
+            TypeError: If the schema in the output configuration is not a dictionary or if the schema value is not of type bool.
+
+        This method checks for the following:
+        - Presence of required keys: 'prompt' and 'output'
+        - Presence of 'schema' in the 'output' configuration
+        - The 'schema' is a non-empty dictionary with exactly one key-value pair
+        - The value in the schema is of type bool
+        """
+        required_keys = ["prompt", "output"]
+        for key in required_keys:
+            if key not in self.config:
+                raise ValueError(
+                    f"Missing required key '{key}' in FilterOperation configuration"
+                )
+
+        if "schema" not in self.config["output"]:
+            raise ValueError("Missing 'schema' in 'output' configuration")
+
+        if not isinstance(self.config["output"]["schema"], dict):
+            raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+        if not self.config["output"]["schema"]:
+            raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+        schema = self.config["output"]["schema"]
+        if "_short_explanation" in schema:
+            schema = {k: v for k, v in schema.items() if k != "_short_explanation"}
+        if len(schema) != 1:
+            raise ValueError(
+                "The 'schema' in 'output' configuration must have exactly one key-value pair that maps to a boolean value"
+            )
+
+        key, value = next(iter(schema.items()))
+        if value not in ["bool", "boolean"]:
+            raise TypeError(
+                f"The value in the 'schema' must be of type bool, got {value}"
+            )
+
+    def execute(
+        self, input_data: List[Dict], is_build: bool = False
+    ) -> Tuple[List[Dict], float]:
+        """
+        Executes the filter operation on the input data.
+
+        Args:
+            input_data (List[Dict]): A list of dictionaries to process.
+            is_build (bool): Whether the operation is being executed in the build phase. Defaults to False.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the filtered list of dictionaries
+            and the total cost of the operation.
+
+        This method performs the following steps:
+        1. Processes each input item using an LLM model
+        2. Validates the output
+        3. Filters the results based on the specified filter key
+        4. Calculates the total cost of the operation
+
+        The method uses multi-threading to process items in parallel, improving performance
+        for large datasets.
+
+        Usage:
+        ```python
+        from docetl.operations import FilterOperation
+
+        config = {
+            "prompt": "Determine if the following item is important: {{input}}",
+            "output": {
+                "schema": {"is_important": "bool"}
+            },
+            "model": "gpt-3.5-turbo"
+        }
+        filter_op = FilterOperation(config)
+        input_data = [
+            {"id": 1, "text": "Critical update"},
+            {"id": 2, "text": "Regular maintenance"}
+        ]
+        results, cost = filter_op.execute(input_data)
+        print(f"Filtered results: {results}")
+        print(f"Total cost: {cost}")
+        ```
+        """
+        filter_key = next(
+            iter(
+                [
+                    k
+                    for k in self.config["output"]["schema"].keys()
+                    if k != "_short_explanation"
+                ]
+            )
+        )
+
+        results, total_cost = super().execute(input_data)
+
+        # Drop records with filter_key values that are False
+        if not is_build:
+            results = [result for result in results if result[filter_key]]
+
+        # Drop the filter_key from the results
+        for result in results:
+            result.pop(filter_key, None)
+
+        return results, total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data, is_build=False) + +

+ + +
+ +

Executes the filter operation on the input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ input_data + + List[Dict] + +
+

A list of dictionaries to process.

+
+
+ required +
+ is_build + + bool + +
+

Whether the operation is being executed in the build phase. Defaults to False.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ List[Dict] + +
+

The filtered list of dictionaries, with the filter key removed from each item (first element of the returned tuple).

+
+
+ float + +
+

The total cost of the operation (second element of the returned tuple).

+
+
+

This method performs the following steps: (1) processes each input item using an LLM model; (2) validates the output; (3) filters the results based on the specified filter key; (4) calculates the total cost of the operation.

+

The method uses multi-threading to process items in parallel, improving performance +for large datasets.

+

Usage: +

from docetl.operations import FilterOperation
+
+config = {
+    "prompt": "Determine if the following item is important: {{input}}",
+    "output": {
+        "schema": {"is_important": "bool"}
+    },
+    "model": "gpt-3.5-turbo"
+}
+filter_op = FilterOperation(config)
+input_data = [
+    {"id": 1, "text": "Critical update"},
+    {"id": 2, "text": "Regular maintenance"}
+]
+results, cost = filter_op.execute(input_data)
+print(f"Filtered results: {results}")
+print(f"Total cost: {cost}")
+

+ +
+ Source code in docetl/operations/filter.py +
 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
def execute(
+    self, input_data: List[Dict], is_build: bool = False
+) -> Tuple[List[Dict], float]:
+    """
+    Executes the filter operation on the input data.
+
+    Args:
+        input_data (List[Dict]): A list of dictionaries to process.
+        is_build (bool): Whether the operation is being executed in the build phase. Defaults to False.
+
+    Returns:
+        Tuple[List[Dict], float]: A tuple containing the filtered list of dictionaries
+        and the total cost of the operation.
+
+    This method performs the following steps:
+    1. Processes each input item using an LLM model
+    2. Validates the output
+    3. Filters the results based on the specified filter key
+    4. Calculates the total cost of the operation
+
+    The method uses multi-threading to process items in parallel, improving performance
+    for large datasets.
+
+    Usage:
+    ```python
+    from docetl.operations import FilterOperation
+
+    config = {
+        "prompt": "Determine if the following item is important: {{input}}",
+        "output": {
+            "schema": {"is_important": "bool"}
+        },
+        "model": "gpt-3.5-turbo"
+    }
+    filter_op = FilterOperation(config)
+    input_data = [
+        {"id": 1, "text": "Critical update"},
+        {"id": 2, "text": "Regular maintenance"}
+    ]
+    results, cost = filter_op.execute(input_data)
+    print(f"Filtered results: {results}")
+    print(f"Total cost: {cost}")
+    ```
+    """
+    filter_key = next(
+        iter(
+            [
+                k
+                for k in self.config["output"]["schema"].keys()
+                if k != "_short_explanation"
+            ]
+        )
+    )
+
+    results, total_cost = super().execute(input_data)
+
+    # Drop records with filter_key values that are False
+    if not is_build:
+        results = [result for result in results if result[filter_key]]
+
+    # Drop the filter_key from the results
+    for result in results:
+        result.pop(filter_key, None)
+
+    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the FilterOperation for required keys and valid structure.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or if the output schema structure is invalid.

+
+
+ TypeError + +
+

If the schema in the output configuration is not a dictionary, or if the schema value is not the type name "bool" or "boolean".

+
+
+

This method checks for the following: presence of the required keys 'prompt' and 'output'; presence of 'schema' in the 'output' configuration; that 'schema' is a non-empty dictionary with exactly one key-value pair (an optional '_short_explanation' key is not counted); and that the value of that pair is the type name "bool" or "boolean".

+ +
+ Source code in docetl/operations/filter.py +
12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
def syntax_check(self) -> None:
+    """
+    Checks the configuration of the FilterOperation for required keys and valid structure.
+
+    Raises:
+        ValueError: If required keys are missing or if the output schema structure is invalid.
+        TypeError: If the schema in the output configuration is not a dictionary or if the schema value is not of type bool.
+
+    This method checks for the following:
+    - Presence of required keys: 'prompt' and 'output'
+    - Presence of 'schema' in the 'output' configuration
+    - The 'schema' is a non-empty dictionary with exactly one key-value pair
+    - The value in the schema is of type bool
+    """
+    required_keys = ["prompt", "output"]
+    for key in required_keys:
+        if key not in self.config:
+            raise ValueError(
+                f"Missing required key '{key}' in FilterOperation configuration"
+            )
+
+    if "schema" not in self.config["output"]:
+        raise ValueError("Missing 'schema' in 'output' configuration")
+
+    if not isinstance(self.config["output"]["schema"], dict):
+        raise TypeError("'schema' in 'output' configuration must be a dictionary")
+
+    if not self.config["output"]["schema"]:
+        raise ValueError("'schema' in 'output' configuration cannot be empty")
+
+    schema = self.config["output"]["schema"]
+    if "_short_explanation" in schema:
+        schema = {k: v for k, v in schema.items() if k != "_short_explanation"}
+    if len(schema) != 1:
+        raise ValueError(
+            "The 'schema' in 'output' configuration must have exactly one key-value pair that maps to a boolean value"
+        )
+
+    key, value = next(iter(schema.items()))
+    if value not in ["bool", "boolean"]:
+        raise TypeError(
+            f"The value in the 'schema' must be of type bool, got {value}"
+        )
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.equijoin.EquijoinOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + + + + + + +
+ Source code in docetl/operations/equijoin.py +
 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
class EquijoinOperation(BaseOperation):
+    class schema(BaseOperation.schema):
+        type: str = "equijoin"
+        left: str
+        right: str
+        comparison_prompt: str
+        output: Optional[Dict[str, Any]] = None
+        blocking_threshold: Optional[float] = None
+        blocking_conditions: Optional[Dict[str, List[str]]] = None
+        limits: Optional[Dict[str, int]] = None
+        comparison_model: Optional[str] = None
+        optimize: Optional[bool] = None
+        embedding_model: Optional[str] = None
+        embedding_batch_size: Optional[int] = None
+        compare_batch_size: Optional[int] = None
+        limit_comparisons: Optional[int] = None
+        blocking_keys: Optional[Dict[str, List[str]]] = None
+        timeout: Optional[int] = None
+        litellm_completion_kwargs: Dict[str, Any] = {}
+
+    def compare_pair(
+        self,
+        comparison_prompt: str,
+        model: str,
+        item1: Dict,
+        item2: Dict,
+        timeout_seconds: int = 120,
+        max_retries_per_timeout: int = 2,
+    ) -> Tuple[bool, float]:
+        """
+        Compares two items using an LLM model to determine if they match.
+
+        Args:
+            comparison_prompt (str): The prompt template for comparison.
+            model (str): The LLM model to use for comparison.
+            item1 (Dict): The first item to compare.
+            item2 (Dict): The second item to compare.
+            timeout_seconds (int): The timeout for the LLM call in seconds.
+            max_retries_per_timeout (int): The maximum number of retries per timeout.
+
+        Returns:
+            Tuple[bool, float]: A tuple containing a boolean indicating whether the items match and the cost of the comparison.
+        """
+
+        try:
+            prompt = strict_render(comparison_prompt, {"left": item1, "right": item2})
+        except Exception as e:
+            self.console.log(f"[red]Error rendering prompt: {e}[/red]")
+            return False, 0
+        response = self.runner.api.call_llm(
+            model,
+            "compare",
+            [{"role": "user", "content": prompt}],
+            {"is_match": "bool"},
+            timeout_seconds=timeout_seconds,
+            max_retries_per_timeout=max_retries_per_timeout,
+            bypass_cache=self.config.get("bypass_cache", False),
+            litellm_completion_kwargs=self.config.get("litellm_completion_kwargs", {}),
+        )
+        cost = 0
+        try:
+            cost = response.total_cost
+            output = self.runner.api.parse_llm_response(
+                response.response, {"is_match": "bool"}
+            )[0]
+        except Exception as e:
+            self.console.log(f"[red]Error parsing LLM response: {e}[/red]")
+            return False, cost
+        return output["is_match"], cost
+
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the EquijoinOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing or if the blocking_keys structure is invalid.
+            Specifically:
+            - Raises if 'comparison_prompt' is missing from the config.
+            - Raises if 'left' or 'right' are missing from the 'blocking_keys' structure (if present).
+            - Raises if 'left' or 'right' are missing from the 'limits' structure (if present).
+        """
+        if "comparison_prompt" not in self.config:
+            raise ValueError(
+                "Missing required key 'comparison_prompt' in EquijoinOperation configuration"
+            )
+
+        if "blocking_keys" in self.config:
+            if (
+                "left" not in self.config["blocking_keys"]
+                or "right" not in self.config["blocking_keys"]
+            ):
+                raise ValueError(
+                    "Both 'left' and 'right' must be specified in 'blocking_keys'"
+                )
+
+        if "limits" in self.config:
+            if (
+                "left" not in self.config["limits"]
+                or "right" not in self.config["limits"]
+            ):
+                raise ValueError(
+                    "Both 'left' and 'right' must be specified in 'limits'"
+                )
+
+        if "limit_comparisons" in self.config:
+            if not isinstance(self.config["limit_comparisons"], int):
+                raise ValueError("limit_comparisons must be an integer")
+
+    def execute(
+        self, left_data: List[Dict], right_data: List[Dict]
+    ) -> Tuple[List[Dict], float]:
+        """
+        Executes the equijoin operation on the provided datasets.
+
+        Args:
+            left_data (List[Dict]): The left dataset to join.
+            right_data (List[Dict]): The right dataset to join.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the joined results and the total cost of the operation.
+
+        Usage:
+        ```python
+        from docetl.operations import EquijoinOperation
+
+        config = {
+            "blocking_keys": {
+                "left": ["id"],
+                "right": ["user_id"]
+            },
+            "limits": {
+                "left": 1,
+                "right": 1
+            },
+            "comparison_prompt": "Compare {{left}} and {{right}} and determine if they match.",
+            "blocking_threshold": 0.8,
+            "blocking_conditions": ["left['id'] == right['user_id']"],
+            "limit_comparisons": 1000
+        }
+        equijoin_op = EquijoinOperation(config)
+        left_data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+        right_data = [{"user_id": 1, "age": 30}, {"user_id": 2, "age": 25}]
+        results, cost = equijoin_op.execute(left_data, right_data)
+        print(f"Joined results: {results}")
+        print(f"Total cost: {cost}")
+        ```
+
+        This method performs the following steps:
+        1. Initial blocking based on specified conditions (if any)
+        2. Embedding-based blocking (if threshold is provided)
+        3. LLM-based comparison for blocked pairs
+        4. Result aggregation and validation
+
+        The method also calculates and logs statistics such as comparisons saved by blocking and join selectivity.
+        """
+
+        blocking_keys = self.config.get("blocking_keys", {})
+        left_keys = blocking_keys.get(
+            "left", list(left_data[0].keys()) if left_data else []
+        )
+        right_keys = blocking_keys.get(
+            "right", list(right_data[0].keys()) if right_data else []
+        )
+        limits = self.config.get(
+            "limits", {"left": float("inf"), "right": float("inf")}
+        )
+        left_limit = limits["left"]
+        right_limit = limits["right"]
+        blocking_threshold = self.config.get("blocking_threshold")
+        blocking_conditions = self.config.get("blocking_conditions", [])
+        limit_comparisons = self.config.get("limit_comparisons")
+        total_cost = 0
+
+        if len(left_data) == 0 or len(right_data) == 0:
+            return [], 0
+
+        if self.status:
+            self.status.stop()
+
+        # Initial blocking using multiprocessing
+        num_processes = min(cpu_count(), len(left_data))
+
+        self.console.log(
+            f"Starting to run code-based blocking rules for {len(left_data)} left and {len(right_data)} right rows ({len(left_data) * len(right_data)} total pairs) with {num_processes} processes..."
+        )
+
+        with Pool(
+            processes=num_processes,
+            initializer=init_worker,
+            initargs=(right_data, blocking_conditions),
+        ) as pool:
+            blocked_pairs_nested = pool.map(process_left_item, left_data)
+
+        # Flatten the nested list of blocked pairs
+        blocked_pairs = [pair for sublist in blocked_pairs_nested for pair in sublist]
+
+        # Check if we have exceeded the pairwise comparison limit
+        if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+            # Sample pairs based on cardinality and length
+            sampled_pairs = stratified_length_sample(
+                blocked_pairs, limit_comparisons, sample_size=1000, console=self.console
+            )
+
+            # Calculate number of dropped pairs
+            dropped_pairs = len(blocked_pairs) - limit_comparisons
+
+            # Prompt the user for confirmation
+            if self.status:
+                self.status.stop()
+            if not Confirm.ask(
+                f"[yellow]Warning: {dropped_pairs} pairs will be dropped due to the comparison limit. "
+                f"Proceeding with {limit_comparisons} randomly sampled pairs. "
+                f"Do you want to continue?[/yellow]",
+                console=self.console,
+            ):
+                raise ValueError("Operation cancelled by user due to pair limit.")
+
+            if self.status:
+                self.status.start()
+
+            blocked_pairs = sampled_pairs
+
+        self.console.log(
+            f"Number of blocked pairs after initial blocking: {len(blocked_pairs)}"
+        )
+
+        if blocking_threshold is not None:
+            embedding_model = self.config.get("embedding_model", self.default_model)
+            model_input_context_length = model_cost.get(embedding_model, {}).get(
+                "max_input_tokens", 8192
+            )
+
+            def get_embeddings(
+                input_data: List[Dict[str, Any]], keys: List[str], name: str
+            ) -> Tuple[List[List[float]], float]:
+                texts = [
+                    " ".join(str(item[key]) for key in keys if key in item)[
+                        : model_input_context_length * 4
+                    ]
+                    for item in input_data
+                ]
+
+                embeddings = []
+                total_cost = 0
+                batch_size = 2000
+                for i in range(0, len(texts), batch_size):
+                    batch = texts[i : i + batch_size]
+                    self.console.log(
+                        f"On iteration {i} for creating embeddings for {name} data"
+                    )
+                    response = self.runner.api.gen_embedding(
+                        model=embedding_model,
+                        input=batch,
+                    )
+                    embeddings.extend([data["embedding"] for data in response["data"]])
+                    total_cost += completion_cost(response)
+                return embeddings, total_cost
+
+            left_embeddings, left_cost = get_embeddings(left_data, left_keys, "left")
+            right_embeddings, right_cost = get_embeddings(
+                right_data, right_keys, "right"
+            )
+            total_cost += left_cost + right_cost
+            self.console.log(
+                f"Created embeddings for datasets. Total embedding creation cost: {total_cost}"
+            )
+
+            # Compute all cosine similarities in one call
+            from sklearn.metrics.pairwise import cosine_similarity
+
+            similarities = cosine_similarity(left_embeddings, right_embeddings)
+
+            # Additional blocking based on embeddings
+            # Find indices where similarity is above threshold
+            above_threshold = np.argwhere(similarities >= blocking_threshold)
+            self.console.log(
+                f"There are {above_threshold.shape[0]} pairs above the threshold."
+            )
+            block_pair_set = set(
+                (get_hashable_key(left_item), get_hashable_key(right_item))
+                for left_item, right_item in blocked_pairs
+            )
+
+            # If limit_comparisons is set, take only the top pairs
+            if limit_comparisons is not None:
+                # First, get all pairs above threshold
+                above_threshold_pairs = [(int(i), int(j)) for i, j in above_threshold]
+
+                # Sort these pairs by their similarity scores
+                sorted_pairs = sorted(
+                    above_threshold_pairs,
+                    key=lambda pair: similarities[pair[0], pair[1]],
+                    reverse=True,
+                )
+
+                # Take the top 'limit_comparisons' pairs
+                top_pairs = sorted_pairs[:limit_comparisons]
+
+                # Create new blocked_pairs based on top similarities and existing blocked pairs
+                new_blocked_pairs = []
+                remaining_limit = limit_comparisons - len(blocked_pairs)
+
+                # First, include all existing blocked pairs
+                final_blocked_pairs = blocked_pairs.copy()
+
+                # Then, add new pairs from top similarities until we reach the limit
+                for i, j in top_pairs:
+                    if remaining_limit <= 0:
+                        break
+                    left_item, right_item = left_data[i], right_data[j]
+                    left_key = get_hashable_key(left_item)
+                    right_key = get_hashable_key(right_item)
+                    if (left_key, right_key) not in block_pair_set:
+                        new_blocked_pairs.append((left_item, right_item))
+                        block_pair_set.add((left_key, right_key))
+                        remaining_limit -= 1
+
+                final_blocked_pairs.extend(new_blocked_pairs)
+                blocked_pairs = final_blocked_pairs
+
+                self.console.log(
+                    f"Limited comparisons to top {limit_comparisons} pairs, including {len(blocked_pairs) - len(new_blocked_pairs)} from code-based blocking and {len(new_blocked_pairs)} based on cosine similarity. Lowest cosine similarity included: {similarities[top_pairs[-1]]:.4f}"
+                )
+            else:
+                # Add new pairs to blocked_pairs
+                for i, j in above_threshold:
+                    left_item, right_item = left_data[i], right_data[j]
+                    left_key = get_hashable_key(left_item)
+                    right_key = get_hashable_key(right_item)
+                    if (left_key, right_key) not in block_pair_set:
+                        blocked_pairs.append((left_item, right_item))
+                        block_pair_set.add((left_key, right_key))
+
+        # If there are no blocking conditions or embedding threshold, use all pairs
+        if not blocking_conditions and blocking_threshold is None:
+            blocked_pairs = [
+                (left_item, right_item)
+                for left_item in left_data
+                for right_item in right_data
+            ]
+
+        # If there's a limit on the number of comparisons, randomly sample pairs
+        if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
+            self.console.log(
+                f"Randomly sampling {limit_comparisons} pairs out of {len(blocked_pairs)} blocked pairs."
+            )
+            blocked_pairs = random.sample(blocked_pairs, limit_comparisons)
+
+        self.console.log(
+            f"Total pairs to compare after blocking and sampling: {len(blocked_pairs)}"
+        )
+
+        # Calculate and print statistics
+        total_possible_comparisons = len(left_data) * len(right_data)
+        comparisons_made = len(blocked_pairs)
+        comparisons_saved = total_possible_comparisons - comparisons_made
+        self.console.log(
+            f"[green]Comparisons saved by blocking: {comparisons_saved} "
+            f"({(comparisons_saved / total_possible_comparisons) * 100:.2f}%)[/green]"
+        )
+
+        left_match_counts = defaultdict(int)
+        right_match_counts = defaultdict(int)
+        results = []
+        comparison_costs = 0
+
+        if self.status:
+            self.status.stop()
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            future_to_pair = {
+                executor.submit(
+                    self.compare_pair,
+                    self.config["comparison_prompt"],
+                    self.config.get("comparison_model", self.default_model),
+                    left,
+                    right,
+                    self.config.get("timeout", 120),
+                    self.config.get("max_retries_per_timeout", 2),
+                ): (left, right)
+                for left, right in blocked_pairs
+            }
+
+            pbar = RichLoopBar(
+                range(len(future_to_pair)),
+                desc="Comparing pairs",
+                console=self.console,
+            )
+
+            for i in pbar:
+                future = list(future_to_pair.keys())[i]
+                pair = future_to_pair[future]
+                is_match, cost = future.result()
+                comparison_costs += cost
+
+                if is_match:
+                    joined_item = {}
+                    left_item, right_item = pair
+                    left_key_hash = get_hashable_key(left_item)
+                    right_key_hash = get_hashable_key(right_item)
+                    if (
+                        left_match_counts[left_key_hash] >= left_limit
+                        or right_match_counts[right_key_hash] >= right_limit
+                    ):
+                        continue
+
+                    for key, value in left_item.items():
+                        joined_item[f"{key}_left" if key in right_item else key] = value
+                    for key, value in right_item.items():
+                        joined_item[f"{key}_right" if key in left_item else key] = value
+                    if self.runner.api.validate_output(
+                        self.config, joined_item, self.console
+                    ):
+                        results.append(joined_item)
+                        left_match_counts[left_key_hash] += 1
+                        right_match_counts[right_key_hash] += 1
+
+                    # TODO: support retry in validation failure
+
+        total_cost += comparison_costs
+
+        if self.status:
+            self.status.start()
+
+        # Calculate and print the join selectivity
+        join_selectivity = (
+            len(results) / (len(left_data) * len(right_data))
+            if len(left_data) * len(right_data) > 0
+            else 0
+        )
+        self.console.log(f"Equijoin selectivity: {join_selectivity:.4f}")
+
+        if self.status:
+            self.status.start()
+
+        return results, total_cost
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ compare_pair(comparison_prompt, model, item1, item2, timeout_seconds=120, max_retries_per_timeout=2) + +

+ + +
+ +

Compares two items using an LLM model to determine if they match.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ comparison_prompt + + str + +
+

The prompt template for comparison.

+
+
+ required +
+ model + + str + +
+

The LLM model to use for comparison.

+
+
+ required +
+ item1 + + Dict + +
+

The first item to compare.

+
+
+ required +
+ item2 + + Dict + +
+

The second item to compare.

+
+
+ required +
+ timeout_seconds + + int + +
+

The timeout for the LLM call in seconds.

+
+
+ 120 +
+ max_retries_per_timeout + + int + +
+

The maximum number of retries per timeout.

+
+
+ 2 +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[bool, float] + +
+

Tuple[bool, float]: A tuple containing a boolean indicating whether the items match and the cost of the comparison.

+
+
+ +
+ Source code in docetl/operations/equijoin.py +
 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
def compare_pair(
    self,
    comparison_prompt: str,
    model: str,
    item1: Dict,
    item2: Dict,
    timeout_seconds: int = 120,
    max_retries_per_timeout: int = 2,
) -> Tuple[bool, float]:
    """
    Compares two items using an LLM model to determine if they match.

    The prompt template is rendered with ``item1`` bound to ``left`` and
    ``item2`` bound to ``right``. Rendering failures and responses that
    cannot be parsed into an ``is_match`` value are logged to the console
    and reported as a non-match rather than raised, so a single bad pair
    does not abort the surrounding join.

    Args:
        comparison_prompt (str): The prompt template for comparison.
        model (str): The LLM model to use for comparison.
        item1 (Dict): The first item to compare.
        item2 (Dict): The second item to compare.
        timeout_seconds (int): The timeout for the LLM call in seconds.
        max_retries_per_timeout (int): The maximum number of retries per timeout.

    Returns:
        Tuple[bool, float]: A tuple containing a boolean indicating whether the items match and the cost of the comparison.
    """

    try:
        prompt = strict_render(comparison_prompt, {"left": item1, "right": item2})
    except Exception as e:
        # No LLM call was made, so the cost of this comparison is zero.
        self.console.log(f"[red]Error rendering prompt: {e}[/red]")
        return False, 0

    response = self.runner.api.call_llm(
        model,
        "compare",
        [{"role": "user", "content": prompt}],
        {"is_match": "bool"},
        timeout_seconds=timeout_seconds,
        max_retries_per_timeout=max_retries_per_timeout,
        bypass_cache=self.config.get("bypass_cache", False),
        litellm_completion_kwargs=self.config.get("litellm_completion_kwargs", {}),
    )

    cost = 0
    try:
        # Read the cost first so it is still reported if parsing fails below.
        cost = response.total_cost
        output = self.runner.api.parse_llm_response(
            response.response, {"is_match": "bool"}
        )[0]
        # Looked up inside the try: a parsed response without the key is
        # handled like any other malformed response instead of raising.
        is_match = output["is_match"]
    except Exception as e:
        self.console.log(f"[red]Error parsing LLM response: {e}[/red]")
        return False, cost
    return is_match, cost
+
+
+
+ +
+ +
+ + +

+ execute(left_data, right_data) + +

+ + +
+ +

Executes the equijoin operation on the provided datasets.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ left_data + + List[Dict] + +
+

The left dataset to join.

+
+
+ required +
+ right_data + + List[Dict] + +
+

The right dataset to join.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the joined results and the total cost of the operation.

+
+
+

Usage: +

from docetl.operations import EquijoinOperation
+
+config = {
+    "blocking_keys": {
+        "left": ["id"],
+        "right": ["user_id"]
+    },
+    "limits": {
+        "left": 1,
+        "right": 1
+    },
+    "comparison_prompt": "Compare {{left}} and {{right}} and determine if they match.",
+    "blocking_threshold": 0.8,
+    "blocking_conditions": ["left['id'] == right['user_id']"],
+    "limit_comparisons": 1000
+}
+equijoin_op = EquijoinOperation(config)
+left_data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+right_data = [{"user_id": 1, "age": 30}, {"user_id": 2, "age": 25}]
+results, cost = equijoin_op.execute(left_data, right_data)
+print(f"Joined results: {results}")
+print(f"Total cost: {cost}")
+

+

This method performs the following steps:

1. Initial blocking based on specified conditions (if any)
2. Embedding-based blocking (if threshold is provided)
3. LLM-based comparison for blocked pairs
4. Result aggregation and validation

+

The method also calculates and logs statistics such as comparisons saved by blocking and join selectivity.

+ +
+ Source code in docetl/operations/equijoin.py +
163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
def execute(
    self, left_data: List[Dict], right_data: List[Dict]
) -> Tuple[List[Dict], float]:
    """
    Executes the equijoin operation on the provided datasets.

    This method performs the following steps:

    1. Initial blocking based on specified conditions (if any)
    2. Embedding-based blocking (if threshold is provided)
    3. LLM-based comparison for blocked pairs
    4. Result aggregation and validation

    The method also calculates and logs statistics such as comparisons saved by blocking and join selectivity.

    Args:
        left_data (List[Dict]): The left dataset to join.
        right_data (List[Dict]): The right dataset to join.

    Returns:
        Tuple[List[Dict], float]: A tuple containing the joined results and the total cost of the operation.

    Raises:
        ValueError: If the user declines to continue when the code-based
            blocking step produces more pairs than ``limit_comparisons``.

    Usage:
    ```python
    from docetl.operations import EquijoinOperation

    config = {
        "blocking_keys": {
            "left": ["id"],
            "right": ["user_id"]
        },
        "limits": {
            "left": 1,
            "right": 1
        },
        "comparison_prompt": "Compare {{left}} and {{right}} and determine if they match.",
        "blocking_threshold": 0.8,
        "blocking_conditions": ["left['id'] == right['user_id']"],
        "limit_comparisons": 1000
    }
    equijoin_op = EquijoinOperation(config)
    left_data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    right_data = [{"user_id": 1, "age": 30}, {"user_id": 2, "age": 25}]
    results, cost = equijoin_op.execute(left_data, right_data)
    print(f"Joined results: {results}")
    print(f"Total cost: {cost}")
    ```
    """

    blocking_keys = self.config.get("blocking_keys", {})
    # Without explicit blocking keys, every key of the first row is embedded.
    left_keys = blocking_keys.get(
        "left", list(left_data[0].keys()) if left_data else []
    )
    right_keys = blocking_keys.get(
        "right", list(right_data[0].keys()) if right_data else []
    )
    limits = self.config.get(
        "limits", {"left": float("inf"), "right": float("inf")}
    )
    left_limit = limits["left"]
    right_limit = limits["right"]
    blocking_threshold = self.config.get("blocking_threshold")
    blocking_conditions = self.config.get("blocking_conditions", [])
    limit_comparisons = self.config.get("limit_comparisons")
    total_cost = 0

    # Nothing to join; this also keeps the percentage computations below
    # from dividing by zero.
    if len(left_data) == 0 or len(right_data) == 0:
        return [], 0

    if self.status:
        self.status.stop()

    # Initial blocking using multiprocessing
    num_processes = min(cpu_count(), len(left_data))

    self.console.log(
        f"Starting to run code-based blocking rules for {len(left_data)} left and {len(right_data)} right rows ({len(left_data) * len(right_data)} total pairs) with {num_processes} processes..."
    )

    with Pool(
        processes=num_processes,
        initializer=init_worker,
        initargs=(right_data, blocking_conditions),
    ) as pool:
        blocked_pairs_nested = pool.map(process_left_item, left_data)

    # Flatten the nested list of blocked pairs
    blocked_pairs = [pair for sublist in blocked_pairs_nested for pair in sublist]

    # Check if we have exceeded the pairwise comparison limit
    if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
        # Sample pairs based on cardinality and length
        sampled_pairs = stratified_length_sample(
            blocked_pairs, limit_comparisons, sample_size=1000, console=self.console
        )

        # Calculate number of dropped pairs
        dropped_pairs = len(blocked_pairs) - limit_comparisons

        # Prompt the user for confirmation
        if self.status:
            self.status.stop()
        if not Confirm.ask(
            f"[yellow]Warning: {dropped_pairs} pairs will be dropped due to the comparison limit. "
            f"Proceeding with {limit_comparisons} randomly sampled pairs. "
            f"Do you want to continue?[/yellow]",
            console=self.console,
        ):
            raise ValueError("Operation cancelled by user due to pair limit.")

        if self.status:
            self.status.start()

        blocked_pairs = sampled_pairs

    self.console.log(
        f"Number of blocked pairs after initial blocking: {len(blocked_pairs)}"
    )

    if blocking_threshold is not None:
        embedding_model = self.config.get("embedding_model", self.default_model)
        model_input_context_length = model_cost.get(embedding_model, {}).get(
            "max_input_tokens", 8192
        )

        def get_embeddings(
            input_data: List[Dict[str, Any]], keys: List[str], name: str
        ) -> Tuple[List[List[float]], float]:
            # Each row becomes the space-joined values of its blocking keys,
            # truncated to four characters per input token of the model.
            texts = [
                " ".join(str(item[key]) for key in keys if key in item)[
                    : model_input_context_length * 4
                ]
                for item in input_data
            ]

            embeddings = []
            embedding_cost = 0
            batch_size = 2000
            for i in range(0, len(texts), batch_size):
                batch = texts[i : i + batch_size]
                self.console.log(
                    f"On iteration {i} for creating embeddings for {name} data"
                )
                response = self.runner.api.gen_embedding(
                    model=embedding_model,
                    input=batch,
                )
                embeddings.extend([data["embedding"] for data in response["data"]])
                embedding_cost += completion_cost(response)
            return embeddings, embedding_cost

        left_embeddings, left_cost = get_embeddings(left_data, left_keys, "left")
        right_embeddings, right_cost = get_embeddings(
            right_data, right_keys, "right"
        )
        total_cost += left_cost + right_cost
        self.console.log(
            f"Created embeddings for datasets. Total embedding creation cost: {total_cost}"
        )

        # Compute all cosine similarities in one call
        from sklearn.metrics.pairwise import cosine_similarity

        similarities = cosine_similarity(left_embeddings, right_embeddings)

        # Additional blocking based on embeddings
        # Find indices where similarity is above threshold
        above_threshold = np.argwhere(similarities >= blocking_threshold)
        self.console.log(
            f"There are {above_threshold.shape[0]} pairs above the threshold."
        )
        # Keys of pairs already selected, so embedding-based blocking does
        # not add the same pair twice.
        block_pair_set = set(
            (get_hashable_key(left_item), get_hashable_key(right_item))
            for left_item, right_item in blocked_pairs
        )

        # If limit_comparisons is set, take only the top pairs
        if limit_comparisons is not None:
            # First, get all pairs above threshold
            above_threshold_pairs = [(int(i), int(j)) for i, j in above_threshold]

            # Sort these pairs by their similarity scores
            sorted_pairs = sorted(
                above_threshold_pairs,
                key=lambda pair: similarities[pair[0], pair[1]],
                reverse=True,
            )

            # Take the top 'limit_comparisons' pairs
            top_pairs = sorted_pairs[:limit_comparisons]

            # Create new blocked_pairs based on top similarities and existing blocked pairs
            new_blocked_pairs = []
            remaining_limit = limit_comparisons - len(blocked_pairs)

            # First, include all existing blocked pairs
            final_blocked_pairs = blocked_pairs.copy()

            # Then, add new pairs from top similarities until we reach the limit
            for i, j in top_pairs:
                if remaining_limit <= 0:
                    break
                left_item, right_item = left_data[i], right_data[j]
                left_key = get_hashable_key(left_item)
                right_key = get_hashable_key(right_item)
                if (left_key, right_key) not in block_pair_set:
                    new_blocked_pairs.append((left_item, right_item))
                    block_pair_set.add((left_key, right_key))
                    remaining_limit -= 1

            final_blocked_pairs.extend(new_blocked_pairs)
            blocked_pairs = final_blocked_pairs

            limit_summary = (
                f"Limited comparisons to top {limit_comparisons} pairs, including {len(blocked_pairs) - len(new_blocked_pairs)} from code-based blocking and {len(new_blocked_pairs)} based on cosine similarity."
            )
            # top_pairs is empty when no pair clears the threshold; indexing
            # top_pairs[-1] unconditionally would raise IndexError.
            if top_pairs:
                limit_summary += f" Lowest cosine similarity included: {similarities[top_pairs[-1]]:.4f}"
            self.console.log(limit_summary)
        else:
            # Add new pairs to blocked_pairs
            for i, j in above_threshold:
                left_item, right_item = left_data[i], right_data[j]
                left_key = get_hashable_key(left_item)
                right_key = get_hashable_key(right_item)
                if (left_key, right_key) not in block_pair_set:
                    blocked_pairs.append((left_item, right_item))
                    block_pair_set.add((left_key, right_key))

    # If there are no blocking conditions or embedding threshold, use all pairs
    if not blocking_conditions and blocking_threshold is None:
        blocked_pairs = [
            (left_item, right_item)
            for left_item in left_data
            for right_item in right_data
        ]

    # If there's a limit on the number of comparisons, randomly sample pairs
    if limit_comparisons is not None and len(blocked_pairs) > limit_comparisons:
        self.console.log(
            f"Randomly sampling {limit_comparisons} pairs out of {len(blocked_pairs)} blocked pairs."
        )
        blocked_pairs = random.sample(blocked_pairs, limit_comparisons)

    self.console.log(
        f"Total pairs to compare after blocking and sampling: {len(blocked_pairs)}"
    )

    # Calculate and print statistics
    total_possible_comparisons = len(left_data) * len(right_data)
    comparisons_made = len(blocked_pairs)
    comparisons_saved = total_possible_comparisons - comparisons_made
    self.console.log(
        f"[green]Comparisons saved by blocking: {comparisons_saved} "
        f"({(comparisons_saved / total_possible_comparisons) * 100:.2f}%)[/green]"
    )

    left_match_counts = defaultdict(int)
    right_match_counts = defaultdict(int)
    results = []
    comparison_costs = 0

    # The status may have been restarted after the confirmation prompt above.
    if self.status:
        self.status.stop()

    with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
        future_to_pair = {
            executor.submit(
                self.compare_pair,
                self.config["comparison_prompt"],
                self.config.get("comparison_model", self.default_model),
                left,
                right,
                self.config.get("timeout", 120),
                self.config.get("max_retries_per_timeout", 2),
            ): (left, right)
            for left, right in blocked_pairs
        }

        # Materialize the futures once, in submission order; rebuilding the
        # key list on every iteration made the loop quadratic.
        futures = list(future_to_pair)

        pbar = RichLoopBar(
            range(len(futures)),
            desc="Comparing pairs",
            console=self.console,
        )

        for i in pbar:
            future = futures[i]
            pair = future_to_pair[future]
            is_match, cost = future.result()
            comparison_costs += cost

            if is_match:
                joined_item = {}
                left_item, right_item = pair
                left_key_hash = get_hashable_key(left_item)
                right_key_hash = get_hashable_key(right_item)
                # Skip matches for rows that already reached their limit.
                if (
                    left_match_counts[left_key_hash] >= left_limit
                    or right_match_counts[right_key_hash] >= right_limit
                ):
                    continue

                # Keys present on both sides are disambiguated with
                # _left / _right suffixes.
                for key, value in left_item.items():
                    joined_item[f"{key}_left" if key in right_item else key] = value
                for key, value in right_item.items():
                    joined_item[f"{key}_right" if key in left_item else key] = value
                if self.runner.api.validate_output(
                    self.config, joined_item, self.console
                ):
                    results.append(joined_item)
                    left_match_counts[left_key_hash] += 1
                    right_match_counts[right_key_hash] += 1

                # TODO: support retry in validation failure

    total_cost += comparison_costs

    # Calculate and print the join selectivity
    join_selectivity = (
        len(results) / total_possible_comparisons
        if total_possible_comparisons > 0
        else 0
    )
    self.console.log(f"Equijoin selectivity: {join_selectivity:.4f}")

    if self.status:
        self.status.start()

    return results, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the EquijoinOperation for required keys and valid structure.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or if the blocking_keys structure is invalid.

+
+
+ Specifically + +
+ +
+
+ +
+ Source code in docetl/operations/equijoin.py +
125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
def syntax_check(self) -> None:
    """
    Checks the configuration of the EquijoinOperation for required keys and valid structure.

    Raises:
        ValueError: If the configuration is invalid. This is raised when
            'comparison_prompt' is missing from the config; when
            'blocking_keys' is present without both 'left' and 'right';
            when 'limits' is present without both 'left' and 'right'; or
            when 'limit_comparisons' is present but is not a non-negative
            integer.
    """
    if "comparison_prompt" not in self.config:
        raise ValueError(
            "Missing required key 'comparison_prompt' in EquijoinOperation configuration"
        )

    if "blocking_keys" in self.config:
        if (
            "left" not in self.config["blocking_keys"]
            or "right" not in self.config["blocking_keys"]
        ):
            raise ValueError(
                "Both 'left' and 'right' must be specified in 'blocking_keys'"
            )

    if "limits" in self.config:
        if (
            "left" not in self.config["limits"]
            or "right" not in self.config["limits"]
        ):
            raise ValueError(
                "Both 'left' and 'right' must be specified in 'limits'"
            )

    if "limit_comparisons" in self.config:
        limit_comparisons = self.config["limit_comparisons"]
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(limit_comparisons, bool) or not isinstance(
            limit_comparisons, int
        ):
            raise ValueError("limit_comparisons must be an integer")
        # A negative limit cannot be used as a sample size.
        if limit_comparisons < 0:
            raise ValueError("limit_comparisons must be non-negative")
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.cluster.ClusterOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + + + + + + +
+ Source code in docetl/operations/cluster.py +
 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
class ClusterOperation(BaseOperation):
    """
    Cluster input documents hierarchically and summarize every cluster with an LLM.

    ``execute`` embeds the documents, builds a binary agglomerative clustering
    tree over the embeddings, optionally merges tree levels whose merge
    distance is close to their parent's, asks the LLM for a summary of every
    internal node (children first), and finally stores on each document the
    chain of its summarized ancestors under the configured ``output_key``
    (default ``"clusters"``).
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        # Falls back to "unbounded" (float("inf")) when neither the config nor
        # the keyword arguments provide a limit; it is used as the worker cap
        # of the thread pools created in ``annotate_clustering_tree``.
        self.max_batch_size: int = self.config.get(
            "max_batch_size", kwargs.get("max_batch_size", float("inf"))
        )

    def syntax_check(self) -> None:
        """
        Checks the configuration of the ClusterOperation for required keys and valid structure.

        Raises:
            ValueError: If required keys are missing or invalid in the configuration.
            TypeError: If configuration values have incorrect types.
        """
        required_keys = ["embedding_keys", "summary_schema", "summary_prompt"]
        for key in required_keys:
            if key not in self.config:
                raise ValueError(
                    f"Missing required key '{key}' in ClusterOperation configuration"
                )

        if not isinstance(self.config["embedding_keys"], list):
            raise TypeError("'embedding_keys' must be a list of strings")

        if "output_key" in self.config:
            if not isinstance(self.config["output_key"], str):
                raise TypeError("'output_key' must be a string")

        if not isinstance(self.config["summary_schema"], dict):
            raise TypeError("'summary_schema' must be a dictionary")

        # The messages name the actual config key so users can find it.
        if not isinstance(self.config["summary_prompt"], str):
            raise TypeError("'summary_prompt' must be a string")

        # Check if the prompt is a valid Jinja2 template
        try:
            Template(self.config["summary_prompt"])
        except Exception as e:
            raise ValueError(
                f"Invalid Jinja2 template in 'summary_prompt': {str(e)}"
            ) from e

        # Check optional parameters
        if "max_batch_size" in self.config:
            if not isinstance(self.config["max_batch_size"], int):
                raise TypeError("'max_batch_size' must be an integer")

        if "embedding_model" in self.config:
            if not isinstance(self.config["embedding_model"], str):
                raise TypeError("'embedding_model' must be a string")

        if "model" in self.config:
            if not isinstance(self.config["model"], str):
                raise TypeError("'model' must be a string")

        if "validate" in self.config:
            if not isinstance(self.config["validate"], list):
                raise TypeError("'validate' must be a list of strings")
            for rule in self.config["validate"]:
                if not isinstance(rule, str):
                    raise TypeError("Each validation rule must be a string")

    def execute(
        self, input_data: List[Dict], is_build: bool = False
    ) -> Tuple[List[Dict], float]:
        """
        Executes the cluster operation on the input data. The input
        dictionaries are modified in place and the same list is returned.

        Args:
            input_data (List[Dict]): A list of dictionaries to process.
            is_build (bool): Whether the operation is being executed
              in the build phase. Defaults to False. Not read by this method.

        Returns:
            Tuple[List[Dict], float]: A tuple containing the clustered
              list of dictionaries and the total cost of the operation.
        """
        if not input_data:
            return input_data, 0

        # A single document cannot be clustered: give it an empty ancestor path.
        if len(input_data) == 1:
            input_data[0][self.config.get("output_key", "clusters")] = ()
            return input_data, 0

        embeddings, cost = get_embeddings_for_clustering(
            input_data, self.config, self.runner.api
        )

        tree = self.agglomerative_cluster_of_embeddings(input_data, embeddings)

        if "collapse" in self.config:
            tree = self.collapse_tree(tree, collapse=self.config["collapse"])

        self.prompt_template = Template(self.config["summary_prompt"])
        cost += self.annotate_clustering_tree(tree)
        self.annotate_leaves(tree)

        return input_data, cost

    def agglomerative_cluster_of_embeddings(self, input_data, embeddings):
        """
        Build a binary cluster tree over the documents from their embeddings.

        Leaves of the returned tree are the original ``input_data``
        dictionaries themselves (not copies); every internal node is a dict
        with ``"children"`` (two subtrees) and ``"distance"`` (the merge
        distance reported by scikit-learn for that node).
        """
        # Imported lazily so scikit-learn is only needed when clustering runs.
        import sklearn.cluster

        cl = sklearn.cluster.AgglomerativeClustering(
            compute_full_tree=True, compute_distances=True
        )
        cl.fit(embeddings)

        nsamples = len(embeddings)

        def build_tree(i):
            # Node ids below nsamples are leaves; id ``i >= nsamples`` is the
            # merge stored in row ``i - nsamples`` of children_ / distances_.
            if i < nsamples:
                return input_data[i]
            return {
                "children": [
                    build_tree(cl.children_[i - nsamples, 0]),
                    build_tree(cl.children_[i - nsamples, 1]),
                ],
                "distance": cl.distances_[i - nsamples],
            }

        # The last merge row is the root of the tree.
        return build_tree(nsamples + len(cl.children_) - 1)

    def get_tree_distances(self, t):
        """
        Collect the set of distance gaps between every internal node and each
        of its internal children (``parent distance - child distance``) in the
        subtree rooted at ``t``.
        """
        res = set()
        if "distance" in t:
            res.update(
                {
                    t["distance"] - child["distance"]
                    for child in t["children"]
                    if "distance" in child
                }
            )
        if "children" in t:
            for child in t["children"]:
                res.update(self.get_tree_distances(child))
        return res

    def _collapse_tree(self, t, parent_dist=None, collapse=None):
        """
        Return the list of nodes that replace ``t`` in its parent's children.

        An internal node whose distance gap to its (surviving) parent is below
        ``collapse`` is dissolved: its children are hoisted into the parent.
        Otherwise a shallow copy of the node with recursively collapsed
        children is returned. Leaves are returned unchanged (same object).
        """
        if "children" in t:
            if (
                "distance" in t
                and parent_dist is not None
                and collapse is not None
                and parent_dist - t["distance"] < collapse
            ):
                # Dissolve this node; gaps below it are still measured
                # against the surviving ancestor's distance.
                return [
                    grandchild
                    for child in t["children"]
                    for grandchild in self._collapse_tree(
                        child, parent_dist=parent_dist, collapse=collapse
                    )
                ]
            else:
                res = dict(t)
                res["children"] = [
                    grandchild
                    for child in t["children"]
                    for grandchild in self._collapse_tree(
                        child, parent_dist=t["distance"], collapse=collapse
                    )
                ]
                return [res]
        else:
            return [t]

    def collapse_tree(self, tree, collapse=None):
        """
        Collapse tree levels that are close to their parent.

        ``collapse`` is interpreted as a fraction: it selects, by position in
        the sorted set of parent/child distance gaps, the gap value used as
        the collapse threshold. ``None`` (or a tree without any gaps, e.g. a
        root with only leaf children) leaves the structure uncollapsed.
        """
        if collapse is not None:
            tree_distances = np.array(sorted(self.get_tree_distances(tree)))
            if len(tree_distances) == 0:
                # Nothing to compare against: no node can be collapsed.
                collapse = None
            else:
                # Clamp so that collapse == 1.0 (or any out-of-range value)
                # selects a valid threshold instead of raising IndexError.
                idx = int(len(tree_distances) * collapse)
                idx = min(max(idx, 0), len(tree_distances) - 1)
                collapse = tree_distances[idx]
        return self._collapse_tree(tree, collapse=collapse)[0]

    def annotate_clustering_tree(self, t):
        """
        Summarize every internal node of the subtree rooted at ``t``.

        Children are annotated first (concurrently), so their summaries are
        present when the prompt for ``t`` is rendered with
        ``{"inputs": t["children"]}``. A validated LLM output is merged into
        ``t``. Returns the accumulated cost; leaves cost 0.
        """
        if "children" in t:
            with ThreadPoolExecutor(max_workers=self.max_batch_size) as executor:
                futures = [
                    executor.submit(self.annotate_clustering_tree, child)
                    for child in t["children"]
                ]

                total_cost = 0
                pbar = RichLoopBar(
                    range(len(futures)),
                    desc=f"Processing {self.config['name']} (map) on all documents",
                    console=self.console,
                )
                for i in pbar:
                    total_cost += futures[i].result()
                    pbar.update(i)

            prompt = strict_render(self.prompt_template, {"inputs": t["children"]})

            def validation_fn(response: Dict[str, Any]):
                # Parse the raw response and report whether it passes the
                # configured validation rules.
                output = self.runner.api.parse_llm_response(
                    response,
                    schema=self.config["summary_schema"],
                    manually_fix_errors=self.manually_fix_errors,
                )[0]
                if self.runner.api.validate_output(self.config, output, self.console):
                    return output, True
                return output, False

            response = self.runner.api.call_llm(
                model=self.config.get("model", self.default_model),
                op_type="cluster",
                messages=[{"role": "user", "content": prompt}],
                output_schema=self.config["summary_schema"],
                timeout_seconds=self.config.get("timeout", 120),
                bypass_cache=self.config.get("bypass_cache", False),
                max_retries_per_timeout=self.config.get("max_retries_per_timeout", 2),
                # Validation is only wired in when rules are configured.
                validation_config=(
                    {
                        "num_retries": self.num_retries_on_validate_failure,
                        "val_rule": self.config.get("validate", []),
                        "validation_fn": validation_fn,
                    }
                    if self.config.get("validate", None)
                    else None
                ),
                verbose=self.config.get("verbose", False),
                litellm_completion_kwargs=self.config.get(
                    "litellm_completion_kwargs", {}
                ),
            )
            total_cost += response.total_cost
            if response.validated:
                output = self.runner.api.parse_llm_response(
                    response.response,
                    schema=self.config["summary_schema"],
                    manually_fix_errors=self.manually_fix_errors,
                )[0]
                t.update(output)

            return total_cost
        return 0

    def annotate_leaves(self, tree, path=()):
        """
        Store on every leaf the tuple of its ancestors, nearest ancestor first.

        Each ancestor is a copy of the internal node without its
        ``"children"`` entry; the tuple is written under the configured
        ``output_key`` (default ``"clusters"``).
        """
        if "children" in tree:
            item = dict(tree)
            item.pop("children")
            for child in tree["children"]:
                self.annotate_leaves(child, path=(item,) + path)
        else:
            tree[self.config.get("output_key", "clusters")] = path
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data, is_build=False) + +

+ + +
+ +

Executes the cluster operation on the input data. Modifies the input data in place and returns it.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ input_data + + List[Dict] + +
+

A list of dictionaries to process.

+
+
+ required +
+ is_build + + bool + +
+

Whether the operation is being executed in the build phase. Defaults to False.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

A tuple containing the clustered list of dictionaries and the total cost of the operation.

+
+
+ +
+ Source code in docetl/operations/cluster.py +
 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
def execute(
    self, input_data: List[Dict], is_build: bool = False
) -> Tuple[List[Dict], float]:
    """
    Run the cluster operation over ``input_data``.

    The dictionaries in ``input_data`` are annotated in place and the very
    same list object is handed back to the caller.

    Args:
        input_data (List[Dict]): The documents to cluster.
        is_build (bool): Flag for the build phase. Defaults to False.

    Returns:
        Tuple[List[Dict], float]: The (annotated) input list together with
          the total cost accumulated by embedding and summarization.
    """
    # Nothing to do for empty input.
    if not input_data:
        return input_data, 0

    # One document has no cluster hierarchy: record an empty ancestor path.
    if len(input_data) == 1:
        sole_document = input_data[0]
        sole_document[self.config.get("output_key", "clusters")] = ()
        return input_data, 0

    vectors, total_cost = get_embeddings_for_clustering(
        input_data, self.config, self.runner.api
    )
    hierarchy = self.agglomerative_cluster_of_embeddings(input_data, vectors)

    if "collapse" in self.config:
        hierarchy = self.collapse_tree(hierarchy, collapse=self.config["collapse"])

    # The template must be in place before the tree is summarized.
    self.prompt_template = Template(self.config["summary_prompt"])
    total_cost += self.annotate_clustering_tree(hierarchy)
    self.annotate_leaves(hierarchy)

    return input_data, total_cost
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks the configuration of the ClusterOperation for required keys and valid structure.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or invalid in the configuration.

+
+
+ TypeError + +
+

If configuration values have incorrect types.

+
+
+ +
+ Source code in docetl/operations/cluster.py +
23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
def syntax_check(self) -> None:
    """
    Checks the configuration of the ClusterOperation for required keys and valid structure.

    Raises:
        ValueError: If required keys are missing or invalid in the configuration.
        TypeError: If configuration values have incorrect types.
    """
    required_keys = ["embedding_keys", "summary_schema", "summary_prompt"]
    for key in required_keys:
        if key not in self.config:
            raise ValueError(
                f"Missing required key '{key}' in ClusterOperation configuration"
            )

    if not isinstance(self.config["embedding_keys"], list):
        raise TypeError("'embedding_keys' must be a list of strings")

    if "output_key" in self.config:
        if not isinstance(self.config["output_key"], str):
            raise TypeError("'output_key' must be a string")

    if not isinstance(self.config["summary_schema"], dict):
        raise TypeError("'summary_schema' must be a dictionary")

    # The messages name the actual config key so users can find it.
    if not isinstance(self.config["summary_prompt"], str):
        raise TypeError("'summary_prompt' must be a string")

    # Check if the prompt is a valid Jinja2 template
    try:
        Template(self.config["summary_prompt"])
    except Exception as e:
        # Chain the cause so the original template error stays visible.
        raise ValueError(
            f"Invalid Jinja2 template in 'summary_prompt': {str(e)}"
        ) from e

    # Check optional parameters
    if "max_batch_size" in self.config:
        if not isinstance(self.config["max_batch_size"], int):
            raise TypeError("'max_batch_size' must be an integer")

    if "embedding_model" in self.config:
        if not isinstance(self.config["embedding_model"], str):
            raise TypeError("'embedding_model' must be a string")

    if "model" in self.config:
        if not isinstance(self.config["model"], str):
            raise TypeError("'model' must be a string")

    if "validate" in self.config:
        if not isinstance(self.config["validate"], list):
            raise TypeError("'validate' must be a list of strings")
        for rule in self.config["validate"]:
            if not isinstance(rule, str):
                raise TypeError("Each validation rule must be a string")
+
+
+
+ +
+ + + +
+ +
+ +

Auxiliary Operators

+ + +
+ + + +

+ docetl.operations.split.SplitOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +

A class that implements a split operation on input data, dividing it into manageable chunks.

+

This class extends BaseOperation to: +1. Split input data into chunks of specified size based on the 'split_key' and 'token_count' configuration. +2. Assign unique identifiers to each original document and number chunks sequentially. +3. Return results containing: + - {split_key}_chunk: The content of the split chunk. + - {name}_id: A unique identifier for each original document. + - {name}_chunk_num: The sequential number of the chunk within its original document.

+ + + + + + +
+ Source code in docetl/operations/split.py +
  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
class SplitOperation(BaseOperation):
    """
    A class that implements a split operation on input data, dividing it into manageable chunks.

    This class extends BaseOperation to:
    1. Split the text found under 'split_key' either into runs of a fixed number of
       tokens (method 'token_count') or on a delimiter (method 'delimiter').
    2. Assign unique identifiers to each original document and number chunks sequentially.
    3. Return results containing:
       - {split_key}_chunk: The content of the split chunk.
       - {name}_id: A unique identifier for each original document.
       - {name}_chunk_num: The sequential number of the chunk within its original document.
    """

    class schema(BaseOperation.schema):
        type: str = "split"
        split_key: str
        method: str
        method_kwargs: Dict[str, Any]
        model: Optional[str] = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Used as the prefix of the generated id / chunk-number keys.
        self.name = self.config["name"]

    def syntax_check(self) -> None:
        """
        Validate the split configuration.

        Raises:
            ValueError: If a required key is missing, the method is unknown,
                or the method-specific option ('num_tokens' / 'delimiter') is
                missing or invalid.
            TypeError: If 'split_key' is not a string or 'method_kwargs' is
                not a dictionary.
        """
        required_keys = ["split_key", "method", "method_kwargs"]
        for key in required_keys:
            if key not in self.config:
                raise ValueError(
                    f"Missing required key '{key}' in SplitOperation configuration"
                )

        if not isinstance(self.config["split_key"], str):
            raise TypeError("'split_key' must be a string")

        method = self.config["method"]
        if method not in ["token_count", "delimiter"]:
            raise ValueError(f"Invalid method '{self.config['method']}'")

        method_kwargs = self.config["method_kwargs"]
        if not isinstance(method_kwargs, dict):
            raise TypeError("'method_kwargs' must be a dictionary")

        # ``.get`` so that a missing option is reported as a configuration
        # error instead of surfacing as a bare KeyError.
        if method == "token_count":
            num_tokens = method_kwargs.get("num_tokens")
            if not isinstance(num_tokens, int) or num_tokens <= 0:
                raise ValueError("'num_tokens' must be a positive integer")
        elif method == "delimiter":
            delimiter = method_kwargs.get("delimiter")
            if not isinstance(delimiter, str):
                raise ValueError("'delimiter' must be a string")
            if not delimiter:
                # str.split("") raises "empty separator" at execution time.
                raise ValueError("'delimiter' must be a non-empty string")

    def _get_encoder(self):
        """Return the tiktoken encoder for the configured model, or the gpt-4o one as a fallback."""
        try:
            return tiktoken.encoding_for_model(
                self.config["method_kwargs"]
                .get("model", self.default_model)
                .split("/")[-1]
            )
        except Exception:
            # Deliberate best-effort: any failure to resolve the model's
            # encoding falls back to the gpt-4o encoding.
            return tiktoken.encoding_for_model("gpt-4o")

    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
        """
        Split every input item into chunks.

        Each produced record is a shallow copy of its source item extended
        with '{split_key}_chunk', '{name}_id' (one fresh UUID per source item)
        and '{name}_chunk_num' (1-based).

        Args:
            input_data (List[Dict]): Items holding the text under 'split_key'.

        Returns:
            Tuple[List[Dict], float]: The chunk records and the cost, which is
              always 0.0 for this operation.

        Raises:
            KeyError: If an item does not contain 'split_key'.
        """
        split_key = self.config["split_key"]
        method = self.config["method"]
        method_kwargs = self.config["method_kwargs"]

        # The tokenizer is only needed (and only built) for token splitting.
        encoder = self._get_encoder() if method == "token_count" else None

        results = []
        cost = 0.0

        for item in input_data:
            if split_key not in item:
                raise KeyError(f"Split key '{split_key}' not found in item")

            content = item[split_key]
            doc_id = str(uuid.uuid4())

            if method == "token_count":
                token_count = method_kwargs["num_tokens"]
                tokens = encoder.encode(content)

                for chunk_num, i in enumerate(
                    range(0, len(tokens), token_count), start=1
                ):
                    chunk_tokens = tokens[i : i + token_count]
                    chunk = encoder.decode(chunk_tokens)

                    result = item.copy()
                    result.update(
                        {
                            f"{split_key}_chunk": chunk,
                            f"{self.name}_id": doc_id,
                            f"{self.name}_chunk_num": chunk_num,
                        }
                    )
                    results.append(result)

            elif method == "delimiter":
                delimiter = method_kwargs["delimiter"]
                num_splits_to_group = method_kwargs.get("num_splits_to_group", 1)
                chunks = content.split(delimiter)

                # Get rid of empty chunks
                chunks = [chunk for chunk in chunks if chunk.strip()]

                # Re-join every ``num_splits_to_group`` consecutive pieces.
                for chunk_num, i in enumerate(
                    range(0, len(chunks), num_splits_to_group), start=1
                ):
                    grouped_chunks = chunks[i : i + num_splits_to_group]
                    joined_chunk = delimiter.join(grouped_chunks).strip()

                    result = item.copy()
                    result.update(
                        {
                            f"{split_key}_chunk": joined_chunk,
                            f"{self.name}_id": doc_id,
                            f"{self.name}_chunk_num": chunk_num,
                        }
                    )
                    results.append(result)

        return results, cost
+
+
+ + + +
+ + + + + + + + + + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.gather.GatherOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +

A class that implements a gather operation on input data, adding contextual information from surrounding chunks.

+

This class extends BaseOperation to: +1. Group chunks by their document ID. +2. Order chunks within each group. +3. Add peripheral context to each chunk based on the configuration. +4. Include headers for each chunk and its upward hierarchy. +5. Return results containing the rendered chunks with added context, including information about skipped characters and headers.

+ + + + + + +
+ Source code in docetl/operations/gather.py +
  6
+  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
class GatherOperation(BaseOperation):
    """
    A class that implements a gather operation on input data, adding contextual information from surrounding chunks.

    This class extends BaseOperation to:
    1. Group chunks by their document ID.
    2. Order chunks within each group.
    3. Add peripheral context to each chunk based on the configuration.
    4. Include headers for each chunk and its upward hierarchy.
    5. Return results containing the rendered chunks with added context, including information about skipped characters and headers.
    """

    class schema(BaseOperation.schema):
        type: str = "gather"
        content_key: str
        doc_id_key: str
        order_key: str
        peripheral_chunks: Dict[str, Any]
        doc_header_key: Optional[str] = None

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """
        Initialize the GatherOperation.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        super().__init__(*args, **kwargs)

    def syntax_check(self) -> None:
        """
        Perform a syntax check on the operation configuration.

        Raises:
            ValueError: If required keys are missing or if there are configuration errors.
            TypeError: If main_chunk_start or main_chunk_end are not strings.
        """
        required_keys = ["content_key", "doc_id_key", "order_key"]
        for key in required_keys:
            if key not in self.config:
                raise ValueError(
                    f"Missing required key '{key}' in GatherOperation configuration"
                )

        peripheral_config = self.config.get("peripheral_chunks", {})
        for direction in ["previous", "next"]:
            if direction not in peripheral_config:
                continue
            for section in ["head", "middle", "tail"]:
                if section in peripheral_config[direction]:
                    section_config = peripheral_config[direction][section]
                    # Only "head" and "tail" need an explicit chunk count.
                    if section != "middle" and "count" not in section_config:
                        raise ValueError(
                            f"Missing 'count' in {direction}.{section} configuration"
                        )

        if "main_chunk_start" in self.config and not isinstance(
            self.config["main_chunk_start"], str
        ):
            raise TypeError("'main_chunk_start' must be a string")
        if "main_chunk_end" in self.config and not isinstance(
            self.config["main_chunk_end"], str
        ):
            raise TypeError("'main_chunk_end' must be a string")

    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
        """
        Execute the gather operation on the input data.

        Every result is a shallow copy of its chunk with the rendered text
        stored under '{content_key}_rendered'.

        Args:
            input_data (List[Dict]): The input data to process.

        Returns:
            Tuple[List[Dict], float]: A tuple containing the processed results and the cost of the operation.
        """
        content_key = self.config["content_key"]
        doc_id_key = self.config["doc_id_key"]
        order_key = self.config["order_key"]
        peripheral_config = self.config.get("peripheral_chunks", {})
        main_chunk_start = self.config.get(
            "main_chunk_start", "--- Begin Main Chunk ---"
        )
        main_chunk_end = self.config.get("main_chunk_end", "--- End Main Chunk ---")
        doc_header_key = self.config.get("doc_header_key", None)
        results = []
        cost = 0.0

        # Group chunks by document ID
        grouped_chunks: Dict[Any, List[Dict]] = {}
        for item in input_data:
            grouped_chunks.setdefault(item[doc_id_key], []).append(item)

        # Process each group of chunks
        for chunks in grouped_chunks.values():
            # Sort chunks by their order within the document
            chunks.sort(key=lambda x: x[order_key])

            # Process each chunk with its peripheral context and headers
            for i, chunk in enumerate(chunks):
                rendered_chunk = self.render_chunk_with_context(
                    chunks,
                    i,
                    peripheral_config,
                    content_key,
                    order_key,
                    main_chunk_start,
                    main_chunk_end,
                    doc_header_key,
                )

                result = chunk.copy()
                result[f"{content_key}_rendered"] = rendered_chunk
                results.append(result)

        return results, cost

    def render_chunk_with_context(
        self,
        chunks: List[Dict],
        current_index: int,
        peripheral_config: Dict,
        content_key: str,
        order_key: str,
        main_chunk_start: str,
        main_chunk_end: str,
        doc_header_key: Optional[str],
    ) -> str:
        """
        Render a chunk with its peripheral context and headers.

        Args:
            chunks (List[Dict]): List of all chunks in the document.
            current_index (int): Index of the current chunk being processed.
            peripheral_config (Dict): Configuration for peripheral chunks.
            content_key (str): Key for the content in each chunk.
            order_key (str): Key for the order of each chunk.
            main_chunk_start (str): String to mark the start of the main chunk.
            main_chunk_end (str): String to mark the end of the main chunk.
            doc_header_key (Optional[str]): The key for the headers in the current chunk, if any.

        Returns:
            str: Rendered chunk with context and headers.
        """

        # If there are no peripheral chunks, return the main chunk
        if not peripheral_config:
            return chunks[current_index][content_key]

        combined_parts = ["--- Previous Context ---"]

        combined_parts.extend(
            self.process_peripheral_chunks(
                chunks[:current_index],
                peripheral_config.get("previous", {}),
                content_key,
                order_key,
            )
        )
        combined_parts.append("--- End Previous Context ---\n")

        # Process main chunk
        main_chunk = chunks[current_index]
        if headers := self.render_hierarchy_headers(
            main_chunk, chunks[: current_index + 1], doc_header_key
        ):
            combined_parts.append(headers)
        combined_parts.extend(
            (
                f"{main_chunk_start}",
                f"{main_chunk[content_key]}",
                f"{main_chunk_end}",
                "\n--- Next Context ---",
            )
        )
        combined_parts.extend(
            self.process_peripheral_chunks(
                chunks[current_index + 1 :],
                peripheral_config.get("next", {}),
                content_key,
                order_key,
            )
        )
        combined_parts.append("--- End Next Context ---")

        return "\n".join(combined_parts)

    def process_peripheral_chunks(
        self,
        chunks: List[Dict],
        config: Dict,
        content_key: str,
        order_key: str,
        reverse: bool = False,
    ) -> List[str]:
        """
        Process peripheral chunks according to the configuration.

        The first ``head.count`` chunks are rendered with the "head" section
        settings and the last ``tail.count`` with the "tail" settings. Chunks
        in between use the "middle" settings when that section is configured;
        otherwise they are omitted and replaced by a single
        "[... N characters skipped ...]" marker per contiguous run.

        Args:
            chunks (List[Dict]): List of chunks to process.
            config (Dict): Configuration for processing peripheral chunks.
            content_key (str): Key for the content in each chunk.
            order_key (str): Key for the order of each chunk.
            reverse (bool, optional): Whether to process chunks in reverse order. Defaults to False.

        Returns:
            List[str]: List of processed chunk strings.
        """
        if reverse:
            chunks = list(reversed(chunks))

        processed_parts = []
        total_chunks = len(chunks)

        head_config = config.get("head", {})
        tail_config = config.get("tail", {})

        head_count = int(head_config.get("count", 0))
        tail_count = int(tail_config.get("count", 0))
        in_skip = False
        skip_char_count = 0  # always 0 while not inside a skipped run

        for i, chunk in enumerate(chunks):
            if i < head_count:
                section = "head"
            elif i >= total_chunks - tail_count:
                section = "tail"
            elif "middle" in config:
                section = "middle"
            else:
                # Show number of characters skipped
                skip_char_count += len(chunk[content_key])
                in_skip = True
                continue

            # Close the skipped run before emitting the next included chunk.
            if in_skip:
                processed_parts.append(
                    f"[... {skip_char_count} characters skipped ...]"
                )
                in_skip = False
                skip_char_count = 0

            section_config = config.get(section, {})
            section_content_key = section_config.get("content_key", content_key)

            # A section that reads a different key is labelled as a summary.
            is_summary = section_content_key != content_key
            summary_suffix = " (Summary)" if is_summary else ""

            chunk_prefix = f"[Chunk {chunk[order_key]}{summary_suffix}]"
            processed_parts.extend((chunk_prefix, f"{chunk[section_content_key]}"))

        if in_skip:
            processed_parts.append(f"[... {skip_char_count} characters skipped ...]")

        if reverse:
            # NOTE(review): reversing the flat list also places each chunk's
            # content before its "[Chunk ...]" prefix - confirm this is the
            # intended layout before passing reverse=True.
            processed_parts = list(reversed(processed_parts))

        return processed_parts

    def render_hierarchy_headers(
        self,
        current_chunk: Dict,
        chunks: List[Dict],
        doc_header_key: Optional[str],
    ) -> str:
        """
        Render headers for the current chunk's hierarchy.

        Headers are replayed over all ``chunks`` in order to find the latest
        header at each level. Only levels above (numerically smaller than)
        the top-most header level of ``current_chunk`` itself are rendered;
        when the current chunk has no headers, all known levels are rendered.

        Args:
            current_chunk (Dict): The current chunk being processed.
            chunks (List[Dict]): List of chunks up to and including the current chunk.
            doc_header_key (Optional[str]): The key for the headers in the current chunk.
        Returns:
            str: Rendered headers in the current chunk's hierarchy, or "" when
              there is nothing to render.
        """
        if doc_header_key is None:
            return ""

        current_hierarchy = {}

        # Find the largest/highest level (smallest number) in the current
        # chunk; None when the chunk declares no header levels.
        highest_level = min(
            (
                header_info.get("level")
                for header_info in current_chunk.get(doc_header_key, [])
                if header_info.get("level") is not None
            ),
            default=None,
        )

        for chunk in chunks:
            for header_info in chunk.get(doc_header_key, []):
                header = header_info["header"]
                level = header_info["level"]
                if header and level:
                    current_hierarchy[level] = header
                    # Clear lower levels when a higher level header is found.
                    # Every deeper level is invalidated, also when the levels
                    # seen so far are not contiguous (e.g. 1 and 3).
                    for known_level in current_hierarchy:
                        if known_level > level:
                            current_hierarchy[known_level] = None

        rendered_headers = [
            f"{'#' * level} {header}"
            for level, header in sorted(current_hierarchy.items())
            if header is not None and (highest_level is None or level < highest_level)
        ]
        rendered_headers = " > ".join(rendered_headers)
        return f"_Current Section:_ {rendered_headers}" if rendered_headers else ""
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(*args, **kwargs) + +

+ + +
+ +

Initialize the GatherOperation.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ *args + + Any + +
+

Variable length argument list.

+
+
+ () +
+ **kwargs + + Any + +
+

Arbitrary keyword arguments.

+
+
+ {} +
+ +
+ Source code in docetl/operations/gather.py +
26
+27
+28
+29
+30
+31
+32
+33
+34
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """
    Construct the GatherOperation.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initializer; no extra state is set up here.

    Args:
        *args: Positional arguments passed through to the parent initializer.
        **kwargs: Keyword arguments passed through to the parent initializer.
    """
    super().__init__(*args, **kwargs)
+
+
+
+ +
+ +
+ + +

+ execute(input_data) + +

+ + +
+ +

Execute the gather operation on the input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ input_data + + List[Dict] + +
+

The input data to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ Tuple[List[Dict], float] + +
+

Tuple[List[Dict], float]: A tuple containing the processed results and the cost of the operation.

+
+
+ +
+ Source code in docetl/operations/gather.py +
 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
    """
    Render every chunk together with its surrounding document context.

    Chunks are bucketed by document id (documents keep their first-seen
    order), sorted inside each bucket by the order key, and each chunk is
    shallow-copied with an extra ``"<content_key>_rendered"`` entry that
    holds the text produced by ``render_chunk_with_context``.

    Args:
        input_data (List[Dict]): Chunk records to process. Each record is
            indexed with the configured ``doc_id_key`` and ``order_key``.

    Returns:
        Tuple[List[Dict], float]: The augmented chunk copies and the cost of
        the operation, which is always 0.0 here.
    """
    settings = self.config
    content_key = settings["content_key"]
    doc_id_key = settings["doc_id_key"]
    order_key = settings["order_key"]
    peripheral_config = settings.get("peripheral_chunks", {})
    main_chunk_start = settings.get("main_chunk_start", "--- Begin Main Chunk ---")
    main_chunk_end = settings.get("main_chunk_end", "--- End Main Chunk ---")
    doc_header_key = settings.get("doc_header_key", None)

    # Bucket the records per document; dict insertion order keeps documents
    # in the order they were first encountered.
    per_document = {}
    for record in input_data:
        per_document.setdefault(record[doc_id_key], []).append(record)

    rendered_key = f"{content_key}_rendered"
    output = []
    for doc_chunks in per_document.values():
        # Put the chunks of this document into reading order.
        doc_chunks.sort(key=lambda entry: entry[order_key])

        for position, chunk in enumerate(doc_chunks):
            rendered_text = self.render_chunk_with_context(
                doc_chunks,
                position,
                peripheral_config,
                content_key,
                order_key,
                main_chunk_start,
                main_chunk_end,
                doc_header_key,
            )
            # Copy so the caller's input records are left untouched.
            enriched = chunk.copy()
            enriched[rendered_key] = rendered_text
            output.append(enriched)

    return output, 0.0
+
+
+
+ +
+ +
+ + +

+ process_peripheral_chunks(chunks, config, content_key, order_key, reverse=False) + +

+ + +
+ +

Process peripheral chunks according to the configuration.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ chunks + + List[Dict] + +
+

List of chunks to process.

+
+
+ required +
+ config + + Dict + +
+

Configuration for processing peripheral chunks.

+
+
+ required +
+ content_key + + str + +
+

Key for the content in each chunk.

+
+
+ required +
+ order_key + + str + +
+

Key for the order of each chunk.

+
+
+ required +
+ reverse + + bool + +
+

Whether to process chunks in reverse order. Defaults to False.

+
+
+ False +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ List[str] + +
+

List[str]: List of processed chunk strings.

+
+
+ +
+ Source code in docetl/operations/gather.py +
196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
def process_peripheral_chunks(
    self,
    chunks: List[Dict],
    config: Dict,
    content_key: str,
    order_key: str,
    reverse: bool = False,
) -> List[str]:
    """
    Process peripheral chunks according to the configuration.

    The first ``head.count`` chunks are rendered as the "head" section and
    the last ``tail.count`` chunks as the "tail" section. Chunks in between
    are rendered as "middle" when the config has a ``"middle"`` entry;
    otherwise they are dropped and replaced by a single
    ``[... N characters skipped ...]`` marker per contiguous run, where N is
    the summed length of the skipped chunks' ``content_key`` values.

    A section may name its own ``content_key``; when it differs from the
    main ``content_key`` the chunk label is suffixed with ``" (Summary)"``.

    Args:
        chunks (List[Dict]): List of chunks to process.
        config (Dict): Configuration for processing peripheral chunks
            (optional ``"head"``, ``"middle"`` and ``"tail"`` entries).
        content_key (str): Key for the content in each chunk.
        order_key (str): Key for the order of each chunk.
        reverse (bool, optional): When True, head/tail are counted from the
            end of ``chunks`` and the result is returned in the original
            chunk order. Defaults to False.

    Returns:
        List[str]: Flat list of output lines; every rendered chunk
        contributes its ``[Chunk N]`` label followed by its content.
    """
    ordered = list(reversed(chunks)) if reverse else chunks
    total_chunks = len(ordered)

    head_count = int(config.get("head", {}).get("count", 0))
    tail_count = int(config.get("tail", {}).get("count", 0))

    # Lines are collected per chunk (or per skip marker) so that undoing the
    # reversal below keeps each label directly in front of its own content.
    # Reversing a flat list of lines would put the content before the label.
    groups: List[List[str]] = []
    in_skip = False
    skip_char_count = 0

    for i, chunk in enumerate(ordered):
        if i < head_count:
            section = "head"
        elif i >= total_chunks - tail_count:
            section = "tail"
        elif "middle" in config:
            section = "middle"
        else:
            # Not rendered: only remember how many characters were dropped.
            skip_char_count += len(chunk[content_key])
            in_skip = True
            continue

        if in_skip:
            # A run of skipped chunks just ended; emit one marker for it.
            groups.append([f"[... {skip_char_count} characters skipped ...]"])
            in_skip = False
            skip_char_count = 0

        section_content_key = config.get(section, {}).get("content_key", content_key)
        summary_suffix = " (Summary)" if section_content_key != content_key else ""

        groups.append(
            [
                f"[Chunk {chunk[order_key]}{summary_suffix}]",
                f"{chunk[section_content_key]}",
            ]
        )

    if in_skip:
        # The input ended inside a skipped run.
        groups.append([f"[... {skip_char_count} characters skipped ...]"])

    if reverse:
        groups.reverse()

    return [line for group in groups for line in group]
+
+
+
+ +
+ +
+ + +

+ render_chunk_with_context(chunks, current_index, peripheral_config, content_key, order_key, main_chunk_start, main_chunk_end, doc_header_key) + +

+ + +
+ +

Render a chunk with its peripheral context and headers.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ chunks + + List[Dict] + +
+

List of all chunks in the document.

+
+
+ required +
+ current_index + + int + +
+

Index of the current chunk being processed.

+
+
+ required +
+ peripheral_config + + Dict + +
+

Configuration for peripheral chunks.

+
+
+ required +
+ content_key + + str + +
+

Key for the content in each chunk.

+
+
+ required +
+ order_key + + str + +
+

Key for the order of each chunk.

+
+
+ required +
+ main_chunk_start + + str + +
+

String to mark the start of the main chunk.

+
+
+ required +
+ main_chunk_end + + str + +
+

String to mark the end of the main chunk.

+
+
+ required +
+ doc_header_key + + str + +
+

The key for the headers in the current chunk.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
Name TypeDescription
str + str + +
+

Rendered chunk with context and headers.

+
+
+ +
+ Source code in docetl/operations/gather.py +
126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
def render_chunk_with_context(
    self,
    chunks: List[Dict],
    current_index: int,
    peripheral_config: Dict,
    content_key: str,
    order_key: str,
    main_chunk_start: str,
    main_chunk_end: str,
    doc_header_key: str,
) -> str:
    """
    Build the text for one chunk, framed by its previous and next context.

    With an empty ``peripheral_config`` the chunk's own content is returned
    unchanged. Otherwise the result is a newline-joined sequence of: the
    previous-context block, an optional hierarchy-header line, the main
    chunk wrapped in its start/end markers, and the next-context block.

    Args:
        chunks (List[Dict]): List of all chunks in the document.
        current_index (int): Index of the current chunk being processed.
        peripheral_config (Dict): Configuration for peripheral chunks; its
            ``"previous"`` and ``"next"`` entries are passed on to
            ``process_peripheral_chunks``.
        content_key (str): Key for the content in each chunk.
        order_key (str): Key for the order of each chunk.
        main_chunk_start (str): String to mark the start of the main chunk.
        main_chunk_end (str): String to mark the end of the main chunk.
        doc_header_key (str): The key for the headers in the current chunk.

    Returns:
        str: Rendered chunk with context and headers.
    """
    # Nothing to frame the chunk with: hand back the raw content.
    if not peripheral_config:
        return chunks[current_index][content_key]

    before = self.process_peripheral_chunks(
        chunks[:current_index],
        peripheral_config.get("previous", {}),
        content_key,
        order_key,
    )
    lines = ["--- Previous Context ---", *before, "--- End Previous Context ---\n"]

    # Main chunk, optionally preceded by the section headers it sits under.
    focus = chunks[current_index]
    hierarchy_line = self.render_hierarchy_headers(
        focus, chunks[: current_index + 1], doc_header_key
    )
    if hierarchy_line:
        lines.append(hierarchy_line)
    lines += [
        f"{main_chunk_start}",
        f"{focus[content_key]}",
        f"{main_chunk_end}",
        "\n--- Next Context ---",
    ]

    after = self.process_peripheral_chunks(
        chunks[current_index + 1 :],
        peripheral_config.get("next", {}),
        content_key,
        order_key,
    )
    lines.extend(after)
    lines.append("--- End Next Context ---")

    return "\n".join(lines)
+
+
+
+ +
+ +
+ + +

+ render_hierarchy_headers(current_chunk, chunks, doc_header_key) + +

+ + +
+ +

Render headers for the current chunk's hierarchy.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ current_chunk + + Dict + +
+

The current chunk being processed.

+
+
+ required +
+ chunks + + List[Dict] + +
+

List of chunks up to and including the current chunk.

+
+
+ required +
+ doc_header_key + + str + +
+

The key for the headers in the current chunk.

+
+
+ required +
+

Returns: + str: Rendered headers in the current chunk's hierarchy.

+ +
+ Source code in docetl/operations/gather.py +
275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
def render_hierarchy_headers(
    self,
    current_chunk: Dict,
    chunks: List[Dict],
    doc_header_key: str,
) -> str:
    """
    Render headers for the current chunk's hierarchy.

    The header entries of ``chunks`` are replayed in order to work out which
    header is active at each level. Only levels strictly above (numerically
    smaller than) the shallowest header found in ``current_chunk`` are
    rendered; if the current chunk has no headers, every active level is
    rendered.

    Args:
        current_chunk (Dict): The current chunk being processed.
        chunks (List[Dict]): List of chunks up to and including the current chunk.
        doc_header_key (str): The key for the headers in the current chunk.
            When None, no headers are rendered.
    Returns:
        str: Rendered headers in the current chunk's hierarchy, formatted as
        ``"_Current Section:_ # H1 > ## H2"``, or an empty string when there
        is nothing to render.
    """
    if doc_header_key is None:
        return ""

    # Shallowest (numerically smallest) header level in the current chunk,
    # or None when the chunk carries no header with a level.
    own_levels = [
        header_info.get("level")
        for header_info in current_chunk.get(doc_header_key, [])
        if header_info.get("level") is not None
    ]
    highest_level = min(own_levels) if own_levels else None

    current_hierarchy = {}
    for chunk in chunks:
        for header_info in chunk.get(doc_header_key, []):
            # Read with .get, matching the scan above, so an entry without
            # "header" or "level" is ignored rather than raising KeyError.
            header = header_info.get("header")
            level = header_info.get("level")
            if header and level:
                current_hierarchy[level] = header
                # A new header invalidates every deeper level seen so far.
                # Compare the stored keys directly: levels may be sparse
                # (e.g. 1 and 3), so a range bounded by the dict size could
                # stop short of a stale deeper entry.
                for deeper in [lvl for lvl in current_hierarchy if lvl > level]:
                    current_hierarchy[deeper] = None

    rendered_headers = [
        f"{'#' * level} {header}"
        for level, header in sorted(current_hierarchy.items())
        if header is not None and (highest_level is None or level < highest_level)
    ]
    rendered_headers = " > ".join(rendered_headers)
    return f"_Current Section:_ {rendered_headers}" if rendered_headers else ""
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Perform a syntax check on the operation configuration.

+ + +

Raises:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If required keys are missing or if there are configuration errors.

+
+
+ TypeError + +
+

If main_chunk_start or main_chunk_end are not strings.

+
+
+ +
+ Source code in docetl/operations/gather.py +
36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
def syntax_check(self) -> None:
    """
    Validate the operation configuration before it is executed.

    Checks that the three mandatory keys are present, that every configured
    ``head``/``tail`` section of ``peripheral_chunks`` declares a ``count``,
    and that the optional main-chunk markers are strings.

    Raises:
        ValueError: If a required key is missing, or a ``head``/``tail``
            section lacks ``count``.
        TypeError: If main_chunk_start or main_chunk_end are not strings.
    """
    for required in ("content_key", "doc_id_key", "order_key"):
        if required not in self.config:
            raise ValueError(
                f"Missing required key '{required}' in GatherOperation configuration"
            )

    peripheral = self.config.get("peripheral_chunks", {})
    for direction in ("previous", "next"):
        if direction not in peripheral:
            continue
        direction_config = peripheral[direction]
        # "middle" has no mandatory fields, so only head and tail are checked.
        for section in ("head", "tail"):
            if section in direction_config and "count" not in direction_config[section]:
                raise ValueError(
                    f"Missing 'count' in {direction}.{section} configuration"
                )

    for marker in ("main_chunk_start", "main_chunk_end"):
        if marker in self.config and not isinstance(self.config[marker], str):
            raise TypeError(f"'{marker}' must be a string")
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.operations.unnest.UnnestOperation + + +

+ + +
+

+ Bases: BaseOperation

+ + +

A class that represents an operation to unnest a list-like or dictionary value in a dictionary into multiple dictionaries.

+

This operation takes a list of dictionaries and a specified key, and creates new dictionaries based on the value type: +- For list-like values: Creates a new dictionary for each element in the list, copying all other key-value pairs. +- For dictionary values: Expands specified fields from the nested dictionary into the parent dictionary.

+ + +
+ Inherits from +

BaseOperation

+

Usage: +

from docetl.operations import UnnestOperation
+
+# Unnesting a list
+config_list = {"unnest_key": "tags"}
+input_data_list = [
+    {"id": 1, "tags": ["a", "b", "c"]},
+    {"id": 2, "tags": ["d", "e"]}
+]
+
+unnest_op_list = UnnestOperation(config_list)
+result_list, _ = unnest_op_list.execute(input_data_list)
+
+# Result will be:
+# [
+#     {"id": 1, "tags": "a"},
+#     {"id": 1, "tags": "b"},
+#     {"id": 1, "tags": "c"},
+#     {"id": 2, "tags": "d"},
+#     {"id": 2, "tags": "e"}
+# ]
+
+# Unnesting a dictionary
+config_dict = {"unnest_key": "user", "expand_fields": ["name", "age"]}
+input_data_dict = [
+    {"id": 1, "user": {"name": "Alice", "age": 30, "email": "alice@example.com"}},
+    {"id": 2, "user": {"name": "Bob", "age": 25, "email": "bob@example.com"}}
+]
+
+unnest_op_dict = UnnestOperation(config_dict)
+result_dict, _ = unnest_op_dict.execute(input_data_dict)
+
+# Result will be:
+# [
+#     {"id": 1, "name": "Alice", "age": 30, "user": {"name": "Alice", "age": 30, "email": "alice@example.com"}},
+#     {"id": 2, "name": "Bob", "age": 25, "user": {"name": "Bob", "age": 25, "email": "bob@example.com"}}
+# ]
+

+ + + + + + +
+ Source code in docetl/operations/unnest.py +
  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
+ 18
+ 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
class UnnestOperation(BaseOperation):
    """
    A class that represents an operation to unnest a list-like or dictionary value in a dictionary into multiple dictionaries.

    This operation takes a list of dictionaries and a specified key, and creates new dictionaries based on the value type:
    - For list-like values: Creates a new dictionary for each element in the list, copying all other key-value pairs.
    - For dictionary values: Expands specified fields from the nested dictionary into the parent dictionary.
      When 'expand_fields' is not configured, every key of the nested dictionary is expanded.

    Inherits from:
        BaseOperation

    Usage:
    ```python
    from docetl.operations import UnnestOperation

    # Unnesting a list
    config_list = {"unnest_key": "tags"}
    input_data_list = [
        {"id": 1, "tags": ["a", "b", "c"]},
        {"id": 2, "tags": ["d", "e"]}
    ]

    unnest_op_list = UnnestOperation(config_list)
    result_list, _ = unnest_op_list.execute(input_data_list)

    # Result will be:
    # [
    #     {"id": 1, "tags": "a"},
    #     {"id": 1, "tags": "b"},
    #     {"id": 1, "tags": "c"},
    #     {"id": 2, "tags": "d"},
    #     {"id": 2, "tags": "e"}
    # ]

    # Unnesting a dictionary
    config_dict = {"unnest_key": "user", "expand_fields": ["name", "age"]}
    input_data_dict = [
        {"id": 1, "user": {"name": "Alice", "age": 30, "email": "alice@example.com"}},
        {"id": 2, "user": {"name": "Bob", "age": 25, "email": "bob@example.com"}}
    ]

    unnest_op_dict = UnnestOperation(config_dict)
    result_dict, _ = unnest_op_dict.execute(input_data_dict)

    # Result will be:
    # [
    #     {"id": 1, "name": "Alice", "age": 30, "user": {"name": "Alice", "age": 30, "email": "alice@example.com"}},
    #     {"id": 2, "name": "Bob", "age": 25, "user": {"name": "Bob", "age": 25, "email": "bob@example.com"}}
    # ]
    ```
    """

    class schema(BaseOperation.schema):
        type: str = "unnest"
        unnest_key: str
        keep_empty: Optional[bool] = None
        expand_fields: Optional[List[str]] = None
        recursive: Optional[bool] = None
        depth: Optional[int] = None

    def syntax_check(self) -> None:
        """
        Checks if the required configuration key is present in the operation's config.

        Raises:
            ValueError: If the required 'unnest_key' is missing from the configuration.
        """

        required_keys = ["unnest_key"]
        for key in required_keys:
            if key not in self.config:
                raise ValueError(
                    f"Missing required key '{key}' in UnnestOperation configuration"
                )

    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
        """
        Executes the unnest operation on the input data.

        Args:
            input_data (List[Dict]): A list of dictionaries to process.

        Returns:
            Tuple[List[Dict], float]: A tuple containing the processed list of dictionaries
            and a cost value (always 0 in this implementation).

        Raises:
            KeyError: If the specified unnest_key is not found in an input dictionary.
            TypeError: If the value of the unnest_key is not iterable (list, tuple, set, or dict).
            AssertionError: If the first result is missing a key of the first input item.

        The operation supports unnesting of both list-like values and dictionary values:

        1. For list-like values (list, tuple, set):
           Each element in the list becomes a separate dictionary in the output.
           With 'recursive' enabled, nested list-like elements are unnested again,
           up to 'depth' levels (unbounded when 'depth' is not set).

        2. For dictionary values:
           The operation expands specified fields from the nested dictionary into the parent dictionary.
           The 'expand_fields' config parameter selects which fields to expand; when it is
           omitted, every key of the nested dictionary is expanded. A listed field that is
           absent from the nested dictionary is set to None.

        With 'keep_empty' enabled, an input whose list-like value is empty still yields one
        output dictionary, with the unnest key set to None.

        Examples:
        ```python
        # Unnesting a list
        unnest_op = UnnestOperation({"unnest_key": "colors"})
        input_data = [
            {"id": 1, "colors": ["red", "blue"]},
            {"id": 2, "colors": ["green"]}
        ]
        result, _ = unnest_op.execute(input_data)
        # Result will be:
        # [
        #     {"id": 1, "colors": "red"},
        #     {"id": 1, "colors": "blue"},
        #     {"id": 2, "colors": "green"}
        # ]

        # Unnesting a dictionary
        unnest_op = UnnestOperation({"unnest_key": "details", "expand_fields": ["color", "size"]})
        input_data = [
            {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}},
            {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}}
        ]
        result, _ = unnest_op.execute(input_data)
        # Result will be:
        # [
        #     {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}, "color": "red", "size": "large"},
        #     {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}, "color": "blue", "size": "medium"}
        # ]
        ```

        Note: When unnesting dictionaries, the original nested dictionary is preserved in the output,
        and the specified fields are expanded into the parent dictionary.
        """

        unnest_key = self.config["unnest_key"]
        recursive = self.config.get("recursive", False)
        depth = self.config.get("depth", None)
        if not depth:
            # No explicit depth: a single level, or unbounded when recursive.
            depth = 1 if not recursive else float("inf")
        keep_empty = self.config.get("keep_empty", False)
        results = []

        def unnest_recursive(item, key, level=0):
            if level == 0 and not isinstance(item[key], (list, tuple, set, dict)):
                raise TypeError(f"Value of unnest key '{key}' is not iterable")

            if level > 0 and not isinstance(item[key], (list, tuple, set, dict)):
                return [item]

            if level >= depth:
                return [item]

            if isinstance(item[key], dict):
                expand_fields = self.config.get("expand_fields")
                if expand_fields is None:
                    # Default to expanding every key of the nested dictionary.
                    expand_fields = item[key].keys()
                new_item = copy.deepcopy(item)
                for field in expand_fields:
                    if field in new_item[key]:
                        new_item[field] = new_item[key][field]
                    else:
                        new_item[field] = None
                return [new_item]
            else:
                nested_results = []
                for value in item[key]:
                    new_item = copy.deepcopy(item)
                    new_item[key] = value
                    if recursive and isinstance(value, (list, tuple, set, dict)):
                        nested_results.extend(
                            unnest_recursive(new_item, key, level + 1)
                        )
                    else:
                        nested_results.append(new_item)
                return nested_results

        for item in input_data:
            if unnest_key not in item:
                raise KeyError(
                    f"Unnest key '{unnest_key}' not found in item. Other keys are {item.keys()}"
                )

            expanded = unnest_recursive(item, unnest_key)
            results.extend(expanded)

            # Keep a placeholder row only when unnesting produced nothing,
            # i.e. for an empty list-like value. An empty dictionary already
            # yields one row from the dictionary branch above, so adding a
            # placeholder for it as well would duplicate that row.
            if not expanded and keep_empty:
                new_item = copy.deepcopy(item)
                new_item[unnest_key] = None
                results.append(new_item)

        # Sanity check that no keys are missing after the operation
        # (note: assert statements are skipped when Python runs with -O).
        if results:
            original_keys = set(input_data[0].keys())
            assert original_keys.issubset(
                set(results[0].keys())
            ), "Keys lost during unnest operation"

        return results, 0
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ execute(input_data) + +

+ + +
+ +

Executes the unnest operation on the input data.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ input_data + + List[Dict] + +
+

A list of dictionaries to process.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ List[Dict] + +
+

Tuple[List[Dict], float]: A tuple containing the processed list of dictionaries

+
+
+ float + +
+

and a float value (always 0 in this implementation).

+
+
+ + +

Raises:

+ + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ KeyError + +
+

If the specified unnest_key is not found in an input dictionary.

+
+
+ TypeError + +
+

If the value of the unnest_key is not iterable (list, tuple, set, or dict).

+
+
+ ValueError + +
+

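Documented for unnesting a dictionary when 'expand_fields' is not provided in the config; note that the implementation listed on this page does not raise in that case and instead expands every key of the nested dictionary.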

+
+
+

The operation supports unnesting of both list-like values and dictionary values:

+
    +
  1. +

    For list-like values (list, tuple, set): + Each element in the list becomes a separate dictionary in the output.

    +
  2. +
  3. +

    For dictionary values: + The operation expands specified fields from the nested dictionary into the parent dictionary. + The 'expand_fields' config parameter selects which fields to expand; when it is omitted, every key of the nested dictionary is expanded.

    +
  4. +
+

Examples: +

# Unnesting a list
+unnest_op = UnnestOperation({"unnest_key": "colors"})
+input_data = [
+    {"id": 1, "colors": ["red", "blue"]},
+    {"id": 2, "colors": ["green"]}
+]
+result, _ = unnest_op.execute(input_data)
+# Result will be:
+# [
+#     {"id": 1, "colors": "red"},
+#     {"id": 1, "colors": "blue"},
+#     {"id": 2, "colors": "green"}
+# ]
+
+# Unnesting a dictionary
+unnest_op = UnnestOperation({"unnest_key": "details", "expand_fields": ["color", "size"]})
+input_data = [
+    {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}},
+    {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}}
+]
+result, _ = unnest_op.execute(input_data)
+# Result will be:
+# [
+#     {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}, "color": "red", "size": "large"},
+#     {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}, "color": "blue", "size": "medium"}
+# ]
+

+

Note: When unnesting dictionaries, the original nested dictionary is preserved in the output, +and the specified fields are expanded into the parent dictionary.

+ +
+ Source code in docetl/operations/unnest.py +
 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
    """
    Executes the unnest operation on the input data.

    Configuration keys read from ``self.config``:
        unnest_key: Key whose value is unnested (required).
        recursive: If truthy, nested list-like elements are unnested again.
        depth: Maximum recursion level. A missing or falsy value means 1,
            or unlimited when ``recursive`` is set.
        expand_fields: Fields to copy out of a nested dictionary. When
            omitted, every key of the nested dictionary is expanded.
        keep_empty: If truthy, an item whose list-like value is empty is
            kept in the output with the unnest key set to None.

    Args:
        input_data (List[Dict]): A list of dictionaries to process.

    Returns:
        Tuple[List[Dict], float]: A tuple containing the processed list of dictionaries
        and a cost value (always 0 in this implementation).

    Raises:
        KeyError: If the specified unnest_key is not found in an input dictionary.
        TypeError: If the value of the unnest_key is not a list, tuple, set, or dict.
        AssertionError: If the first result lacks a key that the first input had.

    The operation supports unnesting of both list-like values and dictionary values:

    1. For list-like values (list, tuple, set):
       Each element in the list becomes a separate dictionary in the output.

    2. For dictionary values:
       The selected fields are copied from the nested dictionary into the
       parent dictionary. A selected field that is absent from the nested
       dictionary is set to None. The nested dictionary itself is preserved.

    Examples:
    ```python
    # Unnesting a list
    unnest_op = UnnestOperation({"unnest_key": "colors"})
    input_data = [
        {"id": 1, "colors": ["red", "blue"]},
        {"id": 2, "colors": ["green"]}
    ]
    result, _ = unnest_op.execute(input_data)
    # Result will be:
    # [
    #     {"id": 1, "colors": "red"},
    #     {"id": 1, "colors": "blue"},
    #     {"id": 2, "colors": "green"}
    # ]

    # Unnesting a dictionary
    unnest_op = UnnestOperation({"unnest_key": "details", "expand_fields": ["color", "size"]})
    input_data = [
        {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}},
        {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}}
    ]
    result, _ = unnest_op.execute(input_data)
    # Result will be:
    # [
    #     {"id": 1, "details": {"color": "red", "size": "large", "stock": 5}, "color": "red", "size": "large"},
    #     {"id": 2, "details": {"color": "blue", "size": "medium", "stock": 3}, "color": "blue", "size": "medium"}
    # ]
    ```
    """

    unnest_key = self.config["unnest_key"]
    recursive = self.config.get("recursive", False)
    expand_fields = self.config.get("expand_fields")
    keep_empty = self.config.get("keep_empty", False)
    depth = self.config.get("depth", None)
    if not depth:
        depth = float("inf") if recursive else 1
    unnestable = (list, tuple, set, dict)
    results = []

    def unnest_recursive(item, key, level=0):
        value = item[key]
        if not isinstance(value, unnestable):
            # Only the top-level value is required to be unnestable; deeper
            # scalars are simply the leaves of the recursion.
            if level == 0:
                raise TypeError(f"Value of unnest key '{key}' is not iterable")
            return [item]

        if level >= depth:
            return [item]

        if isinstance(value, dict):
            fields = value.keys() if expand_fields is None else expand_fields
            new_item = copy.deepcopy(item)
            for field in fields:
                if field in new_item[key]:
                    new_item[field] = new_item[key][field]
                else:
                    new_item[field] = None
            return [new_item]

        nested_results = []
        for element in value:
            new_item = copy.deepcopy(item)
            new_item[key] = element
            if recursive and isinstance(element, unnestable):
                nested_results.extend(unnest_recursive(new_item, key, level + 1))
            else:
                nested_results.append(new_item)
        return nested_results

    for item in input_data:
        if unnest_key not in item:
            raise KeyError(
                f"Unnest key '{unnest_key}' not found in item. Other keys are {item.keys()}"
            )

        results.extend(unnest_recursive(item, unnest_key))

        # An empty list-like value yields no rows above, so keep_empty adds a
        # placeholder row. An empty dict already yielded exactly one row with
        # the expanded fields set to None; adding another would duplicate it.
        value = item[unnest_key]
        if keep_empty and not value and not isinstance(value, dict):
            placeholder = copy.deepcopy(item)
            placeholder[unnest_key] = None
            results.append(placeholder)

    # Assert that no keys are missing after the operation
    if results:
        original_keys = set(input_data[0].keys())
        assert original_keys.issubset(
            set(results[0].keys())
        ), "Keys lost during unnest operation"

    return results, 0
+
+
+
+ +
+ +
+ + +

+ syntax_check() + +

+ + +
+ +

Checks if the required configuration key is present in the operation's config.

+ + +

Raises:

+ + + + + + + + + + + + + +
TypeDescription
+ ValueError + +
+

If the required 'unnest_key' is missing from the configuration.

+
+
+ +
+ Source code in docetl/operations/unnest.py +
67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
def syntax_check(self) -> None:
    """
    Validate that the operation's config defines every mandatory key.

    Raises:
        ValueError: If 'unnest_key' is absent from the configuration.
    """

    mandatory = ("unnest_key",)
    # Report the first mandatory key that the config does not define.
    absent = next((name for name in mandatory if name not in self.config), None)
    if absent is not None:
        raise ValueError(
            f"Missing required key '{absent}' in UnnestOperation configuration"
        )
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/api-reference/optimizers/index.html b/api-reference/optimizers/index.html new file mode 100644 index 00000000..fe7ba6f2 --- /dev/null +++ b/api-reference/optimizers/index.html @@ -0,0 +1,11674 @@ + + + + + + + + + + + + + + + + + + + + + + + Optimizers - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Optimizers

+ +
+ + + +

+ docetl.optimizers.map_optimizer.optimizer.MapOptimizer + + +

+ + +
+ + +

A class for optimizing map operations in data processing pipelines.

+

This optimizer analyzes the input operation configuration and data, +and generates optimized plans for executing the operation. It can +create plans for chunking, metadata extraction, gleaning, chain +decomposition, and parallel execution.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
config + Dict[str, Any] + +
+

The configuration dictionary for the optimizer.

+
+
console + Console + +
+

A Rich console object for pretty printing.

+
+
llm_client + LLMClient + +
+

A client for interacting with a language model.

+
+
_run_operation + Callable + +
+

A function to execute operations.

+
+
max_threads + int + +
+

The maximum number of threads to use for parallel execution.

+
+
timeout + int + +
+

The timeout in seconds for operation execution.

+
+
+ + + + + + +
+ Source code in docetl/optimizers/map_optimizer/optimizer.py +
 19
+ 20
+ 21
+ 22
+ 23
+ 24
+ 25
+ 26
+ 27
+ 28
+ 29
+ 30
+ 31
+ 32
+ 33
+ 34
+ 35
+ 36
+ 37
+ 38
+ 39
+ 40
+ 41
+ 42
+ 43
+ 44
+ 45
+ 46
+ 47
+ 48
+ 49
+ 50
+ 51
+ 52
+ 53
+ 54
+ 55
+ 56
+ 57
+ 58
+ 59
+ 60
+ 61
+ 62
+ 63
+ 64
+ 65
+ 66
+ 67
+ 68
+ 69
+ 70
+ 71
+ 72
+ 73
+ 74
+ 75
+ 76
+ 77
+ 78
+ 79
+ 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
class MapOptimizer:
    """
    A class for optimizing map operations in data processing pipelines.

    This optimizer analyzes the input operation configuration and data,
    and generates optimized plans for executing the operation. It can
    create plans for chunking, metadata extraction, gleaning, chain
    decomposition, and parallel execution.

    Attributes:
        runner: The runner object the optimizer was created from.
        config (Dict[str, Any]): The configuration dictionary, taken from the runner.
        console (Console): The runner's console, used for logging and status updates.
        llm_client (LLMClient): A client for interacting with a language model.
        _run_operation (Callable): A function to execute operations.
        max_threads (int): The maximum number of threads to use for parallel execution.
        timeout (int): The timeout in seconds, read from ``runner.optimizer.timeout``.
        is_filter (bool): Whether the operation being optimized is a filter.
        k_to_pairwise_compare (int): How many top-scoring plans enter pairwise comparison.

    """

    def __init__(
        self,
        runner,
        run_operation: Callable,
        timeout: int = 10,
        is_filter: bool = False,
        depth: int = 1,
    ):
        """
        Initialize the MapOptimizer.

        Args:
            runner (Runner): The runner object.
            run_operation (Callable): A function to execute operations.
            timeout (int, optional): Accepted for interface compatibility but not
                used; the effective timeout is ``runner.optimizer.timeout``.
                Defaults to 10.
            is_filter (bool, optional): If True, the operation is a filter operation. Defaults to False.
            depth (int, optional): Forwarded unchanged to the PlanGenerator. Defaults to 1.
        """
        self.runner = runner
        self.config = runner.config
        self.console = runner.console
        self.llm_client = runner.optimizer.llm_client
        self._run_operation = run_operation
        self.max_threads = runner.max_threads
        self.timeout = runner.optimizer.timeout
        self._num_plans_to_evaluate_in_parallel = 5
        self.is_filter = is_filter
        self.k_to_pairwise_compare = 6

        self.plan_generator = PlanGenerator(
            runner,
            self.llm_client,
            self.console,
            self.config,
            run_operation,
            self.max_threads,
            is_filter,
            depth,
        )
        self.evaluator = Evaluator(
            self.llm_client,
            self.console,
            self._run_operation,
            self.timeout,
            self._num_plans_to_evaluate_in_parallel,
            self.is_filter,
        )
        self.prompt_generator = PromptGenerator(
            self.runner,
            self.llm_client,
            self.console,
            self.config,
            self.max_threads,
            self.is_filter,
        )

    def should_optimize(
        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
    ) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Determine if the given operation configuration should be optimized.

        Args:
            op_config (Dict[str, Any]): The configuration of the operation to assess.
            input_data (List[Dict[str, Any]]): Sample input data for the operation.

        Returns:
            Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]: The assessment
            text (an empty string when no optimization is needed), the sample
            inputs as returned by the helper (a copy with ``_map_opt_id`` added),
            and the outputs of running the operation on them.
        """
        (
            input_data,
            output_data,
            _,
            _,
            validator_prompt,
            assessment,
            data_exceeds_limit,
        ) = self._should_optimize_helper(op_config, input_data)
        if data_exceeds_limit or assessment.get("needs_improvement", True):
            assessment_str = (
                "\n".join(assessment.get("reasons", []))
                + "\n\nHere are some improvements that may help:\n"
                + "\n".join(assessment.get("improvements", []))
            )
            if data_exceeds_limit:
                assessment_str += "\nAlso, the input data exceeds the token limit."
            return assessment_str, input_data, output_data
        else:
            return "", input_data, output_data

    def _should_optimize_helper(
        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
    ) -> Tuple[
        List[Dict[str, Any]],
        List[Dict[str, Any]],
        int,
        float,
        str,
        Dict[str, Any],
        bool,
    ]:
        """
        Determine if the given operation configuration should be optimized.
        Create a custom validator prompt and assess the operation's performance
        using the validator.

        Returns:
            A 7-tuple of: the deep-copied input data (each item tagged with
            ``_map_opt_id``), the output of the unmodified operation, the model's
            input context length, the runtime of the unmodified operation in
            seconds, the validator prompt, the assessment dictionary, and
            whether any rendered prompt exceeded the context length.
        """
        self.console.post_optimizer_status(StageType.SAMPLE_RUN)
        input_data = copy.deepcopy(input_data)
        # Add id to each input_data
        for sample in input_data:
            sample["_map_opt_id"] = str(uuid.uuid4())

        model_name = op_config.get("model", self.config.get("default_model"))

        # Define the token limit (adjust as needed)
        model_input_context_length = model_cost.get(model_name, {}).get(
            "max_input_tokens", 8192
        )

        # Render the prompt with all sample inputs and count tokens
        total_tokens = 0
        exceed_count = 0
        for sample in input_data:
            rendered_prompt = Template(op_config["prompt"]).render(input=sample)
            prompt_tokens = count_tokens(rendered_prompt, model_name)
            total_tokens += prompt_tokens

            if prompt_tokens > model_input_context_length:
                exceed_count += 1

        data_exceeds_limit = exceed_count > 0
        if data_exceeds_limit:
            # The statistics are only needed for this warning. Computing them
            # here (where input_data is known to be non-empty) avoids a
            # ZeroDivisionError on empty sample data.
            avg_tokens = total_tokens / len(input_data)
            exceed_percentage = (exceed_count / len(input_data)) * 100
            self.console.log(
                f"[yellow]Warning: {exceed_percentage:.2f}% of prompts exceed token limit. "
                f"Average token count: {avg_tokens:.2f}. "
                f"Truncating input data when generating validators.[/yellow]"
            )

        # Execute the original operation on the sample data
        no_change_start = time.time()
        output_data = self._run_operation(op_config, input_data, is_build=True)
        no_change_runtime = time.time() - no_change_start

        # Capture output for the sample run
        self.runner.optimizer.captured_output.save_optimizer_output(
            stage_type=StageType.SAMPLE_RUN,
            output={
                "operation_config": op_config,
                "input_data": input_data,
                "output_data": output_data,
            },
        )

        # Generate custom validator prompt
        self.console.post_optimizer_status(StageType.SHOULD_OPTIMIZE)
        validator_prompt = self.prompt_generator._generate_validator_prompt(
            op_config, input_data, output_data
        )

        # Log the validator prompt
        self.console.log("[bold]Validator Prompt:[/bold]")
        self.console.log(validator_prompt)
        self.console.log("\n")  # Add a newline for better readability

        # Step 2: Use the validator prompt to assess the operation's performance
        assessment = self.evaluator._assess_operation(
            op_config, input_data, output_data, validator_prompt
        )

        # Print out the assessment
        self.console.log(
            f"[bold]Assessment for whether we should improve operation {op_config['name']}:[/bold]"
        )
        for key, value in assessment.items():
            self.console.log(f"[bold cyan]{key}:[/bold cyan] [yellow]{value}[/yellow]")
        self.console.log("\n")  # Add a newline for better readability

        self.runner.optimizer.captured_output.save_optimizer_output(
            stage_type=StageType.SHOULD_OPTIMIZE,
            output={
                "validator_prompt": validator_prompt,
                "needs_improvement": assessment.get("needs_improvement", True),
                "reasons": assessment.get("reasons", []),
                "improvements": assessment.get("improvements", []),
            },
        )
        self.console.post_optimizer_rationale(
            assessment.get("needs_improvement", True),
            "\n".join(assessment.get("reasons", []))
            + "\n\n"
            + "\n".join(assessment.get("improvements", [])),
            validator_prompt,
        )

        return (
            input_data,
            output_data,
            model_input_context_length,
            no_change_runtime,
            validator_prompt,
            assessment,
            data_exceeds_limit,
        )

    def optimize(
        self,
        op_config: Dict[str, Any],
        input_data: List[Dict[str, Any]],
        plan_types: Optional[List[str]] = None,
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
        """
        Optimize the given operation configuration for the input data.
        This method analyzes the operation and input data, generates various
        optimization plans, evaluates them, and returns the best plan along
        with its output. A key part of this process is creating a custom
        validator prompt for evaluation. The validator prompt is generated
        based on the specific task, input data, and output data. It serves
        as a critical tool for assessing the quality and correctness of
        each optimization plan's output. This custom prompt ensures that
        the evaluation is tailored to the unique requirements and nuances
        of the given operation. The types of optimization plans include:

        1. No-change Plan: The original operation, used as the baseline
           (only when no prompt exceeds the model's context length).

        2. Chunk Size Plan: Splits input data into chunks of different sizes,
           processes each chunk separately, and then combines the results. This
           can improve performance for large inputs.

        3. Gleaning Plans: Implements an iterative refinement process where the
           output is validated and improved over multiple rounds, enhancing accuracy.

        4. Chain Decomposition Plan: Breaks down complex operations into a series
           of simpler sub-operations, potentially improving overall performance
           and interpretability.

        5. Parallel Map Plan: Decomposes the task into subtasks that can be
           executed in parallel, potentially speeding up processing for
           independent operations.

        The method generates these plans, scores each one with the evaluator,
        and picks the winner by pairwise comparison among the top-scoring plans.

        Args:
            op_config (Dict[str, Any]): The configuration of the operation to optimize.
            input_data (List[Dict[str, Any]]): The input data for the operation.
            plan_types (Optional[List[str]]): Which plan families to generate; any of
                "chunk", "proj_synthesis", "glean". None (the default) enables all three.

        Returns:
            Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing
            the best optimization plan and its output. The plan is a list of
            operation configurations that achieve the best performance.
            The cost is the cost of the optimizer (from possibly synthesizing resolves).

        Raises:
            ValueError: If ``plan_types`` contains an unknown plan type, or if no
                plan could be evaluated successfully.

        """
        valid_plan_types = ("chunk", "proj_synthesis", "glean")
        # None stands in for "all plan types" so that the default is not a
        # mutable list shared across calls.
        if plan_types is None:
            plan_types = list(valid_plan_types)

        # Verify that the plan types are valid
        for plan_type in plan_types:
            if plan_type not in valid_plan_types:
                raise ValueError(
                    f"Invalid plan type: {plan_type}. Valid plan types are: chunk, proj_synthesis, glean."
                )

        (
            input_data,
            output_data,
            model_input_context_length,
            no_change_runtime,
            validator_prompt,
            assessment,
            data_exceeds_limit,
        ) = self._should_optimize_helper(op_config, input_data)

        # Check if improvement is needed based on the assessment
        if not self.config.get("optimizer_config", {}).get("force_decompose", False):
            if not data_exceeds_limit and not assessment.get("needs_improvement", True):
                self.console.log(
                    f"[green]No improvement needed for operation {op_config['name']}[/green]"
                )
                return (
                    [op_config],
                    output_data,
                    self.plan_generator.subplan_optimizer_cost,
                )

        candidate_plans = {}

        # Baseline plan. NOTE: the improved-prompt plan is currently disabled;
        # only the unchanged operation is added here.
        if not data_exceeds_limit:
            candidate_plans["no_change"] = [op_config]

        # Generate chunk size plans
        self.console.post_optimizer_status(StageType.CANDIDATE_PLANS)
        if "chunk" in plan_types:
            self.console.log(
                "[bold magenta]Generating chunking plans...[/bold magenta]"
            )
            chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
                op_config, input_data, validator_prompt, model_input_context_length
            )
            for pname, plan in chunk_size_plans.items():
                candidate_plans[pname] = plan

        # Generate gleaning plans
        if not data_exceeds_limit and "glean" in plan_types:
            self.console.log(
                "[bold magenta]Generating gleaning plans...[/bold magenta]"
            )
            gleaning_plans = self.plan_generator._generate_gleaning_plans(
                op_config, validator_prompt
            )
            for pname, plan in gleaning_plans.items():
                candidate_plans[pname] = plan

        # Generate chain decomposition plans
        if not data_exceeds_limit and "proj_synthesis" in plan_types:
            if not self.is_filter:
                self.console.log(
                    "[bold magenta]Generating chain projection synthesis plans...[/bold magenta]"
                )
                chain_plans = self.plan_generator._generate_chain_plans(
                    op_config, input_data
                )
                for pname, plan in chain_plans.items():
                    candidate_plans[pname] = plan

                # Generate parallel map plans
                self.console.log(
                    "[bold magenta]Generating independent projection synthesis plans...[/bold magenta]"
                )
                parallel_plans = self.plan_generator._generate_parallel_plans(
                    op_config, input_data
                )
                for pname, plan in parallel_plans.items():
                    candidate_plans[pname] = plan

        # Select consistent evaluation samples
        num_evaluations = min(5, len(input_data))
        evaluation_samples = select_evaluation_samples(input_data, num_evaluations)

        results = {}
        plans_list = list(candidate_plans.items())

        # Capture candidate plans
        self.runner.optimizer.captured_output.save_optimizer_output(
            stage_type=StageType.CANDIDATE_PLANS,
            output=candidate_plans,
        )

        self.console.post_optimizer_status(StageType.EVALUATION_RESULTS)
        self.console.log(
            f"[bold magenta]Evaluating {len(plans_list)} plans...[/bold magenta]"
        )
        for i in range(0, len(plans_list), self._num_plans_to_evaluate_in_parallel):
            batch = plans_list[i : i + self._num_plans_to_evaluate_in_parallel]
            with ThreadPoolExecutor(
                max_workers=self._num_plans_to_evaluate_in_parallel
            ) as executor:
                futures = {
                    executor.submit(
                        self.evaluator._evaluate_plan,
                        plan_name,
                        op_config,
                        plan,
                        # Each plan gets its own copy of the samples.
                        copy.deepcopy(evaluation_samples),
                        validator_prompt,
                    ): plan_name
                    for plan_name, plan in batch
                }
                for future in as_completed(futures):
                    plan_name = futures[future]
                    try:
                        score, runtime, output = future.result(timeout=self.timeout)
                        results[plan_name] = (score, runtime, output)
                    except concurrent.futures.TimeoutError:
                        self.console.log(
                            f"[yellow]Plan {plan_name} timed out and will be skipped.[/yellow]"
                        )
                    except Exception as e:
                        # Best-effort: a failing plan is reported and skipped
                        # so the remaining plans can still be evaluated.
                        # TODO: raise this error if the error is related to a Jinja error
                        self.console.log(
                            f"[red]Error in plan {plan_name}: {str(e)}[/red]"
                        )
                        import traceback

                        print(traceback.format_exc())

        # Report the baseline with the runtime measured on the full sample run.
        # The plan may be missing from results if its evaluation failed or
        # timed out, so guard the lookup instead of raising KeyError.
        if not data_exceeds_limit and "no_change" in results:
            results["no_change"] = (
                results["no_change"][0],
                no_change_runtime,
                results["no_change"][2],
            )

        # Create a table of scores sorted in descending order
        scores = sorted(
            [(score, runtime, plan) for plan, (score, runtime, _) in results.items()],
            reverse=True,
        )

        # Sort results by score in descending order
        sorted_results = sorted(results.items(), key=lambda x: x[1][0], reverse=True)

        # Take the top plans that will be compared pairwise
        top_plans = sorted_results[: self.k_to_pairwise_compare]

        # Check if there are no top plans
        if not top_plans:
            self.console.post_optimizer_status(StageType.END)
            raise ValueError(
                "Agent did not generate any plans. Unable to proceed with optimization. Try again."
            )

        # Include any additional plans that are tied with the last plan
        tail_score = (
            top_plans[-1][1][0]
            if len(top_plans) == self.k_to_pairwise_compare
            else float("-inf")
        )
        filtered_results = dict(
            top_plans
            + [
                item
                for item in sorted_results[len(top_plans) :]
                if item[1][0] == tail_score
            ]
        )

        # Perform pairwise comparisons on filtered plans
        if len(filtered_results) > 1:
            pairwise_rankings = self.evaluator._pairwise_compare_plans(
                filtered_results, validator_prompt, op_config, evaluation_samples
            )
            best_plan_name = max(pairwise_rankings, key=pairwise_rankings.get)
        else:
            pairwise_rankings = dict.fromkeys(results, 0)
            best_plan_name = (
                next(iter(filtered_results))
                if filtered_results
                else max(results, key=lambda x: results[x][0])
            )

        self.console.log(
            f"\n[bold]Plan Evaluation Results for {op_config['name']} ({op_config['type']}, {len(scores)} plans, {num_evaluations} samples):[/bold]"
        )
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Plan", style="dim")
        table.add_column("Score", justify="right", width=10)
        table.add_column("Runtime", justify="right", width=10)
        table.add_column("Pairwise Wins", justify="right", width=10)

        for score, runtime, plan in scores:
            table.add_row(
                plan,
                f"{score:.2f}",
                f"{runtime:.2f}s",
                f"{pairwise_rankings.get(plan, 0)}",
            )

        self.console.log(table)
        self.console.log("\n")

        _, _, best_output = results[best_plan_name]
        self.console.log(
            f"[green]Choosing {best_plan_name} for operation {op_config['name']} (Score: {results[best_plan_name][0]:.2f}, Runtime: {results[best_plan_name][1]:.2f}s)[/green]"
        )

        # Capture evaluation results
        ratings = {k: v[0] for k, v in results.items()}
        runtime = {k: v[1] for k, v in results.items()}
        sample_outputs = {k: v[2] for k, v in results.items()}
        self.runner.optimizer.captured_output.save_optimizer_output(
            stage_type=StageType.EVALUATION_RESULTS,
            output={
                "input_data": evaluation_samples,
                "all_plan_ratings": ratings,
                "all_plan_runtimes": runtime,
                "all_plan_sample_outputs": sample_outputs,
                "all_plan_pairwise_rankings": pairwise_rankings,
            },
        )

        self.console.post_optimizer_status(StageType.END)
        return (
            candidate_plans[best_plan_name],
            best_output,
            self.plan_generator.subplan_optimizer_cost,
        )
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(runner, run_operation, timeout=10, is_filter=False, depth=1) + +

+ + +
+ +

Initialize the MapOptimizer.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ runner + + Runner + +
+

The runner object.

+
+
+ required +
+ run_operation + + Callable + +
+

A function to execute operations.

+
+
+ required +
+ timeout + + int + +
+

Accepted for interface compatibility; the effective timeout for operation execution is read from runner.optimizer.timeout. Defaults to 10.

+
+
+ 10 +
+ is_filter + + bool + +
+

If True, the operation is a filter operation. Defaults to False.

+
+
+ False +
+ +
+ Source code in docetl/optimizers/map_optimizer/optimizer.py +
38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
def __init__(
+    self,
+    runner,
+    run_operation: Callable,
+    timeout: int = 10,
+    is_filter: bool = False,
+    depth: int = 1,
+):
+    """
+    Initialize the MapOptimizer.
+
+    Copies shared state off the runner and builds the PlanGenerator,
+    Evaluator and PromptGenerator helpers used by this optimizer.
+
+    Args:
+        runner (Runner): The runner object. Its config, console, max_threads
+            and optimizer (llm_client, timeout) attributes are read here.
+        run_operation (Callable): A function to execute operations.
+        timeout (int, optional): The timeout in seconds for operation execution. Defaults to 10.
+            NOTE(review): this argument is not read below; self.timeout is
+            taken from runner.optimizer.timeout instead. Confirm intent.
+        is_filter (bool, optional): If True, the operation is a filter operation. Defaults to False.
+        depth (int, optional): Passed through to PlanGenerator. Defaults to 1.
+    """
+    self.runner = runner
+    self.config = runner.config
+    self.console = runner.console
+    self.llm_client = runner.optimizer.llm_client
+    self._run_operation = run_operation
+    self.max_threads = runner.max_threads
+    # The `timeout` argument is not used; the runner's optimizer supplies it.
+    self.timeout = runner.optimizer.timeout
+    # Batch size and worker count for plan evaluation in optimize(); also
+    # passed to the Evaluator below.
+    self._num_plans_to_evaluate_in_parallel = 5
+    self.is_filter = is_filter
+    # Number of top-scoring plans kept for pairwise comparison in optimize().
+    self.k_to_pairwise_compare = 6
+
+    # The helpers below take their arguments positionally; keep the order.
+    self.plan_generator = PlanGenerator(
+        runner,
+        self.llm_client,
+        self.console,
+        self.config,
+        run_operation,
+        self.max_threads,
+        is_filter,
+        depth,
+    )
+    self.evaluator = Evaluator(
+        self.llm_client,
+        self.console,
+        self._run_operation,
+        self.timeout,
+        self._num_plans_to_evaluate_in_parallel,
+        self.is_filter,
+    )
+    self.prompt_generator = PromptGenerator(
+        self.runner,
+        self.llm_client,
+        self.console,
+        self.config,
+        self.max_threads,
+        self.is_filter,
+    )
+
+
+
+ +
+ +
+ + +

+ optimize(op_config, input_data, plan_types=['chunk', 'proj_synthesis', 'glean']) + +

+ + +
+ +

Optimize the given operation configuration for the input data. +This method analyzes the operation and input data, generates various +optimization plans, evaluates them, and returns the best plan along +with its output. A key part of this process is creating a custom +validator prompt for evaluation. The validator prompt is generated +based on the specific task, input data, and output data. It serves +as a critical tool for assessing the quality and correctness of +each optimization plan's output. This custom prompt ensures that +the evaluation is tailored to the unique requirements and nuances +of the given operation. The types of optimization plans include:

+
    +
  1. +

    Improved Prompt Plan: Enhances the original prompt based on evaluation, aiming to improve output quality.

    +
  2. +
  3. +

    Chunk Size Plan: Splits input data into chunks of different sizes, + processes each chunk separately, and then combines the results. This + can improve performance for large inputs.

    +
  4. +
  5. +

    Gleaning Plans: Implements an iterative refinement process where the + output is validated and improved over multiple rounds, enhancing accuracy.

    +
  6. +
  7. +

    Chain Decomposition Plan: Breaks down complex operations into a series + of simpler sub-operations, potentially improving overall performance + and interpretability.

    +
  8. +
  9. +

    Parallel Map Plan: Decomposes the task into subtasks that can be + executed in parallel, potentially speeding up processing for + independent operations.

    +
  10. +
+

The method generates these plans, evaluates their performance using +a custom validator, and selects the best performing plan based on +output quality and execution time.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ op_config + + Dict[str, Any] + +
+

The configuration of the operation to optimize.

+
+
+ required +
+ input_data + + List[Dict[str, Any]] + +
+

The input data for the operation.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDescription
+ List[Dict[str, Any]] + +
+

The best optimization plan: a list of operation configurations that achieve the best performance (first element of the returned tuple).

+
+
+ List[Dict[str, Any]] + +
+

The output produced by the best plan (second element of the returned tuple).

+
+
+ float + +
+

The cost of the optimizer, from possibly synthesizing resolves (third element of the returned tuple).

+
+
+ Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float] + +
+

The best plan, its output, and the optimizer cost (from possibly synthesizing resolves), returned together as a tuple in that order.

+
+
+ +
+ Source code in docetl/optimizers/map_optimizer/optimizer.py +
239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
+338
+339
+340
+341
+342
+343
+344
+345
+346
+347
+348
+349
+350
+351
+352
+353
+354
+355
+356
+357
+358
+359
+360
+361
+362
+363
+364
+365
+366
+367
+368
+369
+370
+371
+372
+373
+374
+375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
+437
+438
+439
+440
+441
+442
+443
+444
+445
+446
+447
+448
+449
+450
+451
+452
+453
+454
+455
+456
+457
+458
+459
+460
+461
+462
+463
+464
+465
+466
+467
+468
+469
+470
+471
+472
+473
+474
+475
+476
+477
+478
+479
+480
+481
+482
+483
+484
+485
+486
+487
+488
+489
+490
+491
+492
+493
+494
+495
+496
+497
+498
+499
+500
+501
+502
+503
+504
+505
+506
+507
+508
+509
+510
+511
+512
+513
+514
+515
+516
+517
+518
+519
+520
+521
+522
+523
+524
+525
def optimize(
+    self,
+    op_config: Dict[str, Any],
+    input_data: List[Dict[str, Any]],
+    plan_types: Optional[List[str]] = ["chunk", "proj_synthesis", "glean"],
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+    """
+    Optimize the given operation configuration for the input data.
+    This method analyzes the operation and input data, generates various
+    optimization plans, evaluates them, and returns the best plan along
+    with its output. A key part of this process is creating a custom
+    validator prompt for evaluation. The validator prompt is generated
+    based on the specific task, input data, and output data. It serves
+    as a critical tool for assessing the quality and correctness of
+    each optimization plan's output. This custom prompt ensures that
+    the evaluation is tailored to the unique requirements and nuances
+    of the given operation. The types of optimization plans include:
+
+    1. Improved Prompt Plan: Enhances the original prompt based on evaluation, aiming to improve output quality.
+
+    2. Chunk Size Plan: Splits input data into chunks of different sizes,
+       processes each chunk separately, and then combines the results. This
+       can improve performance for large inputs.
+
+    3. Gleaning Plans: Implements an iterative refinement process where the
+       output is validated and improved over multiple rounds, enhancing accuracy.
+
+    4. Chain Decomposition Plan: Breaks down complex operations into a series
+       of simpler sub-operations, potentially improving overall performance
+       and interpretability.
+
+    5. Parallel Map Plan: Decomposes the task into subtasks that can be
+       executed in parallel, potentially speeding up processing for
+       independent operations.
+
+    The method generates these plans, evaluates their performance using
+    a custom validator, and selects the best performing plan based on
+    output quality and execution time.
+
+    Args:
+        op_config (Dict[str, Any]): The configuration of the operation to optimize.
+        input_data (List[Dict[str, Any]]): The input data for the operation.
+
+    Returns:
+        Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing
+        the best optimization plan and its output. The plan is a list of
+        operation configurations that achieve the best performance.
+        The cost is the cost of the optimizer (from possibly synthesizing resolves).
+
+    """
+    # Verify that the plan types are valid
+    for plan_type in plan_types:
+        if plan_type not in ["chunk", "proj_synthesis", "glean"]:
+            raise ValueError(
+                f"Invalid plan type: {plan_type}. Valid plan types are: chunk, proj_synthesis, glean."
+            )
+
+    (
+        input_data,
+        output_data,
+        model_input_context_length,
+        no_change_runtime,
+        validator_prompt,
+        assessment,
+        data_exceeds_limit,
+    ) = self._should_optimize_helper(op_config, input_data)
+
+    # Check if improvement is needed based on the assessment
+    if not self.config.get("optimizer_config", {}).get("force_decompose", False):
+        if not data_exceeds_limit and not assessment.get("needs_improvement", True):
+            self.console.log(
+                f"[green]No improvement needed for operation {op_config['name']}[/green]"
+            )
+            return (
+                [op_config],
+                output_data,
+                self.plan_generator.subplan_optimizer_cost,
+            )
+
+    candidate_plans = {}
+
+    # Generate improved prompt plan
+    if not data_exceeds_limit:
+        #     improved_prompt_plan = self.prompt_generator._get_improved_prompt(
+        #         op_config, assessment, input_data
+        #     )
+        #     candidate_plans["improved_instructions"] = improved_prompt_plan
+        candidate_plans["no_change"] = [op_config]
+
+    # Generate chunk size plans
+    self.console.post_optimizer_status(StageType.CANDIDATE_PLANS)
+    if "chunk" in plan_types:
+        self.console.log(
+            "[bold magenta]Generating chunking plans...[/bold magenta]"
+        )
+        chunk_size_plans = self.plan_generator._generate_chunk_size_plans(
+            op_config, input_data, validator_prompt, model_input_context_length
+        )
+        for pname, plan in chunk_size_plans.items():
+            candidate_plans[pname] = plan
+
+    # Generate gleaning plans
+    if not data_exceeds_limit and "glean" in plan_types:
+        self.console.log(
+            "[bold magenta]Generating gleaning plans...[/bold magenta]"
+        )
+        gleaning_plans = self.plan_generator._generate_gleaning_plans(
+            op_config, validator_prompt
+        )
+        for pname, plan in gleaning_plans.items():
+            candidate_plans[pname] = plan
+
+    # Generate chain decomposition plans
+    if not data_exceeds_limit and "proj_synthesis" in plan_types:
+        if not self.is_filter:
+            self.console.log(
+                "[bold magenta]Generating chain projection synthesis plans...[/bold magenta]"
+            )
+            chain_plans = self.plan_generator._generate_chain_plans(
+                op_config, input_data
+            )
+            for pname, plan in chain_plans.items():
+                candidate_plans[pname] = plan
+
+            # Generate parallel map plans
+            self.console.log(
+                "[bold magenta]Generating independent projection synthesis plans...[/bold magenta]"
+            )
+            parallel_plans = self.plan_generator._generate_parallel_plans(
+                op_config, input_data
+            )
+            for pname, plan in parallel_plans.items():
+                candidate_plans[pname] = plan
+
+    # Select consistent evaluation samples
+    num_evaluations = min(5, len(input_data))
+    evaluation_samples = select_evaluation_samples(input_data, num_evaluations)
+
+    results = {}
+    plans_list = list(candidate_plans.items())
+
+    # Capture candidate plans
+    self.runner.optimizer.captured_output.save_optimizer_output(
+        stage_type=StageType.CANDIDATE_PLANS,
+        output=candidate_plans,
+    )
+
+    self.console.post_optimizer_status(StageType.EVALUATION_RESULTS)
+    self.console.log(
+        f"[bold magenta]Evaluating {len(plans_list)} plans...[/bold magenta]"
+    )
+    for i in range(0, len(plans_list), self._num_plans_to_evaluate_in_parallel):
+        batch = plans_list[i : i + self._num_plans_to_evaluate_in_parallel]
+        with ThreadPoolExecutor(
+            max_workers=self._num_plans_to_evaluate_in_parallel
+        ) as executor:
+            futures = {
+                executor.submit(
+                    self.evaluator._evaluate_plan,
+                    plan_name,
+                    op_config,
+                    plan,
+                    copy.deepcopy(evaluation_samples),
+                    validator_prompt,
+                ): plan_name
+                for plan_name, plan in batch
+            }
+            for future in as_completed(futures):
+                plan_name = futures[future]
+                try:
+                    score, runtime, output = future.result(timeout=self.timeout)
+                    results[plan_name] = (score, runtime, output)
+                except concurrent.futures.TimeoutError:
+                    self.console.log(
+                        f"[yellow]Plan {plan_name} timed out and will be skipped.[/yellow]"
+                    )
+                except Exception as e:
+                    # TODO: raise this error if the error is related to a Jinja error
+                    self.console.log(
+                        f"[red]Error in plan {plan_name}: {str(e)}[/red]"
+                    )
+                    import traceback
+
+                    print(traceback.format_exc())
+
+    # Add no change plan
+    if not data_exceeds_limit:
+        results["no_change"] = (
+            results["no_change"][0],
+            no_change_runtime,
+            results["no_change"][2],
+        )
+
+    # Create a table of scores sorted in descending order
+    scores = sorted(
+        [(score, runtime, plan) for plan, (score, runtime, _) in results.items()],
+        reverse=True,
+    )
+
+    # Sort results by score in descending order
+    sorted_results = sorted(results.items(), key=lambda x: x[1][0], reverse=True)
+
+    # Take the top 6 plans
+    top_plans = sorted_results[: self.k_to_pairwise_compare]
+
+    # Check if there are no top plans
+    if len(top_plans) == 0:
+        self.console.post_optimizer_status(StageType.END)
+        raise ValueError(
+            "Agent did not generate any plans. Unable to proceed with optimization. Try again."
+        )
+
+    # Include any additional plans that are tied with the last plan
+    tail_score = (
+        top_plans[-1][1][0]
+        if len(top_plans) == self.k_to_pairwise_compare
+        else float("-inf")
+    )
+    filtered_results = dict(
+        top_plans
+        + [
+            item
+            for item in sorted_results[len(top_plans) :]
+            if item[1][0] == tail_score
+        ]
+    )
+
+    # Perform pairwise comparisons on filtered plans
+    if len(filtered_results) > 1:
+        pairwise_rankings = self.evaluator._pairwise_compare_plans(
+            filtered_results, validator_prompt, op_config, evaluation_samples
+        )
+        best_plan_name = max(pairwise_rankings, key=pairwise_rankings.get)
+    else:
+        pairwise_rankings = {k: 0 for k in results.keys()}
+        best_plan_name = (
+            next(iter(filtered_results))
+            if filtered_results
+            else max(results, key=lambda x: results[x][0])
+        )
+
+    self.console.log(
+        f"\n[bold]Plan Evaluation Results for {op_config['name']} ({op_config['type']}, {len(scores)} plans, {num_evaluations} samples):[/bold]"
+    )
+    table = Table(show_header=True, header_style="bold magenta")
+    table.add_column("Plan", style="dim")
+    table.add_column("Score", justify="right", width=10)
+    table.add_column("Runtime", justify="right", width=10)
+    table.add_column("Pairwise Wins", justify="right", width=10)
+
+    for score, runtime, plan in scores:
+        table.add_row(
+            plan,
+            f"{score:.2f}",
+            f"{runtime:.2f}s",
+            f"{pairwise_rankings.get(plan, 0)}",
+        )
+
+    self.console.log(table)
+    self.console.log("\n")
+
+    _, _, best_output = results[best_plan_name]
+    self.console.log(
+        f"[green]Choosing {best_plan_name} for operation {op_config['name']} (Score: {results[best_plan_name][0]:.2f}, Runtime: {results[best_plan_name][1]:.2f}s)[/green]"
+    )
+
+    # Capture evaluation results
+    ratings = {k: v[0] for k, v in results.items()}
+    runtime = {k: v[1] for k, v in results.items()}
+    sample_outputs = {k: v[2] for k, v in results.items()}
+    self.runner.optimizer.captured_output.save_optimizer_output(
+        stage_type=StageType.EVALUATION_RESULTS,
+        output={
+            "input_data": evaluation_samples,
+            "all_plan_ratings": ratings,
+            "all_plan_runtimes": runtime,
+            "all_plan_sample_outputs": sample_outputs,
+            "all_plan_pairwise_rankings": pairwise_rankings,
+        },
+    )
+
+    self.console.post_optimizer_status(StageType.END)
+    return (
+        candidate_plans[best_plan_name],
+        best_output,
+        self.plan_generator.subplan_optimizer_cost,
+    )
+
+
+
+ +
+ +
+ + +

+ should_optimize(op_config, input_data) + +

+ + +
+ +

Determine if the given operation configuration should be optimized.

+ +
+ Source code in docetl/optimizers/map_optimizer/optimizer.py +
 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
def should_optimize(
+    self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """
+    Determine if the given operation configuration should be optimized.
+    """
+    (
+        input_data,
+        output_data,
+        _,
+        _,
+        validator_prompt,
+        assessment,
+        data_exceeds_limit,
+    ) = self._should_optimize_helper(op_config, input_data)
+    if data_exceeds_limit or assessment.get("needs_improvement", True):
+        assessment_str = (
+            "\n".join(assessment.get("reasons", []))
+            + "\n\nHere are some improvements that may help:\n"
+            + "\n".join(assessment.get("improvements", []))
+        )
+        if data_exceeds_limit:
+            assessment_str += "\nAlso, the input data exceeds the token limit."
+        return assessment_str, input_data, output_data
+    else:
+        return "", input_data, output_data
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.optimizers.reduce_optimizer.ReduceOptimizer + + +

+ + +
+ + +

A class that optimizes reduce operations in data processing pipelines.

+

This optimizer analyzes the input and output of a reduce operation, creates and evaluates +multiple reduce plans, and selects the best plan for optimizing the operation's performance.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
config + Dict[str, Any] + +
+

Configuration dictionary for the optimizer.

+
+
console + Console + +
+

Rich console object for pretty printing.

+
+
llm_client + LLMClient + +
+

Client for interacting with a language model.

+
+
_run_operation + Callable + +
+

Function to run an operation.

+
+
max_threads + int + +
+

Maximum number of threads to use for parallel processing.

+
+
num_fold_prompts + int + +
+

Number of fold prompts to generate.

+
+
num_samples_in_validation + int + +
+

Number of samples to use in validation.

+
+
+ + + + + + +
+ Source code in docetl/optimizers/reduce_optimizer.py +
  18
+  19
+  20
+  21
+  22
+  23
+  24
+  25
+  26
+  27
+  28
+  29
+  30
+  31
+  32
+  33
+  34
+  35
+  36
+  37
+  38
+  39
+  40
+  41
+  42
+  43
+  44
+  45
+  46
+  47
+  48
+  49
+  50
+  51
+  52
+  53
+  54
+  55
+  56
+  57
+  58
+  59
+  60
+  61
+  62
+  63
+  64
+  65
+  66
+  67
+  68
+  69
+  70
+  71
+  72
+  73
+  74
+  75
+  76
+  77
+  78
+  79
+  80
+  81
+  82
+  83
+  84
+  85
+  86
+  87
+  88
+  89
+  90
+  91
+  92
+  93
+  94
+  95
+  96
+  97
+  98
+  99
+ 100
+ 101
+ 102
+ 103
+ 104
+ 105
+ 106
+ 107
+ 108
+ 109
+ 110
+ 111
+ 112
+ 113
+ 114
+ 115
+ 116
+ 117
+ 118
+ 119
+ 120
+ 121
+ 122
+ 123
+ 124
+ 125
+ 126
+ 127
+ 128
+ 129
+ 130
+ 131
+ 132
+ 133
+ 134
+ 135
+ 136
+ 137
+ 138
+ 139
+ 140
+ 141
+ 142
+ 143
+ 144
+ 145
+ 146
+ 147
+ 148
+ 149
+ 150
+ 151
+ 152
+ 153
+ 154
+ 155
+ 156
+ 157
+ 158
+ 159
+ 160
+ 161
+ 162
+ 163
+ 164
+ 165
+ 166
+ 167
+ 168
+ 169
+ 170
+ 171
+ 172
+ 173
+ 174
+ 175
+ 176
+ 177
+ 178
+ 179
+ 180
+ 181
+ 182
+ 183
+ 184
+ 185
+ 186
+ 187
+ 188
+ 189
+ 190
+ 191
+ 192
+ 193
+ 194
+ 195
+ 196
+ 197
+ 198
+ 199
+ 200
+ 201
+ 202
+ 203
+ 204
+ 205
+ 206
+ 207
+ 208
+ 209
+ 210
+ 211
+ 212
+ 213
+ 214
+ 215
+ 216
+ 217
+ 218
+ 219
+ 220
+ 221
+ 222
+ 223
+ 224
+ 225
+ 226
+ 227
+ 228
+ 229
+ 230
+ 231
+ 232
+ 233
+ 234
+ 235
+ 236
+ 237
+ 238
+ 239
+ 240
+ 241
+ 242
+ 243
+ 244
+ 245
+ 246
+ 247
+ 248
+ 249
+ 250
+ 251
+ 252
+ 253
+ 254
+ 255
+ 256
+ 257
+ 258
+ 259
+ 260
+ 261
+ 262
+ 263
+ 264
+ 265
+ 266
+ 267
+ 268
+ 269
+ 270
+ 271
+ 272
+ 273
+ 274
+ 275
+ 276
+ 277
+ 278
+ 279
+ 280
+ 281
+ 282
+ 283
+ 284
+ 285
+ 286
+ 287
+ 288
+ 289
+ 290
+ 291
+ 292
+ 293
+ 294
+ 295
+ 296
+ 297
+ 298
+ 299
+ 300
+ 301
+ 302
+ 303
+ 304
+ 305
+ 306
+ 307
+ 308
+ 309
+ 310
+ 311
+ 312
+ 313
+ 314
+ 315
+ 316
+ 317
+ 318
+ 319
+ 320
+ 321
+ 322
+ 323
+ 324
+ 325
+ 326
+ 327
+ 328
+ 329
+ 330
+ 331
+ 332
+ 333
+ 334
+ 335
+ 336
+ 337
+ 338
+ 339
+ 340
+ 341
+ 342
+ 343
+ 344
+ 345
+ 346
+ 347
+ 348
+ 349
+ 350
+ 351
+ 352
+ 353
+ 354
+ 355
+ 356
+ 357
+ 358
+ 359
+ 360
+ 361
+ 362
+ 363
+ 364
+ 365
+ 366
+ 367
+ 368
+ 369
+ 370
+ 371
+ 372
+ 373
+ 374
+ 375
+ 376
+ 377
+ 378
+ 379
+ 380
+ 381
+ 382
+ 383
+ 384
+ 385
+ 386
+ 387
+ 388
+ 389
+ 390
+ 391
+ 392
+ 393
+ 394
+ 395
+ 396
+ 397
+ 398
+ 399
+ 400
+ 401
+ 402
+ 403
+ 404
+ 405
+ 406
+ 407
+ 408
+ 409
+ 410
+ 411
+ 412
+ 413
+ 414
+ 415
+ 416
+ 417
+ 418
+ 419
+ 420
+ 421
+ 422
+ 423
+ 424
+ 425
+ 426
+ 427
+ 428
+ 429
+ 430
+ 431
+ 432
+ 433
+ 434
+ 435
+ 436
+ 437
+ 438
+ 439
+ 440
+ 441
+ 442
+ 443
+ 444
+ 445
+ 446
+ 447
+ 448
+ 449
+ 450
+ 451
+ 452
+ 453
+ 454
+ 455
+ 456
+ 457
+ 458
+ 459
+ 460
+ 461
+ 462
+ 463
+ 464
+ 465
+ 466
+ 467
+ 468
+ 469
+ 470
+ 471
+ 472
+ 473
+ 474
+ 475
+ 476
+ 477
+ 478
+ 479
+ 480
+ 481
+ 482
+ 483
+ 484
+ 485
+ 486
+ 487
+ 488
+ 489
+ 490
+ 491
+ 492
+ 493
+ 494
+ 495
+ 496
+ 497
+ 498
+ 499
+ 500
+ 501
+ 502
+ 503
+ 504
+ 505
+ 506
+ 507
+ 508
+ 509
+ 510
+ 511
+ 512
+ 513
+ 514
+ 515
+ 516
+ 517
+ 518
+ 519
+ 520
+ 521
+ 522
+ 523
+ 524
+ 525
+ 526
+ 527
+ 528
+ 529
+ 530
+ 531
+ 532
+ 533
+ 534
+ 535
+ 536
+ 537
+ 538
+ 539
+ 540
+ 541
+ 542
+ 543
+ 544
+ 545
+ 546
+ 547
+ 548
+ 549
+ 550
+ 551
+ 552
+ 553
+ 554
+ 555
+ 556
+ 557
+ 558
+ 559
+ 560
+ 561
+ 562
+ 563
+ 564
+ 565
+ 566
+ 567
+ 568
+ 569
+ 570
+ 571
+ 572
+ 573
+ 574
+ 575
+ 576
+ 577
+ 578
+ 579
+ 580
+ 581
+ 582
+ 583
+ 584
+ 585
+ 586
+ 587
+ 588
+ 589
+ 590
+ 591
+ 592
+ 593
+ 594
+ 595
+ 596
+ 597
+ 598
+ 599
+ 600
+ 601
+ 602
+ 603
+ 604
+ 605
+ 606
+ 607
+ 608
+ 609
+ 610
+ 611
+ 612
+ 613
+ 614
+ 615
+ 616
+ 617
+ 618
+ 619
+ 620
+ 621
+ 622
+ 623
+ 624
+ 625
+ 626
+ 627
+ 628
+ 629
+ 630
+ 631
+ 632
+ 633
+ 634
+ 635
+ 636
+ 637
+ 638
+ 639
+ 640
+ 641
+ 642
+ 643
+ 644
+ 645
+ 646
+ 647
+ 648
+ 649
+ 650
+ 651
+ 652
+ 653
+ 654
+ 655
+ 656
+ 657
+ 658
+ 659
+ 660
+ 661
+ 662
+ 663
+ 664
+ 665
+ 666
+ 667
+ 668
+ 669
+ 670
+ 671
+ 672
+ 673
+ 674
+ 675
+ 676
+ 677
+ 678
+ 679
+ 680
+ 681
+ 682
+ 683
+ 684
+ 685
+ 686
+ 687
+ 688
+ 689
+ 690
+ 691
+ 692
+ 693
+ 694
+ 695
+ 696
+ 697
+ 698
+ 699
+ 700
+ 701
+ 702
+ 703
+ 704
+ 705
+ 706
+ 707
+ 708
+ 709
+ 710
+ 711
+ 712
+ 713
+ 714
+ 715
+ 716
+ 717
+ 718
+ 719
+ 720
+ 721
+ 722
+ 723
+ 724
+ 725
+ 726
+ 727
+ 728
+ 729
+ 730
+ 731
+ 732
+ 733
+ 734
+ 735
+ 736
+ 737
+ 738
+ 739
+ 740
+ 741
+ 742
+ 743
+ 744
+ 745
+ 746
+ 747
+ 748
+ 749
+ 750
+ 751
+ 752
+ 753
+ 754
+ 755
+ 756
+ 757
+ 758
+ 759
+ 760
+ 761
+ 762
+ 763
+ 764
+ 765
+ 766
+ 767
+ 768
+ 769
+ 770
+ 771
+ 772
+ 773
+ 774
+ 775
+ 776
+ 777
+ 778
+ 779
+ 780
+ 781
+ 782
+ 783
+ 784
+ 785
+ 786
+ 787
+ 788
+ 789
+ 790
+ 791
+ 792
+ 793
+ 794
+ 795
+ 796
+ 797
+ 798
+ 799
+ 800
+ 801
+ 802
+ 803
+ 804
+ 805
+ 806
+ 807
+ 808
+ 809
+ 810
+ 811
+ 812
+ 813
+ 814
+ 815
+ 816
+ 817
+ 818
+ 819
+ 820
+ 821
+ 822
+ 823
+ 824
+ 825
+ 826
+ 827
+ 828
+ 829
+ 830
+ 831
+ 832
+ 833
+ 834
+ 835
+ 836
+ 837
+ 838
+ 839
+ 840
+ 841
+ 842
+ 843
+ 844
+ 845
+ 846
+ 847
+ 848
+ 849
+ 850
+ 851
+ 852
+ 853
+ 854
+ 855
+ 856
+ 857
+ 858
+ 859
+ 860
+ 861
+ 862
+ 863
+ 864
+ 865
+ 866
+ 867
+ 868
+ 869
+ 870
+ 871
+ 872
+ 873
+ 874
+ 875
+ 876
+ 877
+ 878
+ 879
+ 880
+ 881
+ 882
+ 883
+ 884
+ 885
+ 886
+ 887
+ 888
+ 889
+ 890
+ 891
+ 892
+ 893
+ 894
+ 895
+ 896
+ 897
+ 898
+ 899
+ 900
+ 901
+ 902
+ 903
+ 904
+ 905
+ 906
+ 907
+ 908
+ 909
+ 910
+ 911
+ 912
+ 913
+ 914
+ 915
+ 916
+ 917
+ 918
+ 919
+ 920
+ 921
+ 922
+ 923
+ 924
+ 925
+ 926
+ 927
+ 928
+ 929
+ 930
+ 931
+ 932
+ 933
+ 934
+ 935
+ 936
+ 937
+ 938
+ 939
+ 940
+ 941
+ 942
+ 943
+ 944
+ 945
+ 946
+ 947
+ 948
+ 949
+ 950
+ 951
+ 952
+ 953
+ 954
+ 955
+ 956
+ 957
+ 958
+ 959
+ 960
+ 961
+ 962
+ 963
+ 964
+ 965
+ 966
+ 967
+ 968
+ 969
+ 970
+ 971
+ 972
+ 973
+ 974
+ 975
+ 976
+ 977
+ 978
+ 979
+ 980
+ 981
+ 982
+ 983
+ 984
+ 985
+ 986
+ 987
+ 988
+ 989
+ 990
+ 991
+ 992
+ 993
+ 994
+ 995
+ 996
+ 997
+ 998
+ 999
+1000
+1001
+1002
+1003
+1004
+1005
+1006
+1007
+1008
+1009
+1010
+1011
+1012
+1013
+1014
+1015
+1016
+1017
+1018
+1019
+1020
+1021
+1022
+1023
+1024
+1025
+1026
+1027
+1028
+1029
+1030
+1031
+1032
+1033
+1034
+1035
+1036
+1037
+1038
+1039
+1040
+1041
+1042
+1043
+1044
+1045
+1046
+1047
+1048
+1049
+1050
+1051
+1052
+1053
+1054
+1055
+1056
+1057
+1058
+1059
+1060
+1061
+1062
+1063
+1064
+1065
+1066
+1067
+1068
+1069
+1070
+1071
+1072
+1073
+1074
+1075
+1076
+1077
+1078
+1079
+1080
+1081
+1082
+1083
+1084
+1085
+1086
+1087
+1088
+1089
+1090
+1091
+1092
+1093
+1094
+1095
+1096
+1097
+1098
+1099
+1100
+1101
+1102
+1103
+1104
+1105
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1118
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+1130
+1131
+1132
+1133
+1134
+1135
+1136
+1137
+1138
+1139
+1140
+1141
+1142
+1143
+1144
+1145
+1146
+1147
+1148
+1149
+1150
+1151
+1152
+1153
+1154
+1155
+1156
+1157
+1158
+1159
+1160
+1161
+1162
+1163
+1164
+1165
+1166
+1167
+1168
+1169
+1170
+1171
+1172
+1173
+1174
+1175
+1176
+1177
+1178
+1179
+1180
+1181
+1182
+1183
+1184
+1185
+1186
+1187
+1188
+1189
+1190
+1191
+1192
+1193
+1194
+1195
+1196
+1197
+1198
+1199
+1200
+1201
+1202
+1203
+1204
+1205
+1206
+1207
+1208
+1209
+1210
+1211
+1212
+1213
+1214
+1215
+1216
+1217
+1218
+1219
+1220
+1221
+1222
+1223
+1224
+1225
+1226
+1227
+1228
+1229
+1230
+1231
+1232
+1233
+1234
+1235
+1236
+1237
+1238
+1239
+1240
+1241
+1242
+1243
+1244
+1245
+1246
+1247
+1248
+1249
+1250
+1251
+1252
+1253
+1254
+1255
+1256
+1257
+1258
+1259
+1260
+1261
+1262
+1263
+1264
+1265
+1266
+1267
+1268
+1269
+1270
+1271
+1272
+1273
+1274
+1275
+1276
+1277
+1278
+1279
+1280
+1281
+1282
+1283
+1284
+1285
+1286
+1287
+1288
+1289
+1290
+1291
+1292
+1293
+1294
+1295
+1296
+1297
+1298
+1299
+1300
+1301
+1302
+1303
+1304
+1305
+1306
+1307
+1308
+1309
+1310
+1311
+1312
+1313
+1314
+1315
+1316
+1317
+1318
+1319
+1320
+1321
+1322
+1323
+1324
+1325
+1326
+1327
+1328
+1329
+1330
+1331
+1332
+1333
+1334
+1335
+1336
+1337
+1338
+1339
+1340
+1341
+1342
+1343
+1344
+1345
+1346
+1347
+1348
+1349
+1350
+1351
+1352
+1353
+1354
+1355
+1356
+1357
+1358
+1359
+1360
+1361
+1362
+1363
+1364
+1365
+1366
+1367
+1368
+1369
+1370
+1371
+1372
+1373
+1374
+1375
+1376
+1377
+1378
+1379
+1380
+1381
+1382
+1383
+1384
+1385
+1386
+1387
+1388
+1389
+1390
+1391
+1392
+1393
+1394
+1395
+1396
+1397
+1398
+1399
+1400
+1401
+1402
+1403
+1404
+1405
+1406
+1407
+1408
+1409
+1410
+1411
+1412
+1413
+1414
+1415
+1416
+1417
+1418
+1419
+1420
+1421
+1422
+1423
+1424
+1425
+1426
+1427
+1428
+1429
+1430
+1431
+1432
+1433
+1434
+1435
+1436
+1437
+1438
+1439
+1440
+1441
+1442
+1443
+1444
+1445
+1446
+1447
+1448
+1449
+1450
+1451
+1452
+1453
+1454
+1455
+1456
+1457
+1458
+1459
+1460
+1461
+1462
+1463
+1464
+1465
+1466
+1467
+1468
+1469
+1470
+1471
+1472
+1473
+1474
+1475
+1476
+1477
+1478
+1479
+1480
+1481
+1482
+1483
+1484
+1485
+1486
+1487
+1488
+1489
+1490
+1491
+1492
+1493
+1494
+1495
+1496
+1497
+1498
+1499
+1500
+1501
+1502
+1503
+1504
+1505
+1506
+1507
+1508
+1509
+1510
+1511
+1512
+1513
+1514
+1515
+1516
+1517
+1518
+1519
+1520
+1521
+1522
+1523
+1524
+1525
+1526
+1527
+1528
+1529
+1530
+1531
+1532
+1533
+1534
+1535
+1536
+1537
+1538
+1539
+1540
+1541
+1542
+1543
+1544
+1545
+1546
+1547
+1548
+1549
+1550
+1551
+1552
+1553
+1554
+1555
+1556
+1557
+1558
+1559
+1560
+1561
+1562
+1563
+1564
+1565
+1566
+1567
+1568
+1569
+1570
+1571
+1572
+1573
+1574
+1575
+1576
+1577
+1578
+1579
+1580
+1581
+1582
+1583
+1584
+1585
+1586
+1587
+1588
+1589
+1590
+1591
+1592
+1593
+1594
+1595
+1596
+1597
+1598
+1599
+1600
+1601
+1602
+1603
+1604
+1605
+1606
+1607
+1608
+1609
+1610
+1611
+1612
+1613
+1614
+1615
+1616
+1617
+1618
+1619
+1620
+1621
+1622
+1623
+1624
+1625
+1626
+1627
+1628
+1629
+1630
+1631
+1632
+1633
+1634
+1635
+1636
+1637
+1638
+1639
+1640
+1641
+1642
+1643
+1644
+1645
+1646
+1647
+1648
+1649
+1650
+1651
+1652
+1653
+1654
+1655
+1656
+1657
+1658
+1659
+1660
+1661
+1662
+1663
+1664
+1665
+1666
+1667
+1668
+1669
+1670
+1671
+1672
+1673
+1674
+1675
+1676
+1677
+1678
+1679
+1680
+1681
+1682
+1683
+1684
+1685
+1686
+1687
+1688
+1689
+1690
+1691
+1692
+1693
+1694
+1695
+1696
+1697
+1698
+1699
+1700
+1701
+1702
+1703
+1704
+1705
+1706
+1707
+1708
+1709
+1710
+1711
+1712
+1713
+1714
+1715
+1716
+1717
+1718
+1719
+1720
+1721
+1722
+1723
+1724
+1725
+1726
+1727
+1728
+1729
+1730
+1731
+1732
+1733
+1734
+1735
+1736
+1737
+1738
+1739
+1740
+1741
+1742
+1743
+1744
+1745
+1746
+1747
+1748
+1749
+1750
+1751
+1752
+1753
+1754
+1755
+1756
+1757
+1758
+1759
+1760
+1761
+1762
+1763
+1764
+1765
+1766
+1767
+1768
+1769
+1770
+1771
+1772
+1773
+1774
+1775
+1776
+1777
+1778
+1779
+1780
+1781
+1782
+1783
+1784
+1785
+1786
+1787
+1788
+1789
+1790
+1791
+1792
+1793
+1794
+1795
+1796
+1797
+1798
+1799
+1800
+1801
+1802
+1803
+1804
+1805
+1806
+1807
+1808
+1809
+1810
+1811
+1812
+1813
+1814
+1815
+1816
+1817
+1818
+1819
+1820
+1821
+1822
+1823
+1824
+1825
+1826
+1827
+1828
+1829
+1830
+1831
+1832
+1833
+1834
+1835
+1836
+1837
+1838
+1839
+1840
+1841
+1842
+1843
+1844
+1845
+1846
+1847
+1848
+1849
+1850
+1851
+1852
+1853
+1854
+1855
+1856
+1857
+1858
+1859
+1860
+1861
+1862
+1863
+1864
+1865
+1866
+1867
+1868
+1869
+1870
+1871
+1872
+1873
+1874
+1875
+1876
+1877
+1878
+1879
+1880
+1881
+1882
+1883
+1884
+1885
+1886
+1887
+1888
+1889
+1890
+1891
+1892
+1893
+1894
+1895
+1896
+1897
+1898
+1899
+1900
+1901
+1902
+1903
+1904
+1905
+1906
+1907
+1908
+1909
+1910
+1911
+1912
+1913
+1914
class ReduceOptimizer:
+    """
+    A class that optimizes reduce operations in data processing pipelines.
+
+    This optimizer analyzes the input and output of a reduce operation, creates and evaluates
+    multiple reduce plans, and selects the best plan for optimizing the operation's performance.
+
+    Attributes:
+        config (Dict[str, Any]): Configuration dictionary for the optimizer.
+        console (Console): Rich console object for pretty printing.
+        llm_client (LLMClient): Client for interacting with a language model.
+        _run_operation (Callable): Function to run an operation.
+        max_threads (int): Maximum number of threads to use for parallel processing.
+        num_fold_prompts (int): Number of fold prompts to generate.
+        num_samples_in_validation (int): Number of samples to use in validation.
+    """
+
+    def __init__(
+        self,
+        runner,
+        run_operation: Callable,
+        num_fold_prompts: int = 1,
+        num_samples_in_validation: int = 10,
+    ):
+        """
+        Initialize the ReduceOptimizer.
+
+        Args:
+            config (Dict[str, Any]): Configuration dictionary for the optimizer.
+            console (Console): Rich console object for pretty printing.
+            llm_client (LLMClient): Client for interacting with a language model.
+            max_threads (int): Maximum number of threads to use for parallel processing.
+            run_operation (Callable): Function to run an operation.
+            num_fold_prompts (int, optional): Number of fold prompts to generate. Defaults to 1.
+            num_samples_in_validation (int, optional): Number of samples to use in validation. Defaults to 10.
+        """
+        self.runner = runner
+        self.config = self.runner.config
+        self.console = self.runner.console
+        self.llm_client = self.runner.optimizer.llm_client
+        self._run_operation = run_operation
+        self.max_threads = self.runner.max_threads
+        self.num_fold_prompts = num_fold_prompts
+        self.num_samples_in_validation = num_samples_in_validation
+        self.status = self.runner.status
+
+    def should_optimize_helper(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> str:
+        # Check if we're running out of token limits for the reduce prompt
+        model = op_config.get("model", self.config.get("default_model", "gpt-4o-mini"))
+        model_input_context_length = model_cost.get(model, {}).get(
+            "max_input_tokens", 4096
+        )
+
+        # Find the key with the longest value
+        if op_config["reduce_key"] == ["_all"]:
+            sample_key = tuple(["_all"])
+        else:
+            longest_key = max(
+                op_config["reduce_key"], key=lambda k: len(str(input_data[0][k]))
+            )
+            sample_key = tuple(
+                input_data[0][k] if k == longest_key else input_data[0][k]
+                for k in op_config["reduce_key"]
+            )
+
+        # Render the prompt with a sample input
+        prompt_template = Template(op_config["prompt"])
+        sample_prompt = prompt_template.render(
+            reduce_key=dict(zip(op_config["reduce_key"], sample_key)),
+            inputs=[input_data[0]],
+        )
+
+        # Count tokens in the sample prompt
+        prompt_tokens = count_tokens(sample_prompt, model)
+
+        self.console.post_optimizer_status(StageType.SAMPLE_RUN)
+        original_output = self._run_operation(op_config, input_data)
+
+        # Step 1: Synthesize a validator prompt
+        self.console.post_optimizer_status(StageType.SHOULD_OPTIMIZE)
+        validator_prompt = self._generate_validator_prompt(
+            op_config, input_data, original_output
+        )
+
+        # Log the validator prompt
+        self.console.log("[bold]Validator Prompt:[/bold]")
+        self.console.log(validator_prompt)
+        self.console.log("\n")  # Add a newline for better readability
+
+        # Step 2: validate the output
+        validator_inputs = self._create_validation_inputs(
+            input_data, op_config["reduce_key"]
+        )
+        validation_results = self._validate_reduce_output(
+            op_config, validator_inputs, original_output, validator_prompt
+        )
+
+        return (
+            validation_results,
+            prompt_tokens,
+            model_input_context_length,
+            model,
+            validator_prompt,
+            original_output,
+        )
+
+    def should_optimize(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> Tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
+        (
+            validation_results,
+            prompt_tokens,
+            model_input_context_length,
+            model,
+            validator_prompt,
+            original_output,
+        ) = self.should_optimize_helper(op_config, input_data)
+        if prompt_tokens * 1.5 > model_input_context_length:
+            return (
+                "The reduce prompt is likely to exceed the token limit for model {model}.",
+                input_data,
+                original_output,
+            )
+
+        if validation_results.get("needs_improvement", False):
+            return (
+                "\n".join(
+                    [
+                        f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
+                        for result in validation_results["validation_results"]
+                    ]
+                ),
+                input_data,
+                original_output,
+            )
+        else:
+            return "", input_data, original_output
+
+    def optimize(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        level: int = 1,
+    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+        """
+        Optimize the reduce operation based on the given configuration and input data.
+
+        This method performs the following steps:
+        1. Run the original operation
+        2. Generate a validator prompt
+        3. Validate the output
+        4. If improvement is needed:
+           a. Evaluate if decomposition is beneficial
+           b. If decomposition is beneficial, recursively optimize each sub-operation
+           c. If not, proceed with single operation optimization
+        5. Run the optimized operation(s)
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+
+        Returns:
+            Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing the list of optimized configurations
+            and the list of outputs from the optimized operation(s), and the cost of the operation due to synthesizing any resolve operations.
+        """
+        (
+            validation_results,
+            prompt_tokens,
+            model_input_context_length,
+            model,
+            validator_prompt,
+            original_output,
+        ) = self.should_optimize_helper(op_config, input_data)
+
+        # add_map_op = False
+        if prompt_tokens * 2 > model_input_context_length:
+            # add_map_op = True
+            self.console.log(
+                f"[yellow]Warning: The reduce prompt exceeds the token limit for model {model}. "
+                f"Token count: {prompt_tokens}, Limit: {model_input_context_length}. "
+                f"Add a map operation to the pipeline.[/yellow]"
+            )
+
+        # # Also query an agent to look at a sample of the inputs and see if they think a map operation would be helpful
+        # preprocessing_steps = ""
+        # should_use_map, preprocessing_steps = self._should_use_map(
+        #     op_config, input_data
+        # )
+        # if should_use_map or add_map_op:
+        #     # Synthesize a map operation
+        #     map_prompt, map_output_schema = self._synthesize_map_operation(
+        #         op_config, preprocessing_steps, input_data
+        #     )
+        #     # Change the reduce operation prompt to use the map schema
+        #     new_reduce_prompt = self._change_reduce_prompt_to_use_map_schema(
+        #         op_config["prompt"], map_output_schema
+        #     )
+        #     op_config["prompt"] = new_reduce_prompt
+
+        #     # Return unoptimized map and reduce operations
+        #     return [map_prompt, op_config], input_data, 0.0
+
+        # Print the validation results
+        self.console.log("[bold]Validation Results on Initial Sample:[/bold]")
+        if validation_results["needs_improvement"] or self.config.get(
+            "optimizer_config", {}
+        ).get("force_decompose", False):
+            self.console.post_optimizer_rationale(
+                should_optimize=True,
+                rationale="\n".join(
+                    [
+                        f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
+                        for result in validation_results["validation_results"]
+                    ]
+                ),
+                validator_prompt=validator_prompt,
+            )
+            self.console.log(
+                "\n".join(
+                    [
+                        f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
+                        for result in validation_results["validation_results"]
+                    ]
+                )
+            )
+
+            # Step 3: Evaluate if decomposition is beneficial
+            decomposition_result = self._evaluate_decomposition(
+                op_config, input_data, level
+            )
+
+            if decomposition_result["should_decompose"]:
+                return self._optimize_decomposed_reduce(
+                    decomposition_result, op_config, input_data, level
+                )
+
+            return self._optimize_single_reduce(op_config, input_data, validator_prompt)
+        else:
+            self.console.log(f"No improvements identified; {validation_results}.")
+            self.console.post_optimizer_rationale(
+                should_optimize=False,
+                rationale="No improvements identified; no optimization recommended.",
+                validator_prompt=validator_prompt,
+            )
+            return [op_config], original_output, 0.0
+
+    def _should_use_map(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> Tuple[bool, str]:
+        """
+        Determine if a map operation should be used based on the input data.
+        """
+        # Sample a random input item
+        sample_input = random.choice(input_data)
+
+        # Format the prompt with the sample input
+        prompt_template = Template(op_config["prompt"])
+        formatted_prompt = prompt_template.render(
+            reduce_key=dict(
+                zip(op_config["reduce_key"], sample_input[op_config["reduce_key"]])
+            ),
+            inputs=[sample_input],
+        )
+
+        # Prepare the message for the LLM
+        messages = [{"role": "user", "content": formatted_prompt}]
+
+        # Truncate the messages to fit the model's context window
+        truncated_messages = truncate_messages(
+            messages, self.config.get("model", self.default_model)
+        )
+
+        # Query the LLM for preprocessing suggestions
+        preprocessing_prompt = (
+            "Based on the following reduce operation prompt, should we do any preprocessing on the input data? "
+            "Consider if we need to remove unnecessary context, or logically construct an output that will help in the task. "
+            "If preprocessing would be beneficial, explain why and suggest specific steps. If not, explain why preprocessing isn't necessary.\n\n"
+            f"Reduce operation prompt:\n{truncated_messages[0]['content']}"
+        )
+
+        preprocessing_response = self.llm_client.generate(
+            model=self.config.get("model", self.default_model),
+            messages=[{"role": "user", "content": preprocessing_prompt}],
+            response_format={
+                "type": "json_object",
+                "schema": {
+                    "type": "object",
+                    "properties": {
+                        "preprocessing_needed": {"type": "boolean"},
+                        "rationale": {"type": "string"},
+                        "suggested_steps": {"type": "string"},
+                    },
+                    "required": [
+                        "preprocessing_needed",
+                        "rationale",
+                        "suggested_steps",
+                    ],
+                },
+            },
+        )
+
+        preprocessing_result = preprocessing_response.choices[0].message.content
+
+        should_preprocess = preprocessing_result["preprocessing_needed"]
+        preprocessing_rationale = preprocessing_result["rationale"]
+
+        self.console.log("[bold]Map-Reduce Decomposition Analysis:[/bold]")
+        self.console.log(f"Should write a map operation: {should_preprocess}")
+        self.console.log(f"Rationale: {preprocessing_rationale}")
+
+        if should_preprocess:
+            self.console.log(
+                f"Suggested steps: {preprocessing_result['suggested_steps']}"
+            )
+
+        return should_preprocess, preprocessing_result["suggested_steps"]
+
+    def _optimize_single_reduce(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        validator_prompt: str,
+    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+        """
+        Optimize a single reduce operation.
+
+        This method performs the following steps:
+        1. Determine and configure value sampling
+        2. Determine if the reduce operation is associative
+        3. Create and evaluate multiple reduce plans
+        4. Run the best reduce plan
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            validator_prompt (str): The validator prompt for evaluating reduce plans.
+
+        Returns:
+            Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing a single-item list with the optimized configuration
+            and a single-item list with the output from the optimized operation, and the cost of the operation due to synthesizing any resolve operations.
+        """
+        # Step 1: Determine and configure value sampling (TODO: re-enable this when the agent is more reliable)
+        # value_sampling_config = self._determine_value_sampling(op_config, input_data)
+        # if value_sampling_config["enabled"]:
+        #     op_config["value_sampling"] = value_sampling_config
+        #     self.console.log("[bold]Value Sampling Configuration:[/bold]")
+        #     self.console.log(json.dumps(value_sampling_config, indent=2))
+
+        # Step 2: Determine if the reduce operation is associative
+        is_associative = self._is_associative(op_config, input_data)
+
+        # Step 3: Create and evaluate multiple reduce plans
+        self.console.post_optimizer_status(StageType.CANDIDATE_PLANS)
+        self.console.log("[bold magenta]Generating batched plans...[/bold magenta]")
+        reduce_plans = self._create_reduce_plans(op_config, input_data, is_associative)
+
+        # Create gleaning plans
+        self.console.log("[bold magenta]Generating gleaning plans...[/bold magenta]")
+        gleaning_plans = self._generate_gleaning_plans(reduce_plans, validator_prompt)
+
+        self.console.log("[bold magenta]Evaluating plans...[/bold magenta]")
+        self.console.post_optimizer_status(StageType.EVALUATION_RESULTS)
+        best_plan = self._evaluate_reduce_plans(
+            op_config, reduce_plans + gleaning_plans, input_data, validator_prompt
+        )
+
+        # Step 4: Run the best reduce plan
+        optimized_output = self._run_operation(best_plan, input_data)
+        self.console.post_optimizer_status(StageType.END)
+
+        return [best_plan], optimized_output, 0.0
+
+    def _generate_gleaning_plans(
+        self,
+        plans: List[Dict[str, Any]],
+        validation_prompt: str,
+    ) -> List[Dict[str, Any]]:
+        """
+        Generate plans that use gleaning for the given operation.
+
+        Gleaning involves iteratively refining the output of an operation
+        based on validation feedback. This method creates plans with different
+        numbers of gleaning rounds.
+
+        Args:
+            plans (List[Dict[str, Any]]): The list of plans to use for gleaning.
+            validation_prompt (str): The prompt used for validating the operation's output.
+
+        Returns:
+            Dict[str, List[Dict[str, Any]]]: A dictionary of gleaning plans, where each key
+            is a plan name and each value is a list containing a single operation configuration
+            with gleaning parameters.
+
+        """
+        # Generate an op with gleaning num_rounds and validation_prompt
+        gleaning_plans = []
+        gleaning_rounds = [1]
+        biggest_batch_size = max([plan["fold_batch_size"] for plan in plans])
+        for plan in plans:
+            if plan["fold_batch_size"] != biggest_batch_size:
+                continue
+            for gleaning_round in gleaning_rounds:
+                plan_copy = copy.deepcopy(plan)
+                plan_copy["gleaning"] = {
+                    "num_rounds": gleaning_round,
+                    "validation_prompt": validation_prompt,
+                }
+                plan_name = f"gleaning_{gleaning_round}_rounds_{plan['name']}"
+                plan_copy["name"] = plan_name
+                gleaning_plans.append(plan_copy)
+        return gleaning_plans
+
+    def _optimize_decomposed_reduce(
+        self,
+        decomposition_result: Dict[str, Any],
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        level: int,
+    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+        """
+        Optimize a decomposed reduce operation.
+
+        This method performs the following steps:
+        1. Group the input data by the sub-group key.
+        2. Optimize the first reduce operation.
+        3. Run the optimized first reduce operation on all groups.
+        4. Optimize the second reduce operation using the results of the first.
+        5. Run the optimized second reduce operation.
+
+        Args:
+            decomposition_result (Dict[str, Any]): The result of the decomposition evaluation.
+            op_config (Dict[str, Any]): The original reduce operation configuration.
+            input_data (List[Dict[str, Any]]): The input data for the reduce operation.
+            level (int): The current level of decomposition.
+        Returns:
+            Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing the list of optimized configurations
+            for both reduce operations and the final output of the second reduce operation, and the cost of the operation due to synthesizing any resolve operations.
+        """
+        sub_group_key = decomposition_result["sub_group_key"]
+        first_reduce_prompt = decomposition_result["first_reduce_prompt"]
+        second_reduce_prompt = decomposition_result["second_reduce_prompt"]
+        pipeline = []
+        all_cost = 0.0
+
+        first_reduce_config = op_config.copy()
+        first_reduce_config["prompt"] = first_reduce_prompt
+        if isinstance(op_config["reduce_key"], list):
+            first_reduce_config["reduce_key"] = [sub_group_key] + op_config[
+                "reduce_key"
+            ]
+        else:
+            first_reduce_config["reduce_key"] = [sub_group_key, op_config["reduce_key"]]
+        first_reduce_config["pass_through"] = True
+
+        if first_reduce_config.get("synthesize_resolve", True):
+            resolve_config = {
+                "type": "resolve",
+                "empty": True,
+                "embedding_model": "text-embedding-3-small",
+                "resolution_model": self.config.get("default_model", "gpt-4o-mini"),
+                "comparison_model": self.config.get("default_model", "gpt-4o-mini"),
+                "_intermediates": {
+                    "map_prompt": op_config.get("_intermediates", {}).get(
+                        "last_map_prompt"
+                    ),
+                    "reduce_key": first_reduce_config["reduce_key"],
+                },
+            }
+            optimized_resolve_config, resolve_cost = JoinOptimizer(
+                self.config,
+                resolve_config,
+                self.console,
+                self.llm_client,
+                self.max_threads,
+            ).optimize_resolve(input_data)
+            all_cost += resolve_cost
+
+            if not optimized_resolve_config.get("empty", False):
+                # Add this to the pipeline
+                pipeline += [optimized_resolve_config]
+
+                # Run the resolver
+                optimized_output = self._run_operation(
+                    optimized_resolve_config, input_data
+                )
+                input_data = optimized_output
+
+        first_optimized_configs, first_outputs, first_cost = self.optimize(
+            first_reduce_config, input_data, level + 1
+        )
+        pipeline += first_optimized_configs
+        all_cost += first_cost
+
+        # Optimize second reduce operation
+        second_reduce_config = op_config.copy()
+        second_reduce_config["prompt"] = second_reduce_prompt
+        second_reduce_config["pass_through"] = True
+
+        second_optimized_configs, second_outputs, second_cost = self.optimize(
+            second_reduce_config, first_outputs, level + 1
+        )
+
+        # Combine optimized configs and return with final output
+        pipeline += second_optimized_configs
+        all_cost += second_cost
+
+        return pipeline, second_outputs, all_cost
+
+    def _evaluate_decomposition(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        level: int = 1,
+    ) -> Dict[str, Any]:
+        """
+        Evaluate whether decomposing the reduce operation would be beneficial.
+
+        This method first determines if decomposition would be helpful, and if so,
+        it then determines the sub-group key and prompts for the decomposed operations.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            level (int): The current level of decomposition.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the decomposition decision and details.
+        """
+        should_decompose = self._should_decompose(op_config, input_data, level)
+
+        # Log the decomposition decision
+        if should_decompose["should_decompose"]:
+            self.console.log(
+                f"[bold green]Decomposition recommended:[/bold green] {should_decompose['explanation']}"
+            )
+        else:
+            self.console.log(
+                f"[bold yellow]Decomposition not recommended:[/bold yellow] {should_decompose['explanation']}"
+            )
+
+        # Return early if decomposition is not recommended
+        if not should_decompose["should_decompose"]:
+            return should_decompose
+
+        # Temporarily stop the status
+        if self.status:
+            self.status.stop()
+
+        # Ask user if they agree with the decomposition assessment
+        user_agrees = Confirm.ask(
+            f"Do you agree with the decomposition assessment? "
+            f"[bold]{'Recommended' if should_decompose['should_decompose'] else 'Not recommended'}[/bold]",
+            console=self.console,
+        )
+
+        # If user disagrees, invert the decomposition decision
+        if not user_agrees:
+            should_decompose["should_decompose"] = not should_decompose[
+                "should_decompose"
+            ]
+            should_decompose["explanation"] = (
+                "User disagreed with the initial assessment."
+            )
+
+        # Restart the status
+        if self.status:
+            self.status.start()
+
+        # Return if decomposition is not recommended
+        if not should_decompose["should_decompose"]:
+            return should_decompose
+
+        decomposition_details = self._get_decomposition_details(op_config, input_data)
+        result = {**should_decompose, **decomposition_details}
+        if decomposition_details["sub_group_key"] in op_config["reduce_key"]:
+            result["should_decompose"] = False
+            result[
+                "explanation"
+            ] += " However, the suggested sub-group key is already part of the current reduce key(s), so decomposition is not recommended."
+            result["sub_group_key"] = ""
+
+        return result
+
+    def _should_decompose(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        level: int = 1,
+    ) -> Dict[str, Any]:
+        """
+        Determine if decomposing the reduce operation would be beneficial.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            level (int): The current level of decomposition.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the decomposition decision and explanation.
+        """
+        # TODO: we have not enabled recursive decomposition yet
+        if level > 1 and not op_config.get("recursively_optimize", False):
+            return {
+                "should_decompose": False,
+                "explanation": "Recursive decomposition is not enabled.",
+            }
+
+        system_prompt = (
+            "You are an AI assistant tasked with optimizing data processing pipelines."
+        )
+
+        # Sample a subset of input data for analysis
+        sample_size = min(10, len(input_data))
+        sample_input = random.sample(input_data, sample_size)
+
+        # Get all keys from the input data
+        all_keys = set().union(*(item.keys() for item in sample_input))
+        reduce_key = op_config["reduce_key"]
+        reduce_keys = [reduce_key] if isinstance(reduce_key, str) else reduce_key
+        other_keys = [key for key in all_keys if key not in reduce_keys]
+
+        # See if there's an input schema and constrain the sample_input to that schema
+        input_schema = op_config.get("input", {}).get("schema", {})
+        if input_schema:
+            sample_input = [
+                {key: item[key] for key in input_schema} for item in sample_input
+            ]
+
+        # Create a sample of values for other keys
+        sample_values = {
+            key: list(set(str(item.get(key))[:50] for item in sample_input))[:5]
+            for key in other_keys
+        }
+
+        prompt = f"""Analyze the following reduce operation and determine if it should be decomposed into two reduce operations chained together:
+
+        Reduce Operation Prompt:
+        ```
+        {op_config['prompt']}
+        ```
+
+        Current Reduce Key(s): {reduce_keys}
+        Other Available Keys: {', '.join(other_keys)}
+
+        Sample values for other keys:
+        {json.dumps(sample_values, indent=2)}
+
+        Based on this information, determine if it would be beneficial to decompose this reduce operation into a sub-reduce operation followed by a final reduce operation. Consider ALL of the following:
+
+        1. Is there a natural hierarchy in the data (e.g., country -> state -> city) among the other available keys, with a key at a finer level of granularity than the current reduce key(s)?
+        2. Are the current reduce key(s) some form of ID, and are there many different types of inputs for that ID among the other available keys?
+        3. Does the prompt implicitly ask for sub-grouping based on the other available keys (e.g., "summarize policies by state, then by country")?
+        4. Would splitting the operation improve accuracy (i.e., make sure information isn't lost when reducing)?
+        5. Are all the keys of the potential hierarchy provided in the other available keys? If not, we should not decompose.
+        6. Importantly, do not suggest decomposition using any key that is already part of the current reduce key(s). We are looking for a new key from the other available keys to use for sub-grouping.
+        7. Do not suggest keys that don't contain meaningful information (e.g., id-related keys).
+
+        Provide your analysis in the following format:
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "should_decompose": {"type": "boolean"},
+                "explanation": {"type": "string"},
+            },
+            "required": ["should_decompose", "explanation"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        return json.loads(response.choices[0].message.content)
+
+    def _get_decomposition_details(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+    ) -> Dict[str, Any]:
+        """
+        Determine the sub-group key and prompts for decomposed reduce operations.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the sub-group key and prompts for decomposed operations.
+        """
+        system_prompt = (
+            "You are an AI assistant tasked with optimizing data processing pipelines."
+        )
+
+        # Sample a subset of input data for analysis
+        sample_size = min(10, len(input_data))
+        sample_input = random.sample(input_data, sample_size)
+
+        # Get all keys from the input data
+        all_keys = set().union(*(item.keys() for item in sample_input))
+        reduce_key = op_config["reduce_key"]
+        reduce_keys = [reduce_key] if isinstance(reduce_key, str) else reduce_key
+        other_keys = [key for key in all_keys if key not in reduce_keys]
+
+        prompt = f"""Given that we've decided to decompose the following reduce operation, suggest a two-step reduce process:
+
+        Reduce Operation Prompt:
+        ```
+        {op_config['prompt']}
+        ```
+
+        Reduce Key(s): {reduce_key}
+        Other Keys: {', '.join(other_keys)}
+
+        Provide the following:
+        1. A sub-group key to use for the first reduce operation
+        2. A prompt for the first reduce operation
+        3. A prompt for the second (final) reduce operation
+
+        For the reduce operation prompts, you should only minimally modify the original prompt. The prompts should be Jinja templates, and the only variables they can access are the `reduce_key` and `inputs` variables.
+
+        Provide your suggestions in the following format:
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "sub_group_key": {"type": "string"},
+                "first_reduce_prompt": {"type": "string"},
+                "second_reduce_prompt": {"type": "string"},
+            },
+            "required": [
+                "sub_group_key",
+                "first_reduce_prompt",
+                "second_reduce_prompt",
+            ],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        return json.loads(response.choices[0].message.content)
+
+    def _determine_value_sampling(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Determine whether value sampling should be enabled and configure its parameters.
+        """
+        system_prompt = (
+            "You are an AI assistant helping to optimize data processing pipelines."
+        )
+
+        # Sample a subset of input data for analysis
+        sample_size = min(100, len(input_data))
+        sample_input = random.sample(input_data, sample_size)
+
+        prompt = f"""
+        Analyze the following reduce operation and determine if value sampling should be enabled:
+
+        Reduce Operation Prompt:
+        {op_config['prompt']}
+
+        Sample Input Data (first 2 items):
+        {json.dumps(sample_input[:2], indent=2)}
+
+        Value sampling is appropriate for reduce operations that don't need to look at all the values for each key to produce a good result, such as generic summarization tasks.
+
+        Based on the reduce operation prompt and the sample input data, determine if value sampling should be enabled.
+        Answer with 'yes' if value sampling should be enabled or 'no' if it should not be enabled. Explain your reasoning briefly.
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "enable_sampling": {"type": "boolean"},
+                "explanation": {"type": "string"},
+            },
+            "required": ["enable_sampling", "explanation"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        result = json.loads(response.choices[0].message.content)
+
+        if not result["enable_sampling"]:
+            return {"enabled": False}
+
+        # Print the explanation for enabling value sampling
+        self.console.log(f"Value sampling enabled: {result['explanation']}")
+
+        # Determine sampling method
+        prompt = f"""
+        We are optimizing a reduce operation in a data processing pipeline. The reduce operation is defined by the following prompt:
+
+        Reduce Operation Prompt:
+        {op_config['prompt']}
+
+        Sample Input Data (first 2 items):
+        {json.dumps(sample_input[:2], indent=2)}
+
+        We have determined that value sampling should be enabled for this reduce operation. Value sampling is a technique used to process only a subset of the input data for each reduce key, rather than processing all items. This can significantly reduce processing time and costs for very large datasets, especially when the reduce operation doesn't require looking at every single item to produce a good result (e.g., summarization tasks).
+
+        Now we need to choose the most appropriate sampling method. The available methods are:
+
+        1. "random": Randomly select a subset of values.
+        Example: In a customer review analysis task, randomly selecting a subset of reviews to summarize the overall sentiment.
+
+        2. "cluster": Use K-means clustering to select representative samples.
+        Example: In a document categorization task, clustering documents based on their content and selecting representative documents from each cluster to determine the overall categories.
+
+        3. "sem_sim": Use semantic similarity to select the most relevant samples to a query text.
+        Example: In a news article summarization task, selecting articles that are semantically similar to a query like "Major economic events of {{reduce_key}}" to produce a focused summary.
+
+        Based on the reduce operation prompt, the nature of the task, and the sample input data, which sampling method would be most appropriate?
+
+        Provide your answer as either "random", "cluster", or "sem_sim", and explain your reasoning in detail. Consider the following in your explanation:
+        - The nature of the reduce task (e.g., summarization, aggregation, analysis)
+        - The structure and content of the input data
+        - The potential benefits and drawbacks of each sampling method for this specific task
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "method": {"type": "string", "enum": ["random", "cluster", "sem_sim"]},
+                "explanation": {"type": "string"},
+            },
+            "required": ["method", "explanation"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        result = json.loads(response.choices[0].message.content)
+        method = result["method"]
+
+        value_sampling_config = {
+            "enabled": True,
+            "method": method,
+            "sample_size": 100,  # Default sample size
+            "embedding_model": "text-embedding-3-small",
+        }
+
+        if method in ["cluster", "sem_sim"]:
+            # Determine embedding keys
+            prompt = f"""
+            For the {method} sampling method, we need to determine which keys from the input data should be used for generating embeddings.
+
+            Input data keys:
+            {', '.join(sample_input[0].keys())}
+
+            Sample Input Data:
+            {json.dumps(sample_input[0], indent=2)[:1000]}...
+
+            Based on the reduce operation prompt and the sample input data, which keys should be used for generating embeddings? Use keys that will create meaningful embeddings (i.e., not id-related keys).
+            Provide your answer as a list of key names that is a subset of the input data keys. You should pick only the 1-3 keys that are necessary for generating meaningful embeddings, that have relatively short values.
+            """
+
+            parameters = {
+                "type": "object",
+                "properties": {
+                    "embedding_keys": {"type": "array", "items": {"type": "string"}},
+                    "explanation": {"type": "string"},
+                },
+                "required": ["embedding_keys", "explanation"],
+            }
+
+            response = self.llm_client.generate(
+                [{"role": "user", "content": prompt}],
+                system_prompt,
+                parameters,
+            )
+            result = json.loads(response.choices[0].message.content)
+            # TODO: validate that these exist
+            embedding_keys = result["embedding_keys"]
+            for key in result["embedding_keys"]:
+                if key not in sample_input[0]:
+                    embedding_keys.remove(key)
+
+            if not embedding_keys:
+                # Select the reduce key
+                self.console.log(
+                    "No embedding keys found, selecting reduce key for embedding key"
+                )
+                embedding_keys = (
+                    op_config["reduce_key"]
+                    if isinstance(op_config["reduce_key"], list)
+                    else [op_config["reduce_key"]]
+                )
+
+            value_sampling_config["embedding_keys"] = embedding_keys
+
+        if method == "sem_sim":
+            # Determine query text
+            prompt = f"""
+            For the semantic similarity (sem_sim) sampling method, we need to determine the query text to compare against when selecting samples.
+
+            Reduce Operation Prompt:
+            {op_config['prompt']}
+
+            The query text should be a Jinja template with access to the `reduce_key` variable.
+            Based on the reduce operation prompt, what would be an appropriate query text for selecting relevant samples?
+            """
+
+            parameters = {
+                "type": "object",
+                "properties": {
+                    "query_text": {"type": "string"},
+                    "explanation": {"type": "string"},
+                },
+                "required": ["query_text", "explanation"],
+            }
+
+            response = self.llm_client.generate(
+                [{"role": "user", "content": prompt}],
+                system_prompt,
+                parameters,
+            )
+            result = json.loads(response.choices[0].message.content)
+            value_sampling_config["query_text"] = result["query_text"]
+
+        return value_sampling_config
+
+    def _is_associative(
+        self, op_config: Dict[str, Any], input_data: List[Dict[str, Any]]
+    ) -> bool:
+        """
+        Determine if the reduce operation is associative.
+
+        This method analyzes the reduce operation configuration and a sample of the input data
+        to determine if the operation is associative (i.e., the order of combining elements
+        doesn't affect the final result).
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+
+        Returns:
+            bool: True if the operation is determined to be associative, False otherwise.
+        """
+        system_prompt = (
+            "You are an AI assistant helping to optimize data processing pipelines."
+        )
+
+        # Sample a subset of input data for analysis
+        sample_size = min(5, len(input_data))
+        sample_input = random.sample(input_data, sample_size)
+
+        prompt = f"""
+        Analyze the following reduce operation and determine if it is associative:
+
+        Reduce Operation Prompt:
+        {op_config['prompt']}
+
+        Sample Input Data:
+        {json.dumps(sample_input, indent=2)[:1000]}...
+
+        Based on the reduce operation prompt, determine whether the order in which we process data matters.
+        Answer with 'yes' if order matters or 'no' if order doesn't matter.
+        Explain your reasoning briefly.
+
+        For example:
+        - Merging extracted key-value pairs from documents does not require order: combining {{"name": "John", "age": 30}} with {{"city": "New York", "job": "Engineer"}} yields the same result regardless of order
+        - Generating a timeline of events requires order: the order of events matters for maintaining chronological accuracy.
+
+        Consider these examples when determining whether the order in which we process data matters. You might also have to consider the specific data.
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "order_matters": {"type": "boolean"},
+                "explanation": {"type": "string"},
+            },
+            "required": ["order_matters", "explanation"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        result = json.loads(response.choices[0].message.content)
+        result["is_associative"] = not result["order_matters"]
+
+        self.console.log(
+            f"[yellow]Reduce operation {'is associative' if result['is_associative'] else 'is not associative'}.[/yellow] Analysis: {result['explanation']}"
+        )
+        return result["is_associative"]
+
+    def _generate_validator_prompt(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        original_output: List[Dict[str, Any]],
+    ) -> str:
+        """
+        Generate a custom validator prompt for assessing the quality of the reduce operation output.
+
+        This method creates a prompt that will be used to validate the output of the reduce operation.
+        It includes specific questions about the quality and completeness of the output.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            original_output (List[Dict[str, Any]]): Original output of the reduce operation.
+
+        Returns:
+            str: A custom validator prompt as a string.
+        """
+        system_prompt = "You are an AI assistant tasked with creating custom validation prompts for reduce operations in data processing pipelines."
+
+        sample_input = random.choice(input_data)
+        input_keys = op_config.get("input", {}).get("schema", {})
+        if input_keys:
+            sample_input = {k: sample_input[k] for k in input_keys}
+
+        reduce_key = op_config.get("reduce_key")
+        if reduce_key and original_output:
+            if isinstance(reduce_key, list):
+                key = next(
+                    (
+                        tuple(item[k] for k in reduce_key)
+                        for item in original_output
+                        if all(k in item for k in reduce_key)
+                    ),
+                    tuple(None for _ in reduce_key),
+                )
+                sample_output = next(
+                    (
+                        item
+                        for item in original_output
+                        if all(item.get(k) == v for k, v in zip(reduce_key, key))
+                    ),
+                    {},
+                )
+            else:
+                key = next(
+                    (
+                        item[reduce_key]
+                        for item in original_output
+                        if reduce_key in item
+                    ),
+                    None,
+                )
+                sample_output = next(
+                    (item for item in original_output if item.get(reduce_key) == key),
+                    {},
+                )
+        else:
+            sample_output = original_output[0] if original_output else {}
+
+        output_keys = op_config.get("output", {}).get("schema", {})
+        sample_output = {k: sample_output[k] for k in output_keys}
+
+        prompt = f"""
+        Analyze the following reduce operation and its input/output:
+
+        Reduce Operation Prompt:
+        {op_config["prompt"]}
+
+        Sample Input (just one item):
+        {json.dumps(sample_input, indent=2)}
+
+        Sample Output:
+        {json.dumps(sample_output, indent=2)}
+
+        Create a custom validator prompt that will assess how well the reduce operation performed its intended task. The prompt should ask specific 2-3 questions about the quality of the output, such as:
+        1. Does the output accurately reflect the aggregation method specified in the task? For example, if finding anomalies, are the identified anomalies actually anomalies?
+        2. Are there any missing fields, unexpected null values, or data type mismatches in the output compared to the expected schema?
+        3. Does the output maintain the key information from the input while appropriately condensing or summarizing it? For instance, in a text summarization task, are the main points preserved?
+        4. How well does the output adhere to any specific formatting requirements mentioned in the original prompt, such as character limits for summaries or specific data types for aggregated values?
+
+        Note that the output may reflect more than just the input provided, since we only provide a one-item sample input. Provide your response as a single string containing the custom validator prompt. The prompt should be tailored to the task and avoid generic criteria. The prompt should not reference a specific value in the sample input, but rather a general property.
+
+        Your prompt should not have any placeholders like {{ reduce_key }} or {{ input_key }}. It should just be a string.
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {"validator_prompt": {"type": "string"}},
+            "required": ["validator_prompt"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        return json.loads(response.choices[0].message.content)["validator_prompt"]
+
+    def _validate_reduce_output(
+        self,
+        op_config: Dict[str, Any],
+        validation_inputs: Dict[Any, List[Dict[str, Any]]],
+        output_data: List[Dict[str, Any]],
+        validator_prompt: str,
+    ) -> Dict[str, Any]:
+        """
+        Validate the output of the reduce operation using the generated validator prompt.
+
+        This method assesses the quality of the reduce operation output by applying the validator prompt
+        to multiple samples of the input and output data.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            validation_inputs (Dict[Any, List[Dict[str, Any]]]): Validation inputs for the reduce operation.
+            output_data (List[Dict[str, Any]]): Output data from the reduce operation.
+            validator_prompt (str): The validator prompt generated earlier.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing validation results and a flag indicating if improvement is needed.
+        """
+        system_prompt = "You are an AI assistant tasked with validating the output of reduce operations in data processing pipelines."
+
+        validation_results = []
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = []
+            for reduce_key, inputs in validation_inputs.items():
+                if (
+                    op_config["reduce_key"] == ["_all"]
+                    or op_config["reduce_key"] == "_all"
+                ):
+                    sample_output = output_data[0]
+                elif isinstance(op_config["reduce_key"], list):
+                    sample_output = next(
+                        (
+                            item
+                            for item in output_data
+                            if all(
+                                item[key] == reduce_key[i]
+                                for i, key in enumerate(op_config["reduce_key"])
+                            )
+                        ),
+                        None,
+                    )
+                else:
+                    sample_output = next(
+                        (
+                            item
+                            for item in output_data
+                            if item[op_config["reduce_key"]] == reduce_key
+                        ),
+                        None,
+                    )
+
+                if sample_output is None:
+                    self.console.log(
+                        f"Warning: No output found for reduce key {reduce_key}"
+                    )
+                    continue
+
+                input_str = json.dumps(inputs, indent=2)
+                # truncate input_str to 40,000 words
+                input_str = input_str.split()[:40000]
+                input_str = " ".join(input_str) + "..."
+
+                prompt = f"""{validator_prompt}
+
+                Reduce Operation Task:
+                {op_config["prompt"]}
+
+                Input Data Samples:
+                {input_str}
+
+                Output Data Sample:
+                {json.dumps(sample_output, indent=2)}
+
+                Based on the validator prompt and the input/output samples, assess the quality (e.g., correctness, completeness) of the reduce operation output.
+                Provide your assessment in the following format:
+                """
+
+                parameters = {
+                    "type": "object",
+                    "properties": {
+                        "is_correct": {"type": "boolean"},
+                        "issues": {"type": "array", "items": {"type": "string"}},
+                        "suggestions": {"type": "array", "items": {"type": "string"}},
+                    },
+                    "required": ["is_correct", "issues", "suggestions"],
+                }
+
+                futures.append(
+                    executor.submit(
+                        self.llm_client.generate,
+                        [{"role": "user", "content": prompt}],
+                        system_prompt,
+                        parameters,
+                    )
+                )
+
+            for future, (reduce_key, inputs) in zip(futures, validation_inputs.items()):
+                response = future.result()
+                result = json.loads(response.choices[0].message.content)
+                validation_results.append(result)
+
+        # Determine if optimization is needed based on validation results
+        invalid_count = sum(
+            1 for result in validation_results if not result["is_correct"]
+        )
+        needs_improvement = invalid_count > 1 or (
+            invalid_count == 1 and len(validation_results) == 1
+        )
+
+        return {
+            "needs_improvement": needs_improvement,
+            "validation_results": validation_results,
+        }
+
+    def _create_validation_inputs(
+        self, input_data: List[Dict[str, Any]], reduce_key: Union[str, List[str]]
+    ) -> Dict[Any, List[Dict[str, Any]]]:
+        # Group input data by reduce_key
+        grouped_data = {}
+        if reduce_key == ["_all"]:
+            # Put all data in one group under a single key
+            grouped_data[("_all",)] = input_data
+        else:
+            # Group by reduce key(s) as before
+            for item in input_data:
+                if isinstance(reduce_key, list):
+                    key = tuple(item[k] for k in reduce_key)
+                else:
+                    key = item[reduce_key]
+                if key not in grouped_data:
+                    grouped_data[key] = []
+                grouped_data[key].append(item)
+
+        # Select a fixed number of reduce keys
+        selected_keys = random.sample(
+            list(grouped_data.keys()),
+            min(self.num_samples_in_validation, len(grouped_data)),
+        )
+
+        # Create a new dict with only the selected keys
+        validation_inputs = {key: grouped_data[key] for key in selected_keys}
+
+        return validation_inputs
+
+    def _create_reduce_plans(
+        self,
+        op_config: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        is_associative: bool,
+    ) -> List[Dict[str, Any]]:
+        """
+        Create multiple reduce plans based on the input data and operation configuration.
+
+        This method generates various reduce plans by varying batch sizes and fold prompts.
+        It takes into account the LLM's context window size to determine appropriate batch sizes.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+            is_associative (bool): Flag indicating whether the reduce operation is associative.
+
+        Returns:
+            List[Dict[str, Any]]: A list of reduce plans, each with different batch sizes and fold prompts.
+        """
+        model = op_config.get("model", "gpt-4o-mini")
+        model_input_context_length = model_cost.get(model, {}).get(
+            "max_input_tokens", 8192
+        )
+
+        # Estimate tokens for prompt, input, and output
+        prompt_tokens = count_tokens(op_config["prompt"], model)
+        sample_input = input_data[:100]
+        sample_output = self._run_operation(op_config, input_data[:100])
+
+        prompt_vars = extract_jinja_variables(op_config["prompt"])
+        prompt_vars = [var.split(".")[-1] for var in prompt_vars]
+        avg_input_tokens = mean(
+            [
+                count_tokens(
+                    json.dumps({k: item[k] for k in prompt_vars if k in item}), model
+                )
+                for item in sample_input
+            ]
+        )
+        avg_output_tokens = mean(
+            [
+                count_tokens(
+                    json.dumps({k: item[k] for k in prompt_vars if k in item}), model
+                )
+                for item in sample_output
+            ]
+        )
+
+        # Calculate max batch size that fits in context window
+        max_batch_size = (
+            model_input_context_length - prompt_tokens - avg_output_tokens
+        ) // avg_input_tokens
+
+        # Generate 6 candidate batch sizes
+        batch_sizes = [
+            max(1, int(max_batch_size * ratio))
+            for ratio in [0.1, 0.2, 0.4, 0.6, 0.75, 0.9]
+        ]
+        # Log the generated batch sizes
+        self.console.log("[cyan]Generating plans for batch sizes:[/cyan]")
+        for size in batch_sizes:
+            self.console.log(f"  - {size}")
+        batch_sizes = sorted(set(batch_sizes))  # Remove duplicates and sort
+
+        plans = []
+
+        # Generate multiple fold prompts
+        max_retries = 5
+        retry_count = 0
+        fold_prompts = []
+
+        while retry_count < max_retries and not fold_prompts:
+            try:
+                fold_prompts = self._synthesize_fold_prompts(
+                    op_config,
+                    sample_input,
+                    sample_output,
+                    num_prompts=self.num_fold_prompts,
+                )
+                fold_prompts = list(set(fold_prompts))
+                if not fold_prompts:
+                    raise ValueError("No fold prompts generated")
+            except Exception as e:
+                retry_count += 1
+                if retry_count == max_retries:
+                    raise RuntimeError(
+                        f"Failed to generate fold prompts after {max_retries} attempts: {str(e)}"
+                    )
+                self.console.log(
+                    f"Retry {retry_count}/{max_retries}: Failed to generate fold prompts. Retrying..."
+                )
+
+        for batch_size in batch_sizes:
+            for fold_idx, fold_prompt in enumerate(fold_prompts):
+                plan = op_config.copy()
+                plan["fold_prompt"] = fold_prompt
+                plan["fold_batch_size"] = batch_size
+                plan["associative"] = is_associative
+                plan["name"] = f"{op_config['name']}_bs_{batch_size}_fp_{fold_idx}"
+                plans.append(plan)
+
+        return plans
+
+    def _calculate_compression_ratio(
+        self,
+        op_config: Dict[str, Any],
+        sample_input: List[Dict[str, Any]],
+        sample_output: List[Dict[str, Any]],
+    ) -> float:
+        """
+        Calculate the compression ratio of the reduce operation.
+
+        This method compares the size of the input data to the size of the output data
+        to determine how much the data is being compressed by the reduce operation.
+
+        Args:
+            op_config (Dict[str, Any]): Configuration for the reduce operation.
+            sample_input (List[Dict[str, Any]]): Sample input data.
+            sample_output (List[Dict[str, Any]]): Sample output data.
+
+        Returns:
+            float: The calculated compression ratio.
+        """
+        reduce_key = op_config["reduce_key"]
+        input_schema = op_config.get("input", {}).get("schema", {})
+        output_schema = op_config["output"]["schema"]
+        model = op_config.get("model", "gpt-4o-mini")
+
+        compression_ratios = {}
+
+        # Handle both single key and list of keys
+        if isinstance(reduce_key, list):
+            distinct_keys = set(
+                tuple(item[k] for k in reduce_key) for item in sample_input
+            )
+        else:
+            distinct_keys = set(item[reduce_key] for item in sample_input)
+
+        for key in distinct_keys:
+            if isinstance(reduce_key, list):
+                key_input = [
+                    item
+                    for item in sample_input
+                    if tuple(item[k] for k in reduce_key) == key
+                ]
+                key_output = [
+                    item
+                    for item in sample_output
+                    if tuple(item[k] for k in reduce_key) == key
+                ]
+            else:
+                key_input = [item for item in sample_input if item[reduce_key] == key]
+                key_output = [item for item in sample_output if item[reduce_key] == key]
+
+            if input_schema:
+                key_input_tokens = sum(
+                    count_tokens(
+                        json.dumps({k: item[k] for k in input_schema if k in item}),
+                        model,
+                    )
+                    for item in key_input
+                )
+            else:
+                key_input_tokens = sum(
+                    count_tokens(json.dumps(item), model) for item in key_input
+                )
+
+            key_output_tokens = sum(
+                count_tokens(
+                    json.dumps({k: item[k] for k in output_schema if k in item}), model
+                )
+                for item in key_output
+            )
+
+            compression_ratios[key] = (
+                key_output_tokens / key_input_tokens if key_input_tokens > 0 else 1
+            )
+
+        if not compression_ratios:
+            return 1
+
+        # Calculate importance weights based on the number of items for each key
+        total_items = len(sample_input)
+        if isinstance(reduce_key, list):
+            importance_weights = {
+                key: len(
+                    [
+                        item
+                        for item in sample_input
+                        if tuple(item[k] for k in reduce_key) == key
+                    ]
+                )
+                / total_items
+                for key in compression_ratios
+            }
+        else:
+            importance_weights = {
+                key: len([item for item in sample_input if item[reduce_key] == key])
+                / total_items
+                for key in compression_ratios
+            }
+
+        # Calculate weighted average of compression ratios
+        weighted_sum = sum(
+            compression_ratios[key] * importance_weights[key]
+            for key in compression_ratios
+        )
+        return weighted_sum
+
+    def _synthesize_fold_prompts(
+        self,
+        op_config: Dict[str, Any],
+        sample_input: List[Dict[str, Any]],
+        sample_output: List[Dict[str, Any]],
+        num_prompts: int = 2,
+    ) -> List[str]:
+        """
+        Synthesize fold prompts for the reduce operation. We generate multiple
+        fold prompts in case one is bad.
+
+        A fold operation is a higher-order function that iterates through a data structure,
+        accumulating the results of applying a given combining operation to its elements.
+        In the context of reduce operations, folding allows processing of data in batches,
+        which can significantly improve performance for large datasets.
+
+        This method generates multiple fold prompts that can be used to optimize the reduce operation
+        by allowing it to run on batches of inputs. It uses the language model to create prompts
+        that are variations of the original reduce prompt, adapted for folding operations.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the reduce operation.
+            sample_input (List[Dict[str, Any]]): A sample of the input data.
+            sample_output (List[Dict[str, Any]]): A sample of the output data.
+            num_prompts (int, optional): The number of fold prompts to generate. Defaults to 2.
+
+        Returns:
+            List[str]: A list of synthesized fold prompts.
+
+        The method performs the following steps:
+        1. Sets up the system prompt and parameters for the language model.
+        2. Defines a function to get random examples from the sample data.
+        3. Creates a prompt template for generating fold prompts.
+        4. Uses multi-threading to generate multiple fold prompts in parallel.
+        5. Returns the list of generated fold prompts.
+        """
+        system_prompt = "You are an AI assistant tasked with creating a fold prompt for reduce operations in data processing pipelines."
+        original_prompt = op_config["prompt"]
+
+        input_schema = op_config.get("input", {}).get("schema", {})
+        output_schema = op_config["output"]["schema"]
+
+        def get_random_examples():
+            reduce_key = op_config["reduce_key"]
+            reduce_key = (
+                list(reduce_key) if not isinstance(reduce_key, list) else reduce_key
+            )
+
+            if reduce_key == ["_all"]:
+                # For _all case, just pick random input and output examples
+                input_example = random.choice(sample_input)
+                output_example = random.choice(sample_output)
+            elif isinstance(reduce_key, list):
+                random_key = tuple(
+                    random.choice(
+                        [
+                            tuple(item[k] for k in reduce_key if k in item)
+                            for item in sample_input
+                            if all(k in item for k in reduce_key)
+                        ]
+                    )
+                )
+                input_example = random.choice(
+                    [
+                        item
+                        for item in sample_input
+                        if all(item.get(k) == v for k, v in zip(reduce_key, random_key))
+                    ]
+                )
+                output_example = random.choice(
+                    [
+                        item
+                        for item in sample_output
+                        if all(item.get(k) == v for k, v in zip(reduce_key, random_key))
+                    ]
+                )
+
+            if input_schema:
+                input_example = {
+                    k: input_example[k] for k in input_schema if k in input_example
+                }
+            output_example = {
+                k: output_example[k] for k in output_schema if k in output_example
+            }
+            return input_example, output_example
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "fold_prompt": {
+                    "type": "string",
+                }
+            },
+            "required": ["fold_prompt"],
+        }
+
+        def generate_single_prompt():
+            input_example, output_example = get_random_examples()
+            prompt = f"""
+            Original Reduce Operation Prompt:
+            {original_prompt}
+
+            Sample Input:
+            {json.dumps(input_example, indent=2)}
+
+            Sample Output:
+            {json.dumps(output_example, indent=2)}
+
+            Create a fold prompt for the reduce operation to run on batches of inputs. The fold prompt should:
+            1. Minimally modify the original reduce prompt
+            2. Describe how to combine the new values with the current reduced value
+            3. Be designed to work iteratively, allowing for multiple fold operations. The first iteration will use the original prompt, and all successive iterations will use the fold prompt.
+
+            The fold prompt should be a Jinja2 template with the following variables available:
+            - {{{{ output }}}}: The current reduced value (a dictionary with the current output schema)
+            - {{{{ inputs }}}}: A list of new values to be folded in
+            - {{{{ reduce_key }}}}: The key used for grouping in the reduce operation
+
+            Provide the fold prompt as a string.
+            """
+            response = self.llm_client.generate(
+                [{"role": "user", "content": prompt}],
+                system_prompt,
+                parameters,
+            )
+            fold_prompt = json.loads(response.choices[0].message.content)["fold_prompt"]
+
+            # Run the operation with the fold prompt
+            # Create a temporary plan with the fold prompt
+            temp_plan = op_config.copy()
+            temp_plan["fold_prompt"] = fold_prompt
+            temp_plan["fold_batch_size"] = min(
+                len(sample_input), 2
+            )  # Use a small batch size for testing
+
+            # Run the operation with the fold prompt
+            try:
+                self._run_operation(
+                    temp_plan, sample_input[: temp_plan["fold_batch_size"]]
+                )
+
+                return fold_prompt
+            except Exception as e:
+                self.console.log(
+                    f"[red]Error in agent-generated fold prompt: {e}[/red]"
+                )
+
+                # Create a default fold prompt that instructs folding new data into existing output
+                fold_prompt = f"""Analyze this batch of data using the following instructions:
+
+{original_prompt}
+
+However, instead of starting fresh, fold your analysis into the existing output that has already been generated. The existing output is provided in the 'output' variable below:
+
+{{{{ output }}}}
+
+Remember, you must fold the new data into the existing output, do not start fresh."""
+                return fold_prompt
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            fold_prompts = list(
+                executor.map(lambda _: generate_single_prompt(), range(num_prompts))
+            )
+
+        return fold_prompts
+
+    def _evaluate_reduce_plans(
+        self,
+        op_config: Dict[str, Any],
+        plans: List[Dict[str, Any]],
+        input_data: List[Dict[str, Any]],
+        validator_prompt: str,
+    ) -> Dict[str, Any]:
+        """
+        Evaluate multiple reduce plans and select the best one.
+
+        This method takes a list of reduce plans, evaluates each one using the input data
+        and a validator prompt, and selects the best plan based on the evaluation scores.
+        It also attempts to create and evaluate a merged plan that enhances the runtime performance
+        of the best plan.
+
+        A merged plan is an optimization technique applied to the best-performing plan
+        that uses the fold operation. It allows the best plan to run even faster by
+        executing parallel folds and then merging the results of these individual folds
+        together. We default to a merge batch size of 2, but one can increase this.
+
+        Args:
+            op_config (Dict[str, Any]): The configuration of the reduce operation.
+            plans (List[Dict[str, Any]]): A list of reduce plans to evaluate.
+            input_data (List[Dict[str, Any]]): The input data to use for evaluation.
+            validator_prompt (str): The prompt to use for validating the output of each plan.
+
+        Returns:
+            Dict[str, Any]: The best reduce plan, either the top-performing original plan
+                            or a merged plan if it performs well enough.
+
+        The method performs the following steps:
+        1. Evaluates each plan using multi-threading.
+        2. Sorts the plans based on their evaluation scores.
+        3. Selects the best plan and attempts to create a merged plan.
+        4. Evaluates the merged plan and compares it to the best original plan.
+        5. Returns either the merged plan or the best original plan based on their scores.
+        """
+        self.console.log("\n[bold]Evaluating Reduce Plans:[/bold]")
+        for i, plan in enumerate(plans):
+            self.console.log(f"Plan {i+1} (batch size: {plan['fold_batch_size']})")
+
+        plan_scores = []
+        plan_outputs = {}
+
+        # Create a fixed random sample for evaluation
+        sample_size = min(100, len(input_data))
+        evaluation_sample = random.sample(input_data, sample_size)
+
+        # Create a fixed set of validation samples
+        validation_inputs = self._create_validation_inputs(
+            evaluation_sample, plan["reduce_key"]
+        )
+
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(
+                    self._evaluate_single_plan,
+                    plan,
+                    evaluation_sample,
+                    validator_prompt,
+                    validation_inputs,
+                )
+                for plan in plans
+            ]
+            for future in as_completed(futures):
+                plan, score, output = future.result()
+                plan_scores.append((plan, score))
+                plan_outputs[id(plan)] = output
+
+        # Sort plans by score in descending order, then by fold_batch_size in descending order
+        sorted_plans = sorted(
+            plan_scores, key=lambda x: (x[1], x[0]["fold_batch_size"]), reverse=True
+        )
+
+        self.console.log("\n[bold]Reduce Plan Scores:[/bold]")
+        for i, (plan, score) in enumerate(sorted_plans):
+            self.console.log(
+                f"Plan {i+1} (batch size: {plan['fold_batch_size']}): {score:.2f}"
+            )
+
+        best_plan, best_score = sorted_plans[0]
+        self.console.log(
+            f"\n[green]Selected best plan with score: {best_score:.2f} and batch size: {best_plan['fold_batch_size']}[/green]"
+        )
+
+        if op_config.get("synthesize_merge", False):
+            # Create a new plan with merge prompt and updated parameters
+            merged_plan = best_plan.copy()
+
+            # Synthesize merge prompt if it doesn't exist
+            if "merge_prompt" not in merged_plan:
+                merged_plan["merge_prompt"] = self._synthesize_merge_prompt(
+                    merged_plan, plan_outputs[id(best_plan)]
+                )
+                # Print the synthesized merge prompt
+                self.console.log("\n[bold]Synthesized Merge Prompt:[/bold]")
+                self.console.log(merged_plan["merge_prompt"])
+
+            # Set merge_batch_size to 2 and num_parallel_folds to 5
+            merged_plan["merge_batch_size"] = 2
+
+            # Evaluate the merged plan
+            _, merged_plan_score, _, operation_instance = self._evaluate_single_plan(
+                merged_plan,
+                evaluation_sample,
+                validator_prompt,
+                validation_inputs,
+                return_instance=True,
+            )
+
+            # Get the merge and fold times from the operation instance
+            merge_times = operation_instance.merge_times
+            fold_times = operation_instance.fold_times
+            merge_avg_time = mean(merge_times) if merge_times else None
+            fold_avg_time = mean(fold_times) if fold_times else None
+
+            self.console.log("\n[bold]Scores:[/bold]")
+            self.console.log(f"Original plan: {best_score:.2f}")
+            self.console.log(f"Merged plan: {merged_plan_score:.2f}")
+
+            # Compare scores and decide which plan to use
+            if merged_plan_score >= best_score * 0.75:
+                self.console.log(
+                    f"\n[green]Using merged plan with score: {merged_plan_score:.2f}[/green]"
+                )
+                if merge_avg_time and fold_avg_time:
+                    merged_plan["merge_time"] = merge_avg_time
+                    merged_plan["fold_time"] = fold_avg_time
+                return merged_plan
+            else:
+                self.console.log(
+                    f"\n[yellow]Merged plan quality too low. Using original plan with score: {best_score:.2f}[/yellow]"
+                )
+                return best_plan
+        else:
+            return best_plan
+
+    def _evaluate_single_plan(
+        self,
+        plan: Dict[str, Any],
+        input_data: List[Dict[str, Any]],
+        validator_prompt: str,
+        validation_inputs: List[Dict[str, Any]],
+        return_instance: bool = False,
+    ) -> Union[
+        Tuple[Dict[str, Any], float, List[Dict[str, Any]]],
+        Tuple[Dict[str, Any], float, List[Dict[str, Any]], BaseOperation],
+    ]:
+        """
+        Evaluate a single reduce plan using the provided input data and validator prompt.
+
+        This method runs the reduce operation with the given plan, validates the output,
+        and calculates a score based on the validation results. The scoring works as follows:
+        1. It counts the number of valid results from the validation.
+        2. The score is calculated as the ratio of valid results to the total number of validation results.
+        3. This produces a score between 0 and 1, where 1 indicates all results were valid, and 0 indicates none were valid.
+
+        TODO: We should come up with a better scoring method here, maybe pairwise comparisons.
+
+        Args:
+            plan (Dict[str, Any]): The reduce plan to evaluate.
+            input_data (List[Dict[str, Any]]): The input data to use for evaluation.
+            validator_prompt (str): The prompt to use for validating the output.
+            return_instance (bool, optional): Whether to return the operation instance. Defaults to False.
+
+        Returns:
+            Union[
+                Tuple[Dict[str, Any], float, List[Dict[str, Any]]],
+                Tuple[Dict[str, Any], float, List[Dict[str, Any]], BaseOperation],
+            ]: A tuple containing the plan, its score, the output data, and optionally the operation instance.
+
+        The method performs the following steps:
+        1. Runs the reduce operation with the given plan on the input data.
+        2. Validates the output using the validator prompt.
+        3. Calculates a score based on the validation results.
+        4. Returns the plan, score, output data, and optionally the operation instance.
+        """
+        output = self._run_operation(plan, input_data, return_instance)
+        if return_instance:
+            output, operation_instance = output
+
+        validation_result = self._validate_reduce_output(
+            plan, validation_inputs, output, validator_prompt
+        )
+
+        # Calculate a score based on validation results
+        valid_count = sum(
+            1
+            for result in validation_result["validation_results"]
+            if result["is_correct"]
+        )
+        score = valid_count / len(validation_result["validation_results"])
+
+        if return_instance:
+            return plan, score, output, operation_instance
+        else:
+            return plan, score, output
+
+    def _synthesize_merge_prompt(
+        self, plan: Dict[str, Any], sample_outputs: List[Dict[str, Any]]
+    ) -> str:
+        """
+        Synthesize a merge prompt for combining multiple folded outputs in a reduce operation.
+
+        This method generates a merge prompt that can be used to combine the results of multiple
+        parallel fold operations into a single output. It uses the language model to create a prompt
+        that is consistent with the original reduce and fold prompts while addressing the specific
+        requirements of merging multiple outputs.
+
+        Args:
+            plan (Dict[str, Any]): The reduce plan containing the original prompt and fold prompt.
+            sample_outputs (List[Dict[str, Any]]): Sample outputs from the fold operation to use as examples.
+
+        Returns:
+            str: The synthesized merge prompt as a string.
+
+        The method performs the following steps:
+        1. Sets up the system prompt for the language model.
+        2. Prepares a random sample output to use as an example.
+        3. Creates a detailed prompt for the language model, including the original reduce prompt,
+           fold prompt, sample output, and instructions for creating the merge prompt.
+        4. Uses the language model to generate the merge prompt.
+        5. Returns the generated merge prompt.
+        """
+        system_prompt = "You are an AI assistant tasked with creating a merge prompt for reduce operations in data processing pipelines. The pipeline has a reduce operation, and incrementally folds inputs into a single output. We want to optimize the pipeline for speed by running multiple folds on different inputs in parallel, and then merging the fold outputs into a single output."
+
+        output_schema = plan["output"]["schema"]
+        random_output = random.choice(sample_outputs)
+        random_output = {
+            k: random_output[k] for k in output_schema if k in random_output
+        }
+
+        prompt = f"""Reduce Operation Prompt (runs on the first batch of inputs):
+        {plan["prompt"]}
+
+        Fold Prompt (runs on the second and subsequent batches of inputs):
+        {plan["fold_prompt"]}
+
+        Sample output of the fold operation (an input to the merge operation):
+        {json.dumps(random_output, indent=2)}
+
+        Create a merge prompt for the reduce operation to combine 2+ folded outputs. The merge prompt should:
+        1. Give context on the task & fold operations, describing that the prompt will be used to combine multiple outputs from the fold operation (as if the original prompt was run on all inputs at once)
+        2. Describe how to combine multiple folded outputs into a single output
+        3. Minimally deviate from the reduce and fold prompts
+
+        The merge prompt should be a Jinja2 template with the following variables available:
+        - {{ outputs }}: A list of reduced outputs to be merged (each following the output schema). You can access the first output with {{ outputs[0] }} and the second with {{ outputs[1] }}
+
+        Output Schema:
+        {json.dumps(output_schema, indent=2)}
+
+        Provide the merge prompt as a string.
+        """
+
+        parameters = {
+            "type": "object",
+            "properties": {
+                "merge_prompt": {
+                    "type": "string",
+                }
+            },
+            "required": ["merge_prompt"],
+        }
+
+        response = self.llm_client.generate(
+            [{"role": "user", "content": prompt}],
+            system_prompt,
+            parameters,
+        )
+        return json.loads(response.choices[0].message.content)["merge_prompt"]
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ __init__(runner, run_operation, num_fold_prompts=1, num_samples_in_validation=10) + +

+ + +
+ +

Initialize the ReduceOptimizer.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ config + + Dict[str, Any] + +
+

Configuration dictionary for the optimizer.

+
+
+ required +
+ console + + Console + +
+

Rich console object for pretty printing.

+
+
+ required +
+ llm_client + + LLMClient + +
+

Client for interacting with a language model.

+
+
+ required +
+ max_threads + + int + +
+

Maximum number of threads to use for parallel processing.

+
+
+ required +
+ run_operation + + Callable + +
+

Function to run an operation.

+
+
+ required +
+ num_fold_prompts + + int + +
+

Number of fold prompts to generate. Defaults to 1.

+
+
+ 1 +
+ num_samples_in_validation + + int + +
+

Number of samples to use in validation. Defaults to 10.

+
+
+ 10 +
+ +
+ Source code in docetl/optimizers/reduce_optimizer.py +
35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
def __init__(
+    self,
+    runner,
+    run_operation: Callable,
+    num_fold_prompts: int = 1,
+    num_samples_in_validation: int = 10,
+):
+    """
+    Initialize the ReduceOptimizer.
+
+    Args:
+        config (Dict[str, Any]): Configuration dictionary for the optimizer.
+        console (Console): Rich console object for pretty printing.
+        llm_client (LLMClient): Client for interacting with a language model.
+        max_threads (int): Maximum number of threads to use for parallel processing.
+        run_operation (Callable): Function to run an operation.
+        num_fold_prompts (int, optional): Number of fold prompts to generate. Defaults to 1.
+        num_samples_in_validation (int, optional): Number of samples to use in validation. Defaults to 10.
+    """
+    self.runner = runner
+    self.config = self.runner.config
+    self.console = self.runner.console
+    self.llm_client = self.runner.optimizer.llm_client
+    self._run_operation = run_operation
+    self.max_threads = self.runner.max_threads
+    self.num_fold_prompts = num_fold_prompts
+    self.num_samples_in_validation = num_samples_in_validation
+    self.status = self.runner.status
+
+
+
+ +
+ +
+ + +

+ optimize(op_config, input_data, level=1) + +

+ + +
+ +

Optimize the reduce operation based on the given configuration and input data.

+

This method performs the following steps: +1. Run the original operation +2. Generate a validator prompt +3. Validate the output +4. If improvement is needed: + a. Evaluate if decomposition is beneficial + b. If decomposition is beneficial, recursively optimize each sub-operation + c. If not, proceed with single operation optimization +5. Run the optimized operation(s)

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ op_config + + Dict[str, Any] + +
+

Configuration for the reduce operation.

+
+
+ required +
+ input_data + + List[Dict[str, Any]] + +
+

Input data for the reduce operation.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + + + + + +
TypeDescription
+ List[Dict[str, Any]] + +
+

Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing the list of optimized configurations

+
+
+ List[Dict[str, Any]] + +
+

and the list of outputs from the optimized operation(s), plus the cost incurred by synthesizing any resolve operations.

+
+
+ +
+ Source code in docetl/optimizers/reduce_optimizer.py +
158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
def optimize(
+    self,
+    op_config: Dict[str, Any],
+    input_data: List[Dict[str, Any]],
+    level: int = 1,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
+    """
+    Optimize the reduce operation based on the given configuration and input data.
+
+    This method performs the following steps:
+    1. Run the original operation
+    2. Generate a validator prompt
+    3. Validate the output
+    4. If improvement is needed:
+       a. Evaluate if decomposition is beneficial
+       b. If decomposition is beneficial, recursively optimize each sub-operation
+       c. If not, proceed with single operation optimization
+    5. Run the optimized operation(s)
+
+    Args:
+        op_config (Dict[str, Any]): Configuration for the reduce operation.
+        input_data (List[Dict[str, Any]]): Input data for the reduce operation.
+
+    Returns:
+        Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]: A tuple containing the list of optimized configurations
+        and the list of outputs from the optimized operation(s), and the cost of the operation due to synthesizing any resolve operations.
+    """
+    (
+        validation_results,
+        prompt_tokens,
+        model_input_context_length,
+        model,
+        validator_prompt,
+        original_output,
+    ) = self.should_optimize_helper(op_config, input_data)
+
+    # add_map_op = False
+    if prompt_tokens * 2 > model_input_context_length:
+        # add_map_op = True
+        self.console.log(
+            f"[yellow]Warning: The reduce prompt exceeds the token limit for model {model}. "
+            f"Token count: {prompt_tokens}, Limit: {model_input_context_length}. "
+            f"Add a map operation to the pipeline.[/yellow]"
+        )
+
+    # # Also query an agent to look at a sample of the inputs and see if they think a map operation would be helpful
+    # preprocessing_steps = ""
+    # should_use_map, preprocessing_steps = self._should_use_map(
+    #     op_config, input_data
+    # )
+    # if should_use_map or add_map_op:
+    #     # Synthesize a map operation
+    #     map_prompt, map_output_schema = self._synthesize_map_operation(
+    #         op_config, preprocessing_steps, input_data
+    #     )
+    #     # Change the reduce operation prompt to use the map schema
+    #     new_reduce_prompt = self._change_reduce_prompt_to_use_map_schema(
+    #         op_config["prompt"], map_output_schema
+    #     )
+    #     op_config["prompt"] = new_reduce_prompt
+
+    #     # Return unoptimized map and reduce operations
+    #     return [map_prompt, op_config], input_data, 0.0
+
+    # Print the validation results
+    self.console.log("[bold]Validation Results on Initial Sample:[/bold]")
+    if validation_results["needs_improvement"] or self.config.get(
+        "optimizer_config", {}
+    ).get("force_decompose", False):
+        self.console.post_optimizer_rationale(
+            should_optimize=True,
+            rationale="\n".join(
+                [
+                    f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
+                    for result in validation_results["validation_results"]
+                ]
+            ),
+            validator_prompt=validator_prompt,
+        )
+        self.console.log(
+            "\n".join(
+                [
+                    f"Issues: {result['issues']} Suggestions: {result['suggestions']}"
+                    for result in validation_results["validation_results"]
+                ]
+            )
+        )
+
+        # Step 3: Evaluate if decomposition is beneficial
+        decomposition_result = self._evaluate_decomposition(
+            op_config, input_data, level
+        )
+
+        if decomposition_result["should_decompose"]:
+            return self._optimize_decomposed_reduce(
+                decomposition_result, op_config, input_data, level
+            )
+
+        return self._optimize_single_reduce(op_config, input_data, validator_prompt)
+    else:
+        self.console.log(f"No improvements identified; {validation_results}.")
+        self.console.post_optimizer_rationale(
+            should_optimize=False,
+            rationale="No improvements identified; no optimization recommended.",
+            validator_prompt=validator_prompt,
+        )
+        return [op_config], original_output, 0.0
+
+
+
+ +
+ + + +
+ +
+ +
+ +
+ + + +

+ docetl.optimizers.join_optimizer.JoinOptimizer + + +

+ + +
+ + + + + + + +
+ Source code in docetl/optimizers/join_optimizer.py +
  15
+  16
+  17
+  18
+  19
+  20
+  21
+  22
+  23
+  24
+  25
+  26
+  27
+  28
+  29
+  30
+  31
+  32
+  33
+  34
+  35
+  36
+  37
+  38
+  39
+  40
+  41
+  42
+  43
+  44
+  45
+  46
+  47
+  48
+  49
+  50
+  51
+  52
+  53
+  54
+  55
+  56
+  57
+  58
+  59
+  60
+  61
+  62
+  63
+  64
+  65
+  66
+  67
+  68
+  69
+  70
+  71
+  72
+  73
+  74
+  75
+  76
+  77
+  78
+  79
+  80
+  81
+  82
+  83
+  84
+  85
+  86
+  87
+  88
+  89
+  90
+  91
+  92
+  93
+  94
+  95
+  96
+  97
+  98
+  99
+ 100
+ 101
+ 102
+ 103
+ 104
+ 105
+ 106
+ 107
+ 108
+ 109
+ 110
+ 111
+ 112
+ 113
+ 114
+ 115
+ 116
+ 117
+ 118
+ 119
+ 120
+ 121
+ 122
+ 123
+ 124
+ 125
+ 126
+ 127
+ 128
+ 129
+ 130
+ 131
+ 132
+ 133
+ 134
+ 135
+ 136
+ 137
+ 138
+ 139
+ 140
+ 141
+ 142
+ 143
+ 144
+ 145
+ 146
+ 147
+ 148
+ 149
+ 150
+ 151
+ 152
+ 153
+ 154
+ 155
+ 156
+ 157
+ 158
+ 159
+ 160
+ 161
+ 162
+ 163
+ 164
+ 165
+ 166
+ 167
+ 168
+ 169
+ 170
+ 171
+ 172
+ 173
+ 174
+ 175
+ 176
+ 177
+ 178
+ 179
+ 180
+ 181
+ 182
+ 183
+ 184
+ 185
+ 186
+ 187
+ 188
+ 189
+ 190
+ 191
+ 192
+ 193
+ 194
+ 195
+ 196
+ 197
+ 198
+ 199
+ 200
+ 201
+ 202
+ 203
+ 204
+ 205
+ 206
+ 207
+ 208
+ 209
+ 210
+ 211
+ 212
+ 213
+ 214
+ 215
+ 216
+ 217
+ 218
+ 219
+ 220
+ 221
+ 222
+ 223
+ 224
+ 225
+ 226
+ 227
+ 228
+ 229
+ 230
+ 231
+ 232
+ 233
+ 234
+ 235
+ 236
+ 237
+ 238
+ 239
+ 240
+ 241
+ 242
+ 243
+ 244
+ 245
+ 246
+ 247
+ 248
+ 249
+ 250
+ 251
+ 252
+ 253
+ 254
+ 255
+ 256
+ 257
+ 258
+ 259
+ 260
+ 261
+ 262
+ 263
+ 264
+ 265
+ 266
+ 267
+ 268
+ 269
+ 270
+ 271
+ 272
+ 273
+ 274
+ 275
+ 276
+ 277
+ 278
+ 279
+ 280
+ 281
+ 282
+ 283
+ 284
+ 285
+ 286
+ 287
+ 288
+ 289
+ 290
+ 291
+ 292
+ 293
+ 294
+ 295
+ 296
+ 297
+ 298
+ 299
+ 300
+ 301
+ 302
+ 303
+ 304
+ 305
+ 306
+ 307
+ 308
+ 309
+ 310
+ 311
+ 312
+ 313
+ 314
+ 315
+ 316
+ 317
+ 318
+ 319
+ 320
+ 321
+ 322
+ 323
+ 324
+ 325
+ 326
+ 327
+ 328
+ 329
+ 330
+ 331
+ 332
+ 333
+ 334
+ 335
+ 336
+ 337
+ 338
+ 339
+ 340
+ 341
+ 342
+ 343
+ 344
+ 345
+ 346
+ 347
+ 348
+ 349
+ 350
+ 351
+ 352
+ 353
+ 354
+ 355
+ 356
+ 357
+ 358
+ 359
+ 360
+ 361
+ 362
+ 363
+ 364
+ 365
+ 366
+ 367
+ 368
+ 369
+ 370
+ 371
+ 372
+ 373
+ 374
+ 375
+ 376
+ 377
+ 378
+ 379
+ 380
+ 381
+ 382
+ 383
+ 384
+ 385
+ 386
+ 387
+ 388
+ 389
+ 390
+ 391
+ 392
+ 393
+ 394
+ 395
+ 396
+ 397
+ 398
+ 399
+ 400
+ 401
+ 402
+ 403
+ 404
+ 405
+ 406
+ 407
+ 408
+ 409
+ 410
+ 411
+ 412
+ 413
+ 414
+ 415
+ 416
+ 417
+ 418
+ 419
+ 420
+ 421
+ 422
+ 423
+ 424
+ 425
+ 426
+ 427
+ 428
+ 429
+ 430
+ 431
+ 432
+ 433
+ 434
+ 435
+ 436
+ 437
+ 438
+ 439
+ 440
+ 441
+ 442
+ 443
+ 444
+ 445
+ 446
+ 447
+ 448
+ 449
+ 450
+ 451
+ 452
+ 453
+ 454
+ 455
+ 456
+ 457
+ 458
+ 459
+ 460
+ 461
+ 462
+ 463
+ 464
+ 465
+ 466
+ 467
+ 468
+ 469
+ 470
+ 471
+ 472
+ 473
+ 474
+ 475
+ 476
+ 477
+ 478
+ 479
+ 480
+ 481
+ 482
+ 483
+ 484
+ 485
+ 486
+ 487
+ 488
+ 489
+ 490
+ 491
+ 492
+ 493
+ 494
+ 495
+ 496
+ 497
+ 498
+ 499
+ 500
+ 501
+ 502
+ 503
+ 504
+ 505
+ 506
+ 507
+ 508
+ 509
+ 510
+ 511
+ 512
+ 513
+ 514
+ 515
+ 516
+ 517
+ 518
+ 519
+ 520
+ 521
+ 522
+ 523
+ 524
+ 525
+ 526
+ 527
+ 528
+ 529
+ 530
+ 531
+ 532
+ 533
+ 534
+ 535
+ 536
+ 537
+ 538
+ 539
+ 540
+ 541
+ 542
+ 543
+ 544
+ 545
+ 546
+ 547
+ 548
+ 549
+ 550
+ 551
+ 552
+ 553
+ 554
+ 555
+ 556
+ 557
+ 558
+ 559
+ 560
+ 561
+ 562
+ 563
+ 564
+ 565
+ 566
+ 567
+ 568
+ 569
+ 570
+ 571
+ 572
+ 573
+ 574
+ 575
+ 576
+ 577
+ 578
+ 579
+ 580
+ 581
+ 582
+ 583
+ 584
+ 585
+ 586
+ 587
+ 588
+ 589
+ 590
+ 591
+ 592
+ 593
+ 594
+ 595
+ 596
+ 597
+ 598
+ 599
+ 600
+ 601
+ 602
+ 603
+ 604
+ 605
+ 606
+ 607
+ 608
+ 609
+ 610
+ 611
+ 612
+ 613
+ 614
+ 615
+ 616
+ 617
+ 618
+ 619
+ 620
+ 621
+ 622
+ 623
+ 624
+ 625
+ 626
+ 627
+ 628
+ 629
+ 630
+ 631
+ 632
+ 633
+ 634
+ 635
+ 636
+ 637
+ 638
+ 639
+ 640
+ 641
+ 642
+ 643
+ 644
+ 645
+ 646
+ 647
+ 648
+ 649
+ 650
+ 651
+ 652
+ 653
+ 654
+ 655
+ 656
+ 657
+ 658
+ 659
+ 660
+ 661
+ 662
+ 663
+ 664
+ 665
+ 666
+ 667
+ 668
+ 669
+ 670
+ 671
+ 672
+ 673
+ 674
+ 675
+ 676
+ 677
+ 678
+ 679
+ 680
+ 681
+ 682
+ 683
+ 684
+ 685
+ 686
+ 687
+ 688
+ 689
+ 690
+ 691
+ 692
+ 693
+ 694
+ 695
+ 696
+ 697
+ 698
+ 699
+ 700
+ 701
+ 702
+ 703
+ 704
+ 705
+ 706
+ 707
+ 708
+ 709
+ 710
+ 711
+ 712
+ 713
+ 714
+ 715
+ 716
+ 717
+ 718
+ 719
+ 720
+ 721
+ 722
+ 723
+ 724
+ 725
+ 726
+ 727
+ 728
+ 729
+ 730
+ 731
+ 732
+ 733
+ 734
+ 735
+ 736
+ 737
+ 738
+ 739
+ 740
+ 741
+ 742
+ 743
+ 744
+ 745
+ 746
+ 747
+ 748
+ 749
+ 750
+ 751
+ 752
+ 753
+ 754
+ 755
+ 756
+ 757
+ 758
+ 759
+ 760
+ 761
+ 762
+ 763
+ 764
+ 765
+ 766
+ 767
+ 768
+ 769
+ 770
+ 771
+ 772
+ 773
+ 774
+ 775
+ 776
+ 777
+ 778
+ 779
+ 780
+ 781
+ 782
+ 783
+ 784
+ 785
+ 786
+ 787
+ 788
+ 789
+ 790
+ 791
+ 792
+ 793
+ 794
+ 795
+ 796
+ 797
+ 798
+ 799
+ 800
+ 801
+ 802
+ 803
+ 804
+ 805
+ 806
+ 807
+ 808
+ 809
+ 810
+ 811
+ 812
+ 813
+ 814
+ 815
+ 816
+ 817
+ 818
+ 819
+ 820
+ 821
+ 822
+ 823
+ 824
+ 825
+ 826
+ 827
+ 828
+ 829
+ 830
+ 831
+ 832
+ 833
+ 834
+ 835
+ 836
+ 837
+ 838
+ 839
+ 840
+ 841
+ 842
+ 843
+ 844
+ 845
+ 846
+ 847
+ 848
+ 849
+ 850
+ 851
+ 852
+ 853
+ 854
+ 855
+ 856
+ 857
+ 858
+ 859
+ 860
+ 861
+ 862
+ 863
+ 864
+ 865
+ 866
+ 867
+ 868
+ 869
+ 870
+ 871
+ 872
+ 873
+ 874
+ 875
+ 876
+ 877
+ 878
+ 879
+ 880
+ 881
+ 882
+ 883
+ 884
+ 885
+ 886
+ 887
+ 888
+ 889
+ 890
+ 891
+ 892
+ 893
+ 894
+ 895
+ 896
+ 897
+ 898
+ 899
+ 900
+ 901
+ 902
+ 903
+ 904
+ 905
+ 906
+ 907
+ 908
+ 909
+ 910
+ 911
+ 912
+ 913
+ 914
+ 915
+ 916
+ 917
+ 918
+ 919
+ 920
+ 921
+ 922
+ 923
+ 924
+ 925
+ 926
+ 927
+ 928
+ 929
+ 930
+ 931
+ 932
+ 933
+ 934
+ 935
+ 936
+ 937
+ 938
+ 939
+ 940
+ 941
+ 942
+ 943
+ 944
+ 945
+ 946
+ 947
+ 948
+ 949
+ 950
+ 951
+ 952
+ 953
+ 954
+ 955
+ 956
+ 957
+ 958
+ 959
+ 960
+ 961
+ 962
+ 963
+ 964
+ 965
+ 966
+ 967
+ 968
+ 969
+ 970
+ 971
+ 972
+ 973
+ 974
+ 975
+ 976
+ 977
+ 978
+ 979
+ 980
+ 981
+ 982
+ 983
+ 984
+ 985
+ 986
+ 987
+ 988
+ 989
+ 990
+ 991
+ 992
+ 993
+ 994
+ 995
+ 996
+ 997
+ 998
+ 999
+1000
+1001
+1002
+1003
+1004
+1005
+1006
+1007
+1008
+1009
+1010
+1011
+1012
+1013
+1014
+1015
+1016
+1017
+1018
+1019
+1020
+1021
+1022
+1023
+1024
+1025
+1026
+1027
+1028
+1029
+1030
+1031
+1032
+1033
+1034
+1035
+1036
+1037
+1038
+1039
+1040
+1041
+1042
+1043
+1044
+1045
+1046
+1047
+1048
+1049
+1050
+1051
+1052
+1053
+1054
+1055
+1056
+1057
+1058
+1059
+1060
+1061
+1062
+1063
+1064
+1065
+1066
+1067
+1068
+1069
+1070
+1071
+1072
+1073
+1074
+1075
+1076
+1077
+1078
+1079
+1080
+1081
+1082
+1083
+1084
+1085
+1086
+1087
+1088
+1089
+1090
+1091
+1092
+1093
+1094
+1095
+1096
+1097
+1098
+1099
+1100
+1101
+1102
+1103
+1104
+1105
+1106
+1107
+1108
+1109
+1110
+1111
+1112
+1113
+1114
+1115
+1116
+1117
+1118
+1119
+1120
+1121
+1122
+1123
+1124
+1125
+1126
+1127
+1128
+1129
+1130
+1131
+1132
+1133
+1134
+1135
+1136
+1137
+1138
+1139
+1140
+1141
+1142
+1143
+1144
+1145
+1146
+1147
+1148
+1149
+1150
+1151
+1152
+1153
+1154
+1155
+1156
+1157
+1158
+1159
+1160
+1161
+1162
+1163
+1164
+1165
+1166
+1167
+1168
+1169
+1170
+1171
+1172
+1173
+1174
+1175
+1176
+1177
+1178
+1179
+1180
+1181
+1182
+1183
+1184
+1185
+1186
+1187
+1188
+1189
+1190
+1191
+1192
+1193
+1194
+1195
+1196
+1197
+1198
+1199
+1200
+1201
+1202
+1203
+1204
+1205
+1206
+1207
+1208
+1209
+1210
+1211
+1212
+1213
+1214
+1215
+1216
+1217
+1218
+1219
+1220
+1221
+1222
+1223
+1224
+1225
+1226
+1227
+1228
+1229
+1230
+1231
+1232
+1233
+1234
+1235
+1236
+1237
+1238
+1239
+1240
+1241
+1242
+1243
+1244
+1245
+1246
+1247
+1248
+1249
+1250
+1251
+1252
+1253
+1254
+1255
+1256
+1257
+1258
+1259
+1260
+1261
+1262
+1263
+1264
+1265
+1266
+1267
+1268
+1269
+1270
+1271
+1272
+1273
+1274
+1275
+1276
+1277
+1278
+1279
+1280
+1281
+1282
+1283
+1284
+1285
+1286
+1287
+1288
+1289
+1290
+1291
+1292
+1293
+1294
+1295
+1296
+1297
+1298
+1299
+1300
+1301
+1302
+1303
+1304
+1305
+1306
+1307
+1308
+1309
+1310
+1311
+1312
+1313
+1314
+1315
+1316
+1317
+1318
+1319
+1320
+1321
+1322
+1323
+1324
+1325
+1326
+1327
+1328
+1329
+1330
+1331
+1332
+1333
+1334
+1335
+1336
+1337
+1338
+1339
+1340
+1341
+1342
+1343
+1344
+1345
+1346
+1347
+1348
+1349
+1350
+1351
+1352
+1353
+1354
+1355
+1356
+1357
+1358
+1359
+1360
+1361
+1362
+1363
+1364
+1365
+1366
+1367
+1368
+1369
+1370
+1371
+1372
+1373
+1374
+1375
+1376
+1377
+1378
+1379
+1380
+1381
+1382
+1383
+1384
+1385
+1386
+1387
+1388
+1389
+1390
+1391
+1392
+1393
+1394
+1395
+1396
+1397
+1398
+1399
+1400
+1401
+1402
+1403
+1404
+1405
+1406
+1407
+1408
+1409
+1410
+1411
+1412
+1413
+1414
+1415
+1416
+1417
+1418
+1419
+1420
+1421
+1422
+1423
+1424
+1425
+1426
+1427
+1428
+1429
+1430
+1431
+1432
+1433
+1434
+1435
+1436
+1437
+1438
+1439
+1440
+1441
+1442
+1443
+1444
+1445
+1446
+1447
+1448
+1449
+1450
+1451
+1452
+1453
+1454
+1455
+1456
+1457
+1458
+1459
+1460
+1461
+1462
+1463
+1464
+1465
+1466
+1467
+1468
+1469
+1470
+1471
+1472
+1473
+1474
+1475
+1476
+1477
+1478
+1479
+1480
+1481
+1482
+1483
+1484
+1485
+1486
+1487
+1488
+1489
+1490
+1491
+1492
+1493
+1494
+1495
+1496
+1497
+1498
+1499
+1500
+1501
+1502
+1503
+1504
+1505
+1506
+1507
+1508
+1509
+1510
+1511
+1512
+1513
+1514
+1515
+1516
+1517
+1518
+1519
+1520
+1521
+1522
+1523
+1524
+1525
+1526
+1527
+1528
+1529
+1530
+1531
+1532
+1533
+1534
+1535
+1536
+1537
+1538
+1539
+1540
+1541
+1542
+1543
+1544
+1545
+1546
+1547
+1548
+1549
+1550
+1551
+1552
+1553
+1554
+1555
+1556
+1557
+1558
+1559
+1560
+1561
+1562
+1563
+1564
+1565
+1566
+1567
+1568
+1569
+1570
+1571
+1572
+1573
+1574
+1575
+1576
+1577
+1578
+1579
+1580
+1581
+1582
+1583
+1584
+1585
+1586
+1587
+1588
+1589
+1590
+1591
+1592
+1593
+1594
+1595
+1596
+1597
+1598
+1599
+1600
+1601
+1602
+1603
+1604
+1605
+1606
+1607
+1608
+1609
+1610
+1611
+1612
+1613
+1614
+1615
+1616
+1617
+1618
+1619
+1620
+1621
+1622
+1623
+1624
+1625
+1626
+1627
+1628
+1629
+1630
+1631
+1632
+1633
+1634
+1635
+1636
+1637
+1638
+1639
+1640
+1641
+1642
+1643
+1644
+1645
+1646
+1647
+1648
+1649
+1650
+1651
+1652
+1653
+1654
+1655
+1656
+1657
+1658
+1659
+1660
+1661
+1662
+1663
+1664
+1665
+1666
+1667
+1668
+1669
+1670
+1671
+1672
+1673
+1674
+1675
+1676
+1677
+1678
+1679
+1680
+1681
+1682
+1683
+1684
+1685
+1686
+1687
+1688
+1689
+1690
+1691
+1692
+1693
+1694
+1695
+1696
+1697
+1698
+1699
+1700
+1701
+1702
+1703
+1704
+1705
+1706
+1707
+1708
+1709
+1710
+1711
+1712
+1713
+1714
+1715
+1716
+1717
+1718
+1719
+1720
+1721
+1722
+1723
+1724
+1725
+1726
+1727
+1728
+1729
+1730
+1731
+1732
+1733
+1734
+1735
+1736
+1737
+1738
+1739
+1740
+1741
+1742
+1743
+1744
+1745
+1746
+1747
+1748
+1749
+1750
+1751
+1752
+1753
+1754
+1755
+1756
+1757
+1758
+1759
+1760
+1761
+1762
+1763
+1764
+1765
+1766
+1767
+1768
+1769
+1770
+1771
+1772
+1773
+1774
+1775
+1776
+1777
class JoinOptimizer:
+    def __init__(
+        self,
+        runner,
+        op_config: Dict[str, Any],
+        target_recall: float = 0.95,
+        sample_size: int = 500,
+        sampling_weight: float = 20,
+        agent_max_retries: int = 5,
+        estimated_selectivity: float = None,
+    ):
+        self.runner = runner
+        self.config = runner.config
+        self.op_config = op_config
+        self.llm_client = runner.optimizer.llm_client
+        self.max_threads = runner.max_threads
+        self.console = runner.console
+        self.target_recall = target_recall
+        self.sample_size = sample_size
+        self.sampling_weight = sampling_weight
+        self.agent_max_retries = agent_max_retries
+        self.estimated_selectivity = estimated_selectivity
+        self.console.log(f"Target Recall: {self.target_recall}")
+        self.status = self.runner.status
+        self.max_comparison_sampling_attempts = 5
+        self.synthesized_keys = []
+        # if self.estimated_selectivity is not None:
+        #     self.console.log(
+        #         f"[yellow]Using estimated selectivity of {self.estimated_selectivity}[/yellow]"
+        #     )
+
+    def _analyze_map_prompt_categorization(self, map_prompt: str) -> Tuple[bool, str]:
+        """
+        Analyze the map prompt to determine if it's explicitly categorical.
+
+        Args:
+            map_prompt (str): The map prompt to analyze.
+
+        Returns:
+            bool: True if the prompt is explicitly categorical, False otherwise.
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": "You are an AI assistant tasked with analyzing prompts for data processing operations.",
+            },
+            {
+                "role": "user",
+                "content": f"""Analyze the following map operation prompt and determine if it is explicitly categorical,
+                meaning it details a specific set of possible outputs:
+
+                {map_prompt}
+
+                Respond with 'Yes' if the prompt is explicitly categorical, detailing a finite set of possible outputs.
+                Respond with 'No' if the prompt allows for open-ended or non-categorical responses.
+                Provide a brief explanation for your decision.""",
+            },
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an expert in analyzing natural language prompts for data processing tasks.",
+            {
+                "type": "object",
+                "properties": {
+                    "is_categorical": {
+                        "type": "string",
+                        "enum": ["Yes", "No"],
+                        "description": "Whether the prompt is explicitly categorical",
+                    },
+                    "explanation": {
+                        "type": "string",
+                        "description": "Brief explanation for the decision",
+                    },
+                },
+                "required": ["is_categorical", "explanation"],
+            },
+        )
+
+        analysis = json.loads(response.choices[0].message.content)
+
+        self.console.log("[bold]Map Prompt Analysis:[/bold]")
+        self.console.log(f"Is Categorical: {analysis['is_categorical']}")
+        self.console.log(f"Explanation: {analysis['explanation']}")
+
+        return analysis["is_categorical"].lower() == "yes", analysis["explanation"]
+
+    def _determine_duplicate_keys(
+        self,
+        input_data: List[Dict[str, Any]],
+        reduce_key: List[str],
+        map_prompt: Optional[str] = None,
+    ) -> Tuple[bool, str]:
+        # Prepare a sample of the input data for analysis
+        sample_size = min(10, len(input_data))
+        data_sample = random.sample(
+            [{rk: item[rk] for rk in reduce_key} for item in input_data], sample_size
+        )
+
+        context_prefix = ""
+        if map_prompt:
+            context_prefix = f"For context, these values came out of a pipeline with the following prompt:\n\n{map_prompt}\n\n"
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"{context_prefix}I want to do a reduce operation on these values, and I need to determine if there are semantic duplicates in the data, where the strings are different but they technically belong in the same group. Note that exact string duplicates should not be considered here.\n\nHere's a sample of the data (showing the '{reduce_key}' field(s)): {data_sample}\n\nBased on this {'context and ' if map_prompt else ''}sample, are there likely to be such semantic duplicates (not exact string matches) in the dataset? Respond with 'yes' only if you think there are semantic duplicates, or 'no' if you don't see evidence of semantic duplicates or if you only see exact string duplicates.",
+            },
+        ]
+        response = self.llm_client.generate(
+            messages,
+            "You are an expert data analyst. Analyze the given data sample and determine if there are likely to be semantic duplicate values that belong in the same group, even if the strings are different.",
+            {
+                "type": "object",
+                "properties": {
+                    "likely_duplicates": {
+                        "type": "string",
+                        "enum": ["Yes", "No"],
+                        "description": "Whether duplicates are likely to exist in the full dataset",
+                    },
+                    "explanation": {
+                        "type": "string",
+                        "description": "Brief explanation for the decision",
+                    },
+                },
+                "required": ["likely_duplicates", "explanation"],
+            },
+        )
+
+        analysis = json.loads(response.choices[0].message.content)
+
+        self.console.log(f"[bold]Duplicate Analysis for '{reduce_key}':[/bold]")
+        self.console.log(f"Likely Duplicates: {analysis['likely_duplicates']}")
+        self.console.log(f"Explanation: {analysis['explanation']}")
+
+        if analysis["likely_duplicates"].lower() == "yes":
+            self.console.log(
+                "[yellow]Duplicates are likely. Consider using a deduplication strategy in the resolution step.[/yellow]"
+            )
+            return True, analysis["explanation"]
+        return False, ""
+
+    def _sample_random_pairs(
+        self, input_data: List[Dict[str, Any]], n: int
+    ) -> List[Tuple[int, int]]:
+        """Sample random pairs of indices, excluding exact matches."""
+        pairs = set()
+        max_attempts = n * 10  # Avoid infinite loop
+        attempts = 0
+
+        while len(pairs) < n and attempts < max_attempts:
+            i, j = random.sample(range(len(input_data)), 2)
+            if i != j and input_data[i] != input_data[j]:
+                pairs.add((min(i, j), max(i, j)))  # Ensure ordered pairs
+            attempts += 1
+
+        return list(pairs)
+
+    def _check_duplicates_with_llm(
+        self,
+        input_data: List[Dict[str, Any]],
+        pairs: List[Tuple[int, int]],
+        reduce_key: List[str],
+        map_prompt: Optional[str],
+    ) -> Tuple[bool, str]:
+        """Use LLM to check if any pairs are duplicates."""
+
+        content = "Analyze the following pairs of entries and determine if any of them are likely duplicates. Respond with 'Yes' if you find any likely duplicates, or 'No' if none of the pairs seem to be duplicates. Provide a brief explanation for your decision.\n\n"
+
+        if map_prompt:
+            content = (
+                f"For reference, here is the map prompt used earlier in the pipeline: {map_prompt}\n\n"
+                + content
+            )
+
+        for i, (idx1, idx2) in enumerate(pairs, 1):
+            content += f"Pair {i}:\n"
+            content += "Entry 1:\n"
+            for key in reduce_key:
+                content += f"{key}: {json.dumps(input_data[idx1][key], indent=2)}\n"
+            content += "\nEntry 2:\n"
+            for key in reduce_key:
+                content += f"{key}: {json.dumps(input_data[idx2][key], indent=2)}\n"
+            content += "\n"
+
+        messages = [{"role": "user", "content": content}]
+
+        system_prompt = "You are an AI assistant tasked with identifying potential duplicate entries in a dataset."
+        response_schema = {
+            "type": "object",
+            "properties": {
+                "duplicates_found": {"type": "string", "enum": ["Yes", "No"]},
+                "explanation": {"type": "string"},
+            },
+            "required": ["duplicates_found", "explanation"],
+        }
+
+        response = self.llm_client.generate(messages, system_prompt, response_schema)
+
+        # Print the duplicates_found and explanation
+        self.console.log(
+            f"[bold]Duplicates in keys found:[/bold] {response['duplicates_found']}\n"
+            f"[bold]Explanation:[/bold] {response['explanation']}"
+        )
+
+        return response["duplicates_found"].lower() == "yes", response["explanation"]
+
+    def synthesize_compare_prompt(
+        self, map_prompt: Optional[str], reduce_key: List[str]
+    ) -> str:
+
+        system_prompt = f"You are an AI assistant tasked with creating a comparison prompt for LLM-assisted entity resolution. Your task is to create a comparison prompt that will be used to compare two entities, referred to as input1 and input2, to see if they are likely the same entity based on the following reduce key(s): {', '.join(reduce_key)}."
+        if map_prompt:
+            system_prompt += f"\n\nFor context, here is the prompt used earlier in the pipeline to create the inputs to resolve: {map_prompt}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""
+    Create a comparison prompt for entity resolution: The prompt should:
+    1. Be tailored to the specific domain and type of data being compared ({reduce_key}), based on the context provided.
+    2. Instruct to compare two entities, referred to as input1 and input2.
+    3. Specifically mention comparing each reduce key in input1 and input2 (e.g., input1.{{key}} and input2.{{key}} for each key in {reduce_key}). You can reference other fields in the input as well, as long as they are short.
+    4. Include instructions to consider relevant attributes or characteristics for comparison.
+    5. Ask to respond with "True" if the entities are likely the same, or "False" if they are likely different.
+
+    Example structure:
+    ```
+    Compare the following two {reduce_key} from [entity or document type]:
+
+    [Entity 1]:
+    {{{{ input1.key1 }}}}
+    {{{{ input1.optional_key2 }}}}
+
+    [Entity 2]:
+    {{{{ input2.key1 }}}}
+    {{{{ input2.optional_key2 }}}}
+
+    Are these [entities] likely referring to the same [entity type]? Consider [list relevant attributes or characteristics to compare]. Respond with "True" if they are likely the same [entity type], or "False" if they are likely different [entity types].
+    ```
+
+    Please generate the comparison prompt, which should be a Jinja2 template:
+    """,
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            system_prompt,
+            {
+                "type": "object",
+                "properties": {
+                    "comparison_prompt": {
+                        "type": "string",
+                        "description": "Detailed comparison prompt for entity resolution",
+                    }
+                },
+                "required": ["comparison_prompt"],
+            },
+        )
+
+        comparison_prompt = json.loads(response.choices[0].message.content)[
+            "comparison_prompt"
+        ]
+
+        # Log the synthesized comparison prompt
+        self.console.log("[green]Synthesized comparison prompt:[/green]")
+        self.console.log(comparison_prompt)
+
+        if not comparison_prompt:
+            raise ValueError(
+                "Could not synthesize a comparison prompt. Please provide a comparison prompt in the config."
+            )
+
+        return comparison_prompt
+
+    def synthesize_resolution_prompt(
+        self,
+        map_prompt: Optional[str],
+        reduce_key: List[str],
+        output_schema: Dict[str, str],
+    ) -> str:
+        system_prompt = f"""You are an AI assistant tasked with creating a resolution prompt for LLM-assisted entity resolution.
+        Your task is to create a prompt that will be used to merge multiple duplicate keys into a single, consolidated key.
+        The key(s) being resolved (known as the reduce_key) are {', '.join(reduce_key)}.
+        The duplicate keys will be provided in a list called 'inputs' in a Jinja2 template.
+        """
+
+        if map_prompt:
+            system_prompt += f"\n\nFor context, here is the prompt used earlier in the pipeline to create the inputs to resolve: {map_prompt}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""
+    Create a resolution prompt for merging duplicate keys into a single key. The prompt should:
+    1. Be tailored to the specific domain and type of data being merged, based on the context provided.
+    2. Use a Jinja2 template to iterate over the duplicate keys (accessed as 'inputs', where each item is a dictionary containing the reduce_key fields, which you can access as entry.reduce_key for each reduce_key in {reduce_key}).
+    3. Instruct to create a single, consolidated key from the duplicate keys.
+    4. Include guidelines for resolving conflicts (e.g., choosing the most recent, most complete, or most reliable information).
+    5. Specify that the output of the resolution prompt should conform to the given output schema: {json.dumps(output_schema, indent=2)}
+
+    Example structure:
+    ```
+    Analyze the following duplicate entries for the {reduce_key} key:
+
+    {{% for key in inputs %}}
+    Entry {{{{ loop.index }}}}:
+    {{ % for key in reduce_key %}}
+    {{{{ key }}}}: {{{{ key[reduce_key] }}}}
+    {{% endfor %}}
+
+    {{% endfor %}}
+
+    Merge these into a single key.
+    When merging, follow these guidelines:
+    1. [Provide specific merging instructions relevant to the data type]
+    2. [Do not make the prompt too long]
+
+    Ensure that the merged key conforms to the following schema:
+    {json.dumps(output_schema, indent=2)}
+
+    Return the consolidated key as a single [appropriate data type] value.
+    ```
+
+    Please generate the resolution prompt:
+    """,
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            system_prompt,
+            {
+                "type": "object",
+                "properties": {
+                    "resolution_prompt": {
+                        "type": "string",
+                        "description": "Detailed resolution prompt for merging duplicate keys",
+                    }
+                },
+                "required": ["resolution_prompt"],
+            },
+        )
+
+        resolution_prompt = json.loads(response.choices[0].message.content)[
+            "resolution_prompt"
+        ]
+
+        # Log the synthesized resolution prompt
+        self.console.log("[green]Synthesized resolution prompt:[/green]")
+        self.console.log(resolution_prompt)
+
+        if not resolution_prompt:
+            raise ValueError(
+                "Could not synthesize a resolution prompt. Please provide a resolution prompt in the config."
+            )
+
+        return resolution_prompt
+
+    def should_optimize(self, input_data: List[Dict[str, Any]]) -> Tuple[bool, str]:
+        """
+        Determine if the given operation configuration should be optimized.
+        """
+        # If there are no blocking keys or embeddings, then we don't need to optimize
+        if not self.op_config.get("blocking_conditions") or not self.op_config.get(
+            "blocking_threshold"
+        ):
+            return True, ""
+
+        # Check if the operation is marked as empty
+        elif self.op_config.get("empty", False):
+            # Extract the map prompt from the intermediates
+            map_prompt = self.op_config["_intermediates"]["map_prompt"]
+            reduce_key = self.op_config["_intermediates"]["reduce_key"]
+
+            if reduce_key is None:
+                raise ValueError(
+                    "[yellow]Warning: No reduce key found in intermediates for synthesized resolve operation.[/yellow]"
+                )
+
+            dedup = True
+            explanation = "There is a reduce operation that does not follow a resolve operation. Consider adding a resolve operation to deduplicate the data."
+
+            if map_prompt:
+                # Analyze the map prompt
+                analysis, explanation = self._analyze_map_prompt_categorization(
+                    map_prompt
+                )
+
+                if analysis:
+                    dedup = False
+            else:
+                self.console.log(
+                    "[yellow]No map prompt found in intermediates for analysis.[/yellow]"
+                )
+
+            # TODO: figure out why this would ever be the case
+            if not map_prompt:
+                map_prompt = "N/A"
+
+            if dedup is False:
+                dedup, explanation = self._determine_duplicate_keys(
+                    input_data, reduce_key, map_prompt
+                )
+
+            # Now do the last attempt of pairwise comparisons
+            if dedup is False:
+                # Sample up to 20 random pairs of keys for duplicate analysis
+                sampled_pairs = self._sample_random_pairs(input_data, 20)
+
+                # Use LLM to check for duplicates
+                duplicates_found, explanation = self._check_duplicates_with_llm(
+                    input_data, sampled_pairs, reduce_key, map_prompt
+                )
+
+                if duplicates_found:
+                    dedup = True
+
+            return dedup, explanation
+
+        return False, ""
+
+    def optimize_resolve(
+        self, input_data: List[Dict[str, Any]]
+    ) -> Tuple[Dict[str, Any], float]:
+        """
+        Optimize a resolve operation: synthesize its prompts when the operation
+        is marked "empty", then pick a blocking threshold and blocking rules.
+
+        ``self.op_config`` is modified in place when the operation is marked
+        "empty" (output schema, comparison prompt, resolution prompt, and the
+        "empty" flag is removed).
+
+        Args:
+            input_data (List[Dict[str, Any]]): Records to sample and compare.
+
+        Returns:
+            Tuple[Dict[str, Any], float]: The resulting config and a cost. The
+            cost is ``0.0`` with the unchanged config when an "empty" operation
+            needs no deduplication; otherwise it is the embedding cost plus the
+            comparison cost.
+        """
+        # Check if the operation is marked as empty
+        if self.op_config.get("empty", False):
+            # Decide whether deduplication is needed, then read the reduce key
+            # and map prompt from the intermediates
+            dedup, _ = self.should_optimize(input_data)
+            reduce_key = self.op_config["_intermediates"]["reduce_key"]
+            map_prompt = self.op_config["_intermediates"]["map_prompt"]
+
+            if dedup is False:
+                # If no deduplication is needed, return the same config with 0 cost
+                return self.op_config, 0.0
+
+            # Add the reduce key to the output schema in the config
+            self.op_config["output"] = {"schema": {rk: "string" for rk in reduce_key}}
+            for attempt in range(2):  # Try up to 2 times
+                self.op_config["comparison_prompt"] = self.synthesize_compare_prompt(
+                    map_prompt, reduce_key
+                )
+                # Accept the prompt only if it mentions both template variables
+                if (
+                    "input1" in self.op_config["comparison_prompt"]
+                    and "input2" in self.op_config["comparison_prompt"]
+                ):
+                    break
+                elif attempt == 0:
+                    self.console.log(
+                        "[yellow]Warning: 'input1' or 'input2' not found in comparison prompt. Retrying...[/yellow]"
+                    )
+            # Both attempts failed: log the problem but keep the last prompt
+            if (
+                "input1" not in self.op_config["comparison_prompt"]
+                or "input2" not in self.op_config["comparison_prompt"]
+            ):
+                self.console.log(
+                    "[red]Error: Failed to generate comparison prompt with 'input1' and 'input2'. Using last generated prompt.[/red]"
+                )
+            for attempt in range(2):  # Try up to 2 times
+                self.op_config["resolution_prompt"] = self.synthesize_resolution_prompt(
+                    map_prompt, reduce_key, self.op_config["output"]["schema"]
+                )
+                # Accept the prompt only if it mentions the 'inputs' variable
+                if "inputs" in self.op_config["resolution_prompt"]:
+                    break
+                elif attempt == 0:
+                    self.console.log(
+                        "[yellow]Warning: 'inputs' not found in resolution prompt. Retrying...[/yellow]"
+                    )
+            # Both attempts failed: log the problem but keep the last prompt
+            if "inputs" not in self.op_config["resolution_prompt"]:
+                self.console.log(
+                    "[red]Error: Failed to generate resolution prompt with 'inputs'. Using last generated prompt.[/red]"
+                )
+
+            # Pop off the empty flag
+            self.op_config.pop("empty")
+
+        # Embed the input and report what that cost
+        embeddings, blocking_keys, embedding_cost = self._compute_embeddings(input_data)
+        self.console.log(
+            f"[bold]Cost of creating embeddings on the sample: ${embedding_cost:.4f}[/bold]"
+        )
+
+        similarities = self._calculate_cosine_similarities(embeddings)
+
+        # Compare a sample of pairs; the results drive the threshold and rules below
+        sampled_pairs = self._sample_pairs(similarities)
+        comparison_results, comparison_cost = self._perform_comparisons_resolve(
+            input_data, sampled_pairs
+        )
+
+        self._print_similarity_histogram(similarities, comparison_results)
+
+        threshold, estimated_selectivity = self._find_optimal_threshold(
+            comparison_results, similarities
+        )
+
+        blocking_rules = self._generate_blocking_rules(
+            blocking_keys, input_data, comparison_results
+        )
+
+        if blocking_rules:
+            # Only the first generated rule is verified
+            false_negatives, rule_selectivity = self._verify_blocking_rule(
+                input_data,
+                blocking_rules[0],
+                blocking_keys,
+                comparison_results,
+            )
+            # If more than 50% of the sample is false negatives, reject the blocking rule
+            if len(false_negatives) > len(sampled_pairs) / 2:
+                # NOTE(review): this inner check looks redundant, since the
+                # outer condition already implies a non-empty list.
+                if false_negatives:
+                    self.console.log(
+                        f"[red]Blocking rule rejected. {len(false_negatives)} false negatives detected in the sample ({len(false_negatives) / len(sampled_pairs):.2f} of the sample).[/red]"
+                    )
+                    for i, j in false_negatives[:5]:  # Show up to 5 examples
+                        self.console.log(
+                            f"  Filtered pair: {{ {blocking_keys[0]}: {input_data[i][blocking_keys[0]]} }} and {{ {blocking_keys[0]}: {input_data[j][blocking_keys[0]]} }}"
+                        )
+                    if len(false_negatives) > 5:
+                        self.console.log(f"  ... and {len(false_negatives) - 5} more.")
+                blocking_rules = (
+                    []
+                )  # Clear the blocking rule if it introduces false negatives or is too selective
+            # NOTE(review): the rule is kept only when rule_selectivity is
+            # GREATER than the estimate, while the message says "within";
+            # confirm the comparison direction is intended.
+            elif not false_negatives and rule_selectivity > estimated_selectivity:
+                self.console.log(
+                    "[green]Blocking rule verified. No false negatives detected in the sample and selectivity is within estimated selectivity.[/green]"
+                )
+            else:
+                # TODO: ask user if they want to use the blocking rule, or come up with some good default behavior
+                blocking_rules = []
+
+        optimized_config = self._update_config(threshold, blocking_keys, blocking_rules)
+        return optimized_config, embedding_cost + comparison_cost
+
+    def optimize_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        skip_map_gen: bool = False,
+        skip_containment_gen: bool = False,
+    ) -> Tuple[Dict[str, Any], float, Dict[str, Any]]:
+        left_keys = self.op_config.get("blocking_keys", {}).get("left", [])
+        right_keys = self.op_config.get("blocking_keys", {}).get("right", [])
+
+        if not left_keys and not right_keys:
+            # Ask the LLM agent if it would be beneficial to do a map operation on
+            # one of the datasets before doing an equijoin
+            apply_transformation, dataset_to_transform, reason = (
+                (
+                    self._should_apply_map_transformation(
+                        left_keys, right_keys, left_data, right_data
+                    )
+                )
+                if not skip_map_gen
+                else (False, None, None)
+            )
+
+            if apply_transformation and not skip_map_gen:
+                self.console.log(
+                    f"LLM agent suggested applying a map transformation to {dataset_to_transform} dataset because: {reason}"
+                )
+                extraction_prompt, output_key, new_comparison_prompt = (
+                    self._generate_map_and_new_join_transformation(
+                        dataset_to_transform, reason, left_data, right_data
+                    )
+                )
+                self.console.log(
+                    f"Generated map transformation prompt: {extraction_prompt}"
+                )
+                self.console.log(f"\nNew output key: {output_key}")
+                self.console.log(
+                    f"\nNew equijoin comparison prompt: {new_comparison_prompt}"
+                )
+
+                # Update the comparison prompt
+                self.op_config["comparison_prompt"] = new_comparison_prompt
+
+                # Add the output key to the left_keys or right_keys
+                if dataset_to_transform == "left":
+                    left_keys.append(output_key)
+                else:
+                    right_keys.append(output_key)
+
+                # Reset the blocking keys in the config
+                self.op_config["blocking_keys"] = {
+                    "left": left_keys,
+                    "right": right_keys,
+                }
+
+                # Bubble up this config and return the transformation prompt, so we can optimize the map operation
+                return (
+                    self.op_config,
+                    0.0,
+                    {
+                        "optimize_map": True,
+                        "map_prompt": extraction_prompt,
+                        "output_key": output_key,
+                        "dataset_to_transform": dataset_to_transform,
+                    },
+                )
+
+            # Print the reason for not applying a map transformation
+            self.console.log(
+                f"Reason for not synthesizing a map transformation for either left or right dataset: {reason}"
+            )
+
+        # If there are no blocking keys, generate them
+        if not left_keys or not right_keys:
+            generated_left_keys, generated_right_keys = (
+                self._generate_blocking_keys_equijoin(left_data, right_data)
+            )
+            left_keys.extend(generated_left_keys)
+            right_keys.extend(generated_right_keys)
+            left_keys = list(set(left_keys))
+            right_keys = list(set(right_keys))
+
+            # Log the generated blocking keys
+            self.console.log(
+                "[bold]Generated blocking keys (for embeddings-based blocking):[/bold]"
+            )
+            self.console.log(f"Left keys: {left_keys}")
+            self.console.log(f"Right keys: {right_keys}")
+
+        left_embeddings, _, left_embedding_cost = self._compute_embeddings(
+            left_data, keys=left_keys
+        )
+        right_embeddings, _, right_embedding_cost = self._compute_embeddings(
+            right_data, keys=right_keys
+        )
+        self.console.log(
+            f"[bold]Cost of creating embeddings on the sample: ${left_embedding_cost + right_embedding_cost:.4f}[/bold]"
+        )
+
+        similarities = self._calculate_cross_similarities(
+            left_embeddings, right_embeddings
+        )
+
+        sampled_pairs = self._sample_pairs(similarities)
+        comparison_results, comparison_cost = self._perform_comparisons_equijoin(
+            left_data, right_data, sampled_pairs
+        )
+        self._print_similarity_histogram(similarities, comparison_results)
+        attempts = 0
+        while (
+            not any(result[2] for result in comparison_results)
+            and attempts < self.max_comparison_sampling_attempts
+        ):
+            self.console.log(
+                "[yellow]No matches found in the current sample. Resampling pairs to compare...[/yellow]"
+            )
+            sampled_pairs = self._sample_pairs(similarities)
+            comparison_results, current_cost = self._perform_comparisons_equijoin(
+                left_data, right_data, sampled_pairs
+            )
+            comparison_cost += current_cost
+            self._print_similarity_histogram(similarities, comparison_results)
+            attempts += 1
+
+        if not any(result[2] for result in comparison_results):
+            # If still no matches after max_comparison_sampling_attempts attempts, use 99th percentile similarity as threshold
+            # This is a heuristic to avoid being in an infinite loop
+            # TODO: have a better plan for sampling pairs or avoiding getting into this situation
+            self.console.log(
+                f"[yellow]No matches found after {self.max_comparison_sampling_attempts} attempts. Using 99th percentile similarity as threshold.[/yellow]"
+            )
+            threshold = np.percentile([sim[2] for sim in similarities], 99)
+            # TODO: figure out how to estimate selectivity
+            estimated_selectivity = 0.0
+            self.estimated_selectivity = estimated_selectivity
+
+        else:
+            threshold, estimated_selectivity = self._find_optimal_threshold(
+                comparison_results, similarities
+            )
+            self.estimated_selectivity = estimated_selectivity
+
+        blocking_rules = self._generate_blocking_rules_equijoin(
+            left_keys, right_keys, left_data, right_data, comparison_results
+        )
+
+        if blocking_rules:
+            false_negatives, rule_selectivity = self._verify_blocking_rule_equijoin(
+                left_data,
+                right_data,
+                blocking_rules[0],
+                left_keys,
+                right_keys,
+                comparison_results,
+            )
+            if not false_negatives and rule_selectivity <= estimated_selectivity:
+                self.console.log(
+                    "[green]Blocking rule verified. No false negatives detected in the sample and selectivity is within bounds.[/green]"
+                )
+            else:
+                if false_negatives:
+                    self.console.log(
+                        f"[red]Blocking rule rejected. {len(false_negatives)} false negatives detected in the sample.[/red]"
+                    )
+                    for i, j in false_negatives[:5]:  # Show up to 5 examples
+                        self.console.log(
+                            f"  Filtered pair: Left: {{{', '.join(f'{key}: {left_data[i][key]}' for key in left_keys)}}} and Right: {{{', '.join(f'{key}: {right_data[j][key]}' for key in right_keys)}}}"
+                        )
+                    if len(false_negatives) > 5:
+                        self.console.log(f"  ... and {len(false_negatives) - 5} more.")
+                if rule_selectivity > estimated_selectivity:
+                    self.console.log(
+                        f"[red]Blocking rule rejected. Rule selectivity ({rule_selectivity:.4f}) is higher than the estimated selectivity ({estimated_selectivity:.4f}).[/red]"
+                    )
+                blocking_rules = (
+                    []
+                )  # Clear the blocking rule if it introduces false negatives or is too selective
+
+        containment_rules = self._generate_containment_rules_equijoin(
+            left_data, right_data
+        )
+        if not skip_containment_gen:
+            self.console.log(
+                f"[bold]Generated {len(containment_rules)} containment rules. Please select which ones to use as blocking conditions:[/bold]"
+            )
+            selected_containment_rules = []
+            for rule in containment_rules:
+                self.console.log(f"[green]{rule}[/green]")
+                # Temporarily stop the status
+                if self.status:
+                    self.status.stop()
+                # Use Rich's Confirm for input
+                if Confirm.ask("Use this rule?", console=self.console):
+                    selected_containment_rules.append(rule)
+                # Restart the status
+                if self.status:
+                    self.status.start()
+        else:
+            # Take first 2
+            selected_containment_rules = containment_rules[:2]
+
+        if len(containment_rules) > 0:
+            self.console.log(
+                f"[bold]Selected {len(selected_containment_rules)} containment rules for blocking.[/bold]"
+            )
+        blocking_rules.extend(selected_containment_rules)
+
+        optimized_config = self._update_config_equijoin(
+            threshold, left_keys, right_keys, blocking_rules
+        )
+        return (
+            optimized_config,
+            left_embedding_cost + right_embedding_cost + comparison_cost,
+            {},
+        )
+
+    def _should_apply_map_transformation(
+        self,
+        left_keys: List[str],
+        right_keys: List[str],
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        sample_size: int = 5,
+    ) -> Tuple[bool, str, str]:
+        # Sample data
+        left_sample = random.sample(left_data, min(sample_size, len(left_data)))
+        right_sample = random.sample(right_data, min(sample_size, len(right_data)))
+
+        # Get keys and their average lengths
+        all_left_keys = {
+            k: sum(len(str(d[k])) for d in left_sample) / len(left_sample)
+            for k in left_sample[0].keys()
+        }
+        all_right_keys = {
+            k: sum(len(str(d[k])) for d in right_sample) / len(right_sample)
+            for k in right_sample[0].keys()
+        }
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Analyze the following datasets and determine if an additional LLM transformation should be applied to generate a new key-value pair for easier joining:
+
+                Comparison prompt for the join operation: {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+                Left dataset keys and average lengths: {json.dumps(all_left_keys, indent=2)}
+                Right dataset keys and average lengths: {json.dumps(all_right_keys, indent=2)}
+
+                Left dataset sample:
+                {json.dumps(left_sample, indent=2)}
+
+                Right dataset sample:
+                {json.dumps(right_sample, indent=2)}
+
+                Current keys used for embedding-based ranking of likely matches:
+                Left keys: {left_keys}
+                Right keys: {right_keys}
+
+                Consider the following:
+                1. Are the current keys sufficient for accurate embedding-based ranking of likely matches? We don't want to use too many keys, or keys with too much information, as this will dilute the signal in the embeddings.
+                2. Are there any keys particularly long (e.g., full text fields), containing information that is not relevant for the join operation? The dataset with the longer keys should be transformed.
+                3. Would a summary or extraction of important information from long key-value pairs be beneficial? If so, the dataset with the longer keys should be transformed.
+                4. Is there a mismatch in information representation between the datasets?
+                5. Could an additional LLM-generated field improve the accuracy of embeddings or join comparisons?
+
+                If you believe an additional LLM transformation would be beneficial, specify which dataset (left or right) should be transformed and explain why. Otherwise, indicate that no additional transformation is needed and explain why the current blocking keys are sufficient.""",
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an AI expert in data analysis and entity matching.",
+            {
+                "type": "object",
+                "properties": {
+                    "apply_transformation": {"type": "boolean"},
+                    "dataset_to_transform": {
+                        "type": "string",
+                        "enum": ["left", "right", "none"],
+                    },
+                    "reason": {"type": "string"},
+                },
+                "required": ["apply_transformation", "dataset_to_transform", "reason"],
+            },
+        )
+
+        result = json.loads(response.choices[0].message.content)
+
+        return (
+            result["apply_transformation"],
+            result["dataset_to_transform"],
+            result["reason"],
+        )
+
+    def _generate_map_and_new_join_transformation(
+        self,
+        dataset_to_transform: str,
+        reason: str,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        sample_size: int = 5,
+    ) -> Tuple[str, str, str]:
+        # Sample data
+        left_sample = random.sample(left_data, min(sample_size, len(left_data)))
+        right_sample = random.sample(right_data, min(sample_size, len(right_data)))
+
+        target_data = left_sample if dataset_to_transform == "left" else right_sample
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Generate an LLM prompt to transform the {dataset_to_transform} dataset for easier joining. The transformation should create a new key-value pair.
+
+                Current comparison prompt for the join operation: {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+                Target ({dataset_to_transform}) dataset sample:
+                {json.dumps(target_data, indent=2)}
+
+                Other ({'left' if dataset_to_transform == "right" else "right"}) dataset sample:
+                {json.dumps(right_sample if dataset_to_transform == "left" else left_sample, indent=2)}
+
+                Reason for transforming {dataset_to_transform} dataset: {reason}
+
+                Please provide:
+                1. An LLM prompt to extract a smaller representation of what is relevant to the join task. The prompt should be a Jinja2 template, referring to any fields in the input data as {{{{ input.field_name }}}}. The prompt should instruct the LLM to return some **non-empty** string-valued output. The transformation should be tailored to the join task if possible, not just a generic summary of the data.
+                2. A name for the new output key that will store the transformed data.
+                3. An edited comparison prompt that leverages the new attribute created by the transformation. This prompt should be a Jinja2 template, referring to any fields in the input data as {{{{ left.field_name }}}} and {{{{ right.field_name }}}}. The prompt should be the same as the current comparison prompt, but with a new instruction that leverages the new attribute created by the transformation (in addition to the other fields in the prompt). The prompt should instruct the LLM to return a boolean-valued output, like the current comparison prompt.""",
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an AI expert in data analysis and decomposing complex data processing pipelines.",
+            {
+                "type": "object",
+                "properties": {
+                    "extraction_prompt": {"type": "string"},
+                    "output_key": {"type": "string"},
+                    "new_comparison_prompt": {"type": "string"},
+                },
+                "required": [
+                    "extraction_prompt",
+                    "output_key",
+                    "new_comparison_prompt",
+                ],
+            },
+        )
+
+        result = json.loads(response.choices[0].message.content)
+
+        return (
+            result["extraction_prompt"]
+            .replace("left.", "input.")
+            .replace("right.", "input."),
+            result["output_key"],
+            result["new_comparison_prompt"],
+        )
+
+    def _generate_blocking_keys_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        sample_size: int = 5,
+    ) -> Tuple[List[str], List[str]]:
+        # Sample data
+        left_sample = random.sample(left_data, min(sample_size, len(left_data)))
+        right_sample = random.sample(right_data, min(sample_size, len(right_data)))
+
+        # Prepare sample data for LLM
+        left_keys = list(left_sample[0].keys())
+        right_keys = list(right_sample[0].keys())
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Given the following sample data from two datasets, select appropriate blocking keys for an equijoin operation.
+                The blocking process works as follows:
+                1. We create embeddings for the selected keys from both datasets.
+                2. We use cosine similarity between these embeddings to filter pairs for more detailed LLM comparison.
+                3. Pairs with high similarity will be passed to the LLM for final comparison.
+
+                The blocking keys should have relatively short values and be useful for generating embeddings that capture the essence of potential matches.
+
+                Left dataset keys: {left_keys}
+                Right dataset keys: {right_keys}
+
+                Sample from left dataset:
+                {json.dumps(left_sample, indent=2)}
+
+                Sample from right dataset:
+                {json.dumps(right_sample, indent=2)}
+
+                For context, here is the comparison prompt that will be used for the more detailed LLM comparison:
+                {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+                Please select one or more keys from each dataset that would be suitable for blocking. The keys should contain information that's likely to be similar in matching records and align with the comparison prompt's focus.""",
+            }
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an expert in entity matching and database operations.",
+            {
+                "type": "object",
+                "properties": {
+                    "left_blocking_keys": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "description": "List of selected blocking keys from the left dataset",
+                    },
+                    "right_blocking_keys": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "description": "List of selected blocking keys from the right dataset",
+                    },
+                },
+                "required": ["left_blocking_keys", "right_blocking_keys"],
+            },
+        )
+
+        result = json.loads(response.choices[0].message.content)
+        left_blocking_keys = result["left_blocking_keys"]
+        right_blocking_keys = result["right_blocking_keys"]
+
+        return left_blocking_keys, right_blocking_keys
+
+    def _compute_embeddings(
+        self,
+        input_data: List[Dict[str, Any]],
+        keys: List[str] = None,
+        is_join: bool = True,
+    ) -> Tuple[List[List[float]], List[str], float]:
+        if keys is None:
+            keys = self.op_config.get("blocking_keys", [])
+            if not keys:
+                prompt_template = self.op_config.get("comparison_prompt", "")
+                prompt_vars = extract_jinja_variables(prompt_template)
+                # Get rid of input, input1, input2
+                prompt_vars = [
+                    var
+                    for var in prompt_vars
+                    if var not in ["input", "input1", "input2"]
+                ]
+
+                # strip all things before . in the prompt_vars
+                keys += list(set([var.split(".")[-1] for var in prompt_vars]))
+            if not keys:
+                self.console.log(
+                    "[yellow]Warning: No blocking keys found. Using all keys for blocking.[/yellow]"
+                )
+                keys = list(input_data[0].keys())
+
+        model_input_context_length = model_cost.get(
+            self.op_config.get("embedding_model", "text-embedding-3-small"), {}
+        ).get("max_input_tokens", 8192)
+        texts = [
+            " ".join(str(item[key]) for key in keys if key in item)[
+                :model_input_context_length
+            ]
+            for item in input_data
+        ]
+
+        embeddings = []
+        total_cost = 0
+        batch_size = 2000
+        for i in range(0, len(texts), batch_size):
+            batch = texts[i : i + batch_size]
+            self.console.log(
+                f"[cyan]Processing batch {i//batch_size + 1} of {len(texts)//batch_size + 1}[/cyan]"
+            )
+            response = self.runner.api.gen_embedding(
+                model=self.op_config.get("embedding_model", "text-embedding-3-small"),
+                input=batch,
+            )
+            embeddings.extend([data["embedding"] for data in response["data"]])
+            total_cost += completion_cost(response)
+        embeddings = [data["embedding"] for data in response["data"]]
+        cost = completion_cost(response)
+        return embeddings, keys, cost
+
+    def _calculate_cosine_similarities(
+        self, embeddings: List[List[float]]
+    ) -> List[Tuple[int, int, float]]:
+        embeddings_array = np.array(embeddings)
+        norms = np.linalg.norm(embeddings_array, axis=1)
+        dot_products = np.dot(embeddings_array, embeddings_array.T)
+        similarities_matrix = dot_products / np.outer(norms, norms)
+        i, j = np.triu_indices(len(embeddings), k=1)
+        similarities = list(
+            zip(i.tolist(), j.tolist(), similarities_matrix[i, j].tolist())
+        )
+        return similarities
+
+    def _print_similarity_histogram(
+        self,
+        similarities: List[Tuple[int, int, float]],
+        comparison_results: List[Tuple[int, int, bool]],
+    ):
+        flat_similarities = [sim[-1] for sim in similarities if sim[-1] != 1]
+        hist, bin_edges = np.histogram(flat_similarities, bins=20)
+        max_bar_width, max_count = 50, max(hist)
+        normalized_hist = [int(count / max_count * max_bar_width) for count in hist]
+
+        # Create a dictionary to store true labels
+        true_labels = {(i, j): is_match for i, j, is_match in comparison_results}
+
+        self.console.log("\n[bold]Embedding Cosine Similarity Distribution:[/bold]")
+        for i, count in enumerate(normalized_hist):
+            bar = "█" * count
+            label = f"{bin_edges[i]:.2f}-{bin_edges[i+1]:.2f}"
+
+            # Count true matches and not matches in this bin
+            true_matches = 0
+            not_matches = 0
+            labeled_count = 0
+            for sim in similarities:
+                if bin_edges[i] <= sim[2] < bin_edges[i + 1]:
+                    if (sim[0], sim[1]) in true_labels:
+                        labeled_count += 1
+                        if true_labels[(sim[0], sim[1])]:
+                            true_matches += 1
+                        else:
+                            not_matches += 1
+
+            # Calculate percentages of labeled pairs
+            if labeled_count > 0:
+                true_match_percent = (true_matches / labeled_count) * 100
+                not_match_percent = (not_matches / labeled_count) * 100
+            else:
+                true_match_percent = 0
+                not_match_percent = 0
+
+            self.console.log(
+                f"{label}: {bar} "
+                f"(Labeled: {labeled_count}/{hist[i]}, [green]{true_match_percent:.1f}% match[/green], [red]{not_match_percent:.1f}% not match[/red])"
+            )
+        self.console.log("\n")
+
+    def _sample_pairs(
+        self, similarities: List[Tuple[int, int, float]]
+    ) -> List[Tuple[int, int]]:
+        # Sort similarities in descending order
+        sorted_similarities = sorted(similarities, key=lambda x: x[2], reverse=True)
+
+        # Calculate weights using exponential weighting with self.sampling_weight
+        similarities_array = np.array([sim[2] for sim in sorted_similarities])
+        weights = np.exp(self.sampling_weight * similarities_array)
+        weights /= weights.sum()  # Normalize weights to sum to 1
+
+        # Sample pairs based on the calculated weights
+        sampled_indices = np.random.choice(
+            len(sorted_similarities),
+            size=min(self.sample_size, len(sorted_similarities)),
+            replace=False,
+            p=weights,
+        )
+
+        sampled_pairs = [
+            (sorted_similarities[i][0], sorted_similarities[i][1])
+            for i in sampled_indices
+        ]
+        return sampled_pairs
+
+    def _calculate_cross_similarities(
+        self, left_embeddings: List[List[float]], right_embeddings: List[List[float]]
+    ) -> List[Tuple[int, int, float]]:
+        left_array = np.array(left_embeddings)
+        right_array = np.array(right_embeddings)
+        dot_product = np.dot(left_array, right_array.T)
+        norm_left = np.linalg.norm(left_array, axis=1)
+        norm_right = np.linalg.norm(right_array, axis=1)
+        similarities = dot_product / np.outer(norm_left, norm_right)
+        return [
+            (i, j, sim)
+            for i, row in enumerate(similarities)
+            for j, sim in enumerate(row)
+        ]
+
+    def _perform_comparisons_resolve(
+        self, input_data: List[Dict[str, Any]], pairs: List[Tuple[int, int]]
+    ) -> Tuple[List[Tuple[int, int, bool]], float]:
+        comparisons, total_cost = [], 0
+        op = ResolveOperation(
+            self.runner,
+            self.op_config,
+            self.runner.default_model,
+            self.max_threads,
+            self.console,
+            self.status,
+        )
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(
+                    op.compare_pair,
+                    self.op_config["comparison_prompt"],
+                    self.op_config.get(
+                        "comparison_model", self.config.get("model", "gpt-4o-mini")
+                    ),
+                    input_data[i],
+                    input_data[j],
+                )
+                for i, j in pairs
+            ]
+            for future, (i, j) in zip(futures, pairs):
+                is_match, cost, _ = future.result()
+                comparisons.append((i, j, is_match))
+                total_cost += cost
+
+        self.console.log(
+            f"[bold]Cost of pairwise comparisons on the sample: ${total_cost:.4f}[/bold]"
+        )
+        return comparisons, total_cost
+
+    def _perform_comparisons_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        pairs: List[Tuple[int, int]],
+    ) -> Tuple[List[Tuple[int, int, bool]], float]:
+        comparisons, total_cost = [], 0
+        op = EquijoinOperation(
+            self.runner,
+            self.op_config,
+            self.runner.default_model,
+            self.max_threads,
+            self.console,
+            self.status,
+        )
+        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
+            futures = [
+                executor.submit(
+                    op.compare_pair,
+                    self.op_config["comparison_prompt"],
+                    self.op_config.get(
+                        "comparison_model", self.config.get("model", "gpt-4o-mini")
+                    ),
+                    left_data[i],
+                    right_data[j] if right_data else left_data[j],
+                )
+                for i, j in pairs
+            ]
+            for future, (i, j) in zip(futures, pairs):
+                is_match, cost = future.result()
+                comparisons.append((i, j, is_match))
+                total_cost += cost
+
+        self.console.log(
+            f"[bold]Cost of pairwise comparisons on the sample: ${total_cost:.4f}[/bold]"
+        )
+        return comparisons, total_cost
+
+    def _find_optimal_threshold(
+        self,
+        comparisons: List[Tuple[int, int, bool]],
+        similarities: List[Tuple[int, int, float]],
+    ) -> Tuple[float, float, float]:
+        true_labels = np.array([comp[2] for comp in comparisons])
+        sim_dict = {(i, j): sim for i, j, sim in similarities}
+        sim_scores = np.array([sim_dict[(i, j)] for i, j, _ in comparisons])
+
+        thresholds = np.linspace(0, 1, 100)
+        precisions, recalls = [], []
+
+        for threshold in thresholds:
+            predictions = sim_scores >= threshold
+            tp = np.sum(predictions & true_labels)
+            fp = np.sum(predictions & ~true_labels)
+            fn = np.sum(~predictions & true_labels)
+
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+
+            precisions.append(precision)
+            recalls.append(recall)
+
+        valid_indices = [i for i, r in enumerate(recalls) if r >= self.target_recall]
+        if not valid_indices:
+            optimal_threshold = float(thresholds[np.argmax(recalls)])
+        else:
+            optimal_threshold = float(thresholds[max(valid_indices)])
+
+        # Improved selectivity estimation
+        all_similarities = np.array([s[2] for s in similarities])
+        sampled_similarities = sim_scores
+
+        # Calculate sampling probabilities
+        sampling_probs = np.exp(self.sampling_weight * sampled_similarities)
+        sampling_probs /= sampling_probs.sum()
+
+        # Estimate selectivity using importance sampling
+        weights = 1 / (len(all_similarities) * sampling_probs)
+        numerator = np.sum(weights * true_labels)
+        denominator = np.sum(weights)
+        selectivity_estimate = numerator / denominator
+
+        self.console.log(
+            "[bold cyan]┌─ Estimated Self-Join Selectivity ─────────────────────────┐[/bold cyan]"
+        )
+        self.console.log(
+            f"[bold cyan]│[/bold cyan] [yellow]Target Recall:[/yellow] {self.target_recall:.0%}"
+        )
+        self.console.log(
+            f"[bold cyan]│[/bold cyan] [yellow]Estimate:[/yellow] {selectivity_estimate:.4f}"
+        )
+        self.console.log(
+            "[bold cyan]└───────────────────────────────────────────────────────────┘[/bold cyan]"
+        )
+        self.console.log(
+            f"[bold]Chosen similarity threshold for blocking: {optimal_threshold:.4f}[/bold]"
+        )
+
+        return round(optimal_threshold, 4), selectivity_estimate
+
+    def _generate_blocking_rules(
+        self,
+        blocking_keys: List[str],
+        input_data: List[Dict[str, Any]],
+        comparisons: List[Tuple[int, int, bool]],
+    ) -> List[str]:
+        # Sample 2 true and 2 false comparisons
+        true_comparisons = [comp for comp in comparisons if comp[2]][:2]
+        false_comparisons = [comp for comp in comparisons if not comp[2]][:2]
+        sample_datas = [
+            (
+                {key: input_data[i][key] for key in blocking_keys},
+                {key: input_data[j][key] for key in blocking_keys},
+                is_match,
+            )
+            for i, j, is_match in true_comparisons + false_comparisons
+        ]
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Given the following sample comparisons between entities, generate a single-line Python statement that acts as a blocking rule for entity resolution. This rule will be used in the form: `eval(blocking_rule, {{"input1": item1, "input2": item2}})`.
+
+    Sample comparisons (note: these are just a few examples and may not represent all possible cases):
+    {json.dumps(sample_datas, indent=2)}
+
+    For context, here is the comparison prompt that will be used for the more expensive, detailed comparison:
+    {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+    Please generate ONE one-line blocking rule that adheres to the following criteria:
+    1. The rule should evaluate to True if the entities are possibly a match and require further comparison.
+    2. The rule should evaluate to False ONLY if the entities are definitely not a match.
+    3. The rule must be a single Python expression that can be evaluated using the eval() function.
+    4. The rule should be much faster to evaluate than the full comparison prompt.
+    5. The rule should capture the essence of the comparison prompt but in a simplified manner.
+    6. The rule should be general enough to work well on the entire dataset, not just these specific examples.
+    7. The rule should handle inconsistent casing by using string methods like .lower() when comparing string values.
+    8. The rule should err on the side of inclusivity - it's better to have false positives than false negatives.
+
+    Example structure of a one-line blocking rule:
+    "(condition1) or (condition2) or (condition3)"
+
+    Where conditions could be comparisons like:
+    "input1['field'].lower() == input2['field'].lower()"
+    "abs(len(input1['text']) - len(input2['text'])) <= 5"
+    "any(word in input1['description'].lower() for word in input2['description'].lower().split())"
+
+    If there's no clear rule that can be generated based on the given information, return the string "True" to ensure all pairs are compared.
+
+    Remember, the primary goal of the blocking rule is to safely reduce the number of comparisons by quickly identifying pairs that are definitely not matches, while keeping all potential matches for further evaluation.""",
+            }
+        ]
+
+        for attempt in range(self.agent_max_retries):  # Up to 3 attempts
+            # Generate blocking rule using the LLM
+            response = self.llm_client.generate(
+                messages,
+                "You are an expert in entity resolution and Python programming. Your task is to generate one efficient blocking rule based on the given sample comparisons and data structure.",
+                {
+                    "type": "object",
+                    "properties": {
+                        "blocking_rule": {
+                            "type": "string",
+                            "description": "One-line Python statement acting as a blocking rule",
+                        }
+                    },
+                    "required": ["blocking_rule"],
+                },
+            )
+
+            # Extract the blocking rule from the LLM response
+            blocking_rule = response.choices[0].message.content
+            blocking_rule = json.loads(blocking_rule).get("blocking_rule")
+
+            if blocking_rule:
+                self.console.log("")  # Print a newline
+
+                if blocking_rule.strip() == "True":
+                    self.console.log(
+                        "[yellow]No suitable blocking rule could be found. Proceeding without a blocking rule.[/yellow]"
+                    )
+                    return []
+
+                self.console.log(
+                    f"[bold]Generated blocking rule (Attempt {attempt + 1}):[/bold] {blocking_rule}"
+                )
+
+                # Test the blocking rule
+                filtered_pairs = self._test_blocking_rule(
+                    input_data, blocking_keys, blocking_rule, comparisons
+                )
+
+                if not filtered_pairs:
+                    self.console.log(
+                        "[green]Blocking rule looks good! No known matches were filtered out.[/green]"
+                    )
+                    return [blocking_rule]
+                else:
+                    feedback = f"The previous rule incorrectly filtered out {len(filtered_pairs)} known matches. "
+                    feedback += (
+                        "Here are up to 3 examples of incorrectly filtered pairs:\n"
+                    )
+                    for i, j in filtered_pairs[:3]:
+                        feedback += f"Item 1: {json.dumps({key: input_data[i][key] for key in blocking_keys})}\nItem 2: {json.dumps({key: input_data[j][key] for key in blocking_keys})}\n"
+                        feedback += "These pairs are known matches but were filtered out by the rule.\n"
+                    feedback += "Please generate a new rule that doesn't filter out these matches."
+
+                    messages.append({"role": "assistant", "content": blocking_rule})
+                    messages.append({"role": "user", "content": feedback})
+            else:
+                self.console.log("[yellow]No blocking rule generated.[/yellow]")
+                return []
+
+        self.console.log(
+            f"[yellow]Failed to generate a suitable blocking rule after {self.agent_max_retries} attempts. Proceeding without a blocking rule.[/yellow]"
+        )
+        return []
+
+    def _test_blocking_rule(
+        self,
+        input_data: List[Dict[str, Any]],
+        blocking_keys: List[str],
+        blocking_rule: str,
+        comparisons: List[Tuple[int, int, bool]],
+    ) -> List[Tuple[int, int]]:
+        def apply_blocking_rule(item1, item2):
+            try:
+                return eval(blocking_rule, {"input1": item1, "input2": item2})
+            except Exception as e:
+                self.console.log(f"[red]Error applying blocking rule: {e}[/red]")
+                return True  # If there's an error, we default to comparing the pair
+
+        filtered_pairs = []
+
+        for i, j, is_match in comparisons:
+            if is_match:
+                item1 = {
+                    k: input_data[i][k] for k in blocking_keys if k in input_data[i]
+                }
+                item2 = {
+                    k: input_data[j][k] for k in blocking_keys if k in input_data[j]
+                }
+
+                if not apply_blocking_rule(item1, item2):
+                    filtered_pairs.append((i, j))
+
+        if filtered_pairs:
+            self.console.log(
+                f"[yellow italic]LLM Correction: The blocking rule incorrectly filtered out {len(filtered_pairs)} known positive matches.[/yellow italic]"
+            )
+            for i, j in filtered_pairs[:5]:  # Show up to 5 examples
+                self.console.log(
+                    f"  Incorrectly filtered pair 1: {json.dumps({key: input_data[i][key] for key in blocking_keys})}  and pair 2: {json.dumps({key: input_data[j][key] for key in blocking_keys})}"
+                )
+            if len(filtered_pairs) > 5:
+                self.console.log(
+                    f"  ... and {len(filtered_pairs) - 5} more incorrect pairs."
+                )
+
+        return filtered_pairs
+
+    def _generate_containment_rules_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+    ) -> List[str]:
+        # Get all available keys from the sample data
+        left_keys = set(left_data[0].keys())
+        right_keys = set(right_data[0].keys())
+
+        # Find the keys that are in the config's prompt
+        try:
+            left_prompt_keys = set(
+                self.op_config.get("comparison_prompt", "")
+                .split("{{ left.")[1]
+                .split(" }}")[0]
+                .split(".")
+            )
+        except Exception as e:
+            self.console.log(f"[red]Error parsing comparison prompt: {e}[/red]")
+            left_prompt_keys = left_keys
+
+        try:
+            right_prompt_keys = set(
+                self.op_config.get("comparison_prompt", "")
+                .split("{{ right.")[1]
+                .split(" }}")[0]
+                .split(".")
+            )
+        except Exception as e:
+            self.console.log(f"[red]Error parsing comparison prompt: {e}[/red]")
+            right_prompt_keys = right_keys
+
+        # Sample a few records from each dataset
+        sample_left = random.sample(left_data, min(3, len(left_data)))
+        sample_right = random.sample(right_data, min(3, len(right_data)))
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are an AI assistant tasked with generating containment-based blocking rules for an equijoin operation.",
+            },
+            {
+                "role": "user",
+                "content": f"""Generate multiple one-line Python statements that act as containment-based blocking rules for equijoin. These rules will be used in the form: `eval(blocking_rule, {{"left": item1, "right": item2}})`.
+
+Available keys in left dataset: {', '.join(left_keys)}
+Available keys in right dataset: {', '.join(right_keys)}
+
+Sample data from left dataset:
+{json.dumps(sample_left, indent=2)}
+
+Sample data from right dataset:
+{json.dumps(sample_right, indent=2)}
+
+Comparison prompt used for detailed comparison:
+{self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+Please generate multiple one-line blocking rules that adhere to the following criteria:
+1. The rules should focus on containment relationships between fields in the left and right datasets. Containment can mean that the left field contains all the words in the right field, or the right field contains all the words in the left field.
+2. Each rule should evaluate to True if there's a potential match based on containment, False otherwise.
+3. Rules must be single Python expressions that can be evaluated using the eval() function.
+4. Rules should handle inconsistent casing by using string methods like .lower() when comparing string values.
+5. Consider the length of the fields when generating rules: for example, if the left field is much longer than the right field, it's more likely to contain all the words in the right field.
+
+Example structures of containment-based blocking rules:
+"all(word in left['{{left_key}}'].lower() for word in right['{{right_key}}'].lower().split())"
+"any(word in right['{{right_key}}'].lower().split() for word in left['{{left_key}}'].lower().split())"
+
+Please provide 3-5 different containment-based blocking rules, based on the keys and sample data provided. Prioritize rules with the following keys: {', '.join(left_prompt_keys)} and {', '.join(right_prompt_keys)}.""",
+            },
+        ]
+
+        response = self.llm_client.generate(
+            messages,
+            "You are an expert in data matching and Python programming.",
+            {
+                "type": "object",
+                "properties": {
+                    "containment_rules": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "description": "List of containment-based blocking rules as Python expressions",
+                    }
+                },
+                "required": ["containment_rules"],
+            },
+        )
+
+        containment_rules = response.choices[0].message.content
+        containment_rules = json.loads(containment_rules).get("containment_rules")
+        return containment_rules
+
+    def _generate_blocking_rules_equijoin(
+        self,
+        left_keys: List[str],
+        right_keys: List[str],
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        comparisons: List[Tuple[int, int, bool]],
+    ) -> List[str]:
+        if not left_keys or not right_keys:
+            left_keys = list(left_data[0].keys())
+            right_keys = list(right_data[0].keys())
+
+        # Sample 2 true and 2 false comparisons
+        true_comparisons = [comp for comp in comparisons if comp[2]][:2]
+        false_comparisons = [comp for comp in comparisons if not comp[2]][:2]
+        sample_datas = [
+            (
+                {key: left_data[i][key] for key in left_keys if key in left_data[i]},
+                {key: right_data[j][key] for key in right_keys if key in right_data[j]},
+                is_match,
+            )
+            for i, j, is_match in true_comparisons + false_comparisons
+        ]
+
+        messages = [
+            {
+                "role": "user",
+                "content": f"""Given the following sample comparisons between entities, generate a single-line Python statement that acts as a blocking rule for equijoin. This rule will be used in the form: `eval(blocking_rule, {{"left": item1, "right": item2}})`.
+
+    Sample comparisons (note: these are just a few examples and may not represent all possible cases):
+    {json.dumps(sample_datas, indent=2)}
+
+    For context, here is the comparison prompt that will be used for the more expensive, detailed comparison:
+    {self.op_config.get('comparison_prompt', 'No comparison prompt provided.')}
+
+    Please generate ONE one-line blocking rule that adheres to the following criteria:
+    1. The rule should evaluate to True if the entities are possibly a match and require further comparison.
+    2. The rule should evaluate to False ONLY if the entities are definitely not a match.
+    3. The rule must be a single Python expression that can be evaluated using the eval() function.
+    4. The rule should be much faster to evaluate than the full comparison prompt.
+    5. The rule should capture the essence of the comparison prompt but in a simplified manner.
+    6. The rule should be general enough to work well on the entire dataset, not just these specific examples.
+    7. The rule should handle inconsistent casing by using string methods like .lower() when comparing string values.
+    8. The rule should err on the side of inclusivity - it's better to have false positives than false negatives.
+
+    Example structure of a one-line blocking rule:
+    "(condition1) or (condition2) or (condition3)"
+
+    Where conditions could be comparisons like:
+    "left['{left_keys[0]}'].lower() == right['{right_keys[0]}'].lower()"
+    "abs(len(left['{left_keys[0]}']) - len(right['{right_keys[0]}'])) <= 5"
+    "any(word in left['{left_keys[0]}'].lower() for word in right['{right_keys[0]}'].lower().split())"
+
+    If there's no clear rule that can be generated based on the given information, return the string "True" to ensure all pairs are compared.
+
+    Remember, the primary goal of the blocking rule is to safely reduce the number of comparisons by quickly identifying pairs that are definitely not matches, while keeping all potential matches for further evaluation.""",
+            }
+        ]
+
+        for attempt in range(self.agent_max_retries):
+            response = self.llm_client.generate(
+                messages,
+                "You are an expert in entity resolution and Python programming. Your task is to generate one efficient blocking rule based on the given sample comparisons and data structure.",
+                {
+                    "type": "object",
+                    "properties": {
+                        "blocking_rule": {
+                            "type": "string",
+                            "description": "One-line Python statement acting as a blocking rule",
+                        }
+                    },
+                    "required": ["blocking_rule"],
+                },
+            )
+
+            blocking_rule = response.choices[0].message.content
+            blocking_rule = json.loads(blocking_rule).get("blocking_rule")
+
+            if blocking_rule:
+                self.console.log("")
+
+                if blocking_rule.strip() == "True":
+                    self.console.log(
+                        "[yellow]No suitable blocking rule could be found. Proceeding without a blocking rule.[/yellow]"
+                    )
+                    return []
+
+                self.console.log(
+                    f"[bold]Generated blocking rule (Attempt {attempt + 1}):[/bold] {blocking_rule}"
+                )
+
+                # Test the blocking rule
+                filtered_pairs = self._test_blocking_rule_equijoin(
+                    left_data,
+                    right_data,
+                    left_keys,
+                    right_keys,
+                    blocking_rule,
+                    comparisons,
+                )
+
+                if not filtered_pairs:
+                    self.console.log(
+                        "[green]Blocking rule looks good! No known matches were filtered out.[/green]"
+                    )
+                    return [blocking_rule]
+                else:
+                    feedback = f"The previous rule incorrectly filtered out {len(filtered_pairs)} known matches. "
+                    feedback += (
+                        "Here are up to 3 examples of incorrectly filtered pairs:\n"
+                    )
+                    for i, j in filtered_pairs[:3]:
+                        feedback += f"Left: {json.dumps({key: left_data[i][key] for key in left_keys})}\n"
+                        feedback += f"Right: {json.dumps({key: right_data[j][key] for key in right_keys})}\n"
+                        feedback += "These pairs are known matches but were filtered out by the rule.\n"
+                    feedback += "Please generate a new rule that doesn't filter out these matches."
+
+                    messages.append({"role": "assistant", "content": blocking_rule})
+                    messages.append({"role": "user", "content": feedback})
+            else:
+                self.console.log("[yellow]No blocking rule generated.[/yellow]")
+                return []
+
+        self.console.log(
+            f"[yellow]Failed to generate a suitable blocking rule after {self.agent_max_retries} attempts. Proceeding without a blocking rule.[/yellow]"
+        )
+        return []
+
+    def _test_blocking_rule_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        left_keys: List[str],
+        right_keys: List[str],
+        blocking_rule: str,
+        comparisons: List[Tuple[int, int, bool]],
+    ) -> List[Tuple[int, int]]:
+        def apply_blocking_rule(left, right):
+            try:
+                return eval(blocking_rule, {"left": left, "right": right})
+            except Exception as e:
+                self.console.log(f"[red]Error applying blocking rule: {e}[/red]")
+                return True  # If there's an error, we default to comparing the pair
+
+        filtered_pairs = []
+
+        for i, j, is_match in comparisons:
+            if is_match:
+                left = left_data[i]
+                right = right_data[j]
+                if not apply_blocking_rule(left, right):
+                    filtered_pairs.append((i, j))
+
+        if filtered_pairs:
+            self.console.log(
+                f"[yellow italic]LLM Correction: The blocking rule incorrectly filtered out {len(filtered_pairs)} known positive matches.[/yellow italic]"
+            )
+            for i, j in filtered_pairs[:5]:  # Show up to 5 examples
+                left_dict = {key: left_data[i][key] for key in left_keys}
+                right_dict = {key: right_data[j][key] for key in right_keys}
+                self.console.log(
+                    f"  Incorrectly filtered pair - Left: {json.dumps(left_dict)}  Right: {json.dumps(right_dict)}"
+                )
+            if len(filtered_pairs) > 5:
+                self.console.log(
+                    f"  ... and {len(filtered_pairs) - 5} more incorrect pairs."
+                )
+
+        return filtered_pairs
+
+    def _verify_blocking_rule_equijoin(
+        self,
+        left_data: List[Dict[str, Any]],
+        right_data: List[Dict[str, Any]],
+        blocking_rule: str,
+        left_keys: List[str],
+        right_keys: List[str],
+        comparison_results: List[Tuple[int, int, bool]],
+    ) -> Tuple[List[Tuple[int, int]], float]:
+        def apply_blocking_rule(left, right):
+            try:
+                return eval(blocking_rule, {"left": left, "right": right})
+            except Exception as e:
+                self.console.log(f"[red]Error applying blocking rule: {e}[/red]")
+                return True  # If there's an error, we default to comparing the pair
+
+        false_negatives = []
+        total_pairs = 0
+        blocked_pairs = 0
+
+        for i, j, is_match in comparison_results:
+            total_pairs += 1
+            left = left_data[i]
+            right = right_data[j]
+            if apply_blocking_rule(left, right):
+                blocked_pairs += 1
+                if is_match:
+                    false_negatives.append((i, j))
+
+        rule_selectivity = blocked_pairs / total_pairs if total_pairs > 0 else 0
+
+        return false_negatives, rule_selectivity
+
+    def _update_config_equijoin(
+        self,
+        threshold: float,
+        left_keys: List[str],
+        right_keys: List[str],
+        blocking_rules: List[str],
+    ) -> Dict[str, Any]:
+        optimized_config = self.op_config.copy()
+        optimized_config["blocking_keys"] = {
+            "left": left_keys,
+            "right": right_keys,
+        }
+        optimized_config["blocking_threshold"] = threshold
+        if blocking_rules:
+            optimized_config["blocking_conditions"] = blocking_rules
+        if "embedding_model" not in optimized_config:
+            optimized_config["embedding_model"] = "text-embedding-3-small"
+        return optimized_config
+
+    def _verify_blocking_rule(
+        self,
+        input_data: List[Dict[str, Any]],
+        blocking_rule: str,
+        blocking_keys: List[str],
+        comparison_results: List[Tuple[int, int, bool]],
+    ) -> Tuple[List[Tuple[int, int]], float]:
+        def apply_blocking_rule(item1, item2):
+            try:
+                return eval(blocking_rule, {"input1": item1, "input2": item2})
+            except Exception as e:
+                self.console.log(f"[red]Error applying blocking rule: {e}[/red]")
+                return True  # If there's an error, we default to comparing the pair
+
+        false_negatives = []
+        total_pairs = 0
+        blocked_pairs = 0
+
+        for i, j, is_match in comparison_results:
+            total_pairs += 1
+            item1 = {k: input_data[i][k] for k in blocking_keys if k in input_data[i]}
+            item2 = {k: input_data[j][k] for k in blocking_keys if k in input_data[j]}
+
+            if apply_blocking_rule(item1, item2):
+                blocked_pairs += 1
+                if is_match:
+                    false_negatives.append((i, j))
+
+        rule_selectivity = blocked_pairs / total_pairs if total_pairs > 0 else 0
+
+        return false_negatives, rule_selectivity
+
+    def _update_config(
+        self, threshold: float, blocking_keys: List[str], blocking_rules: List[str]
+    ) -> Dict[str, Any]:
+        optimized_config = self.op_config.copy()
+        optimized_config["blocking_keys"] = blocking_keys
+        optimized_config["blocking_threshold"] = threshold
+        if blocking_rules:
+            optimized_config["blocking_conditions"] = blocking_rules
+        if "embedding_model" not in optimized_config:
+            optimized_config["embedding_model"] = "text-embedding-3-small"
+        return optimized_config
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ should_optimize(input_data) + +

+ + +
+ +

Determine if the given operation configuration should be optimized.

+ +
+ Source code in docetl/optimizers/join_optimizer.py +
375
+376
+377
+378
+379
+380
+381
+382
+383
+384
+385
+386
+387
+388
+389
+390
+391
+392
+393
+394
+395
+396
+397
+398
+399
+400
+401
+402
+403
+404
+405
+406
+407
+408
+409
+410
+411
+412
+413
+414
+415
+416
+417
+418
+419
+420
+421
+422
+423
+424
+425
+426
+427
+428
+429
+430
+431
+432
+433
+434
+435
+436
def should_optimize(self, input_data: List[Dict[str, Any]]) -> Tuple[bool, str]:
+    """
+    Determine if the given operation configuration should be optimized.
+
+    Args:
+        input_data: Records passed on to the duplicate-detection helpers.
+
+    Returns:
+        A (should_optimize, explanation) pair. The explanation is an empty
+        string except in the "empty" operation branch, where it carries the
+        text produced by whichever check decided the outcome.
+
+    Raises:
+        ValueError: If the operation is marked "empty" but its intermediates
+            hold no reduce key.
+    """
+    # A config lacking blocking conditions or a blocking threshold has not been
+    # tuned yet, so it is always reported as needing optimization.
+    if not self.op_config.get("blocking_conditions") or not self.op_config.get(
+        "blocking_threshold"
+    ):
+        return True, ""
+
+    # Check if the operation is marked as empty
+    elif self.op_config.get("empty", False):
+        # Extract the map prompt from the intermediates
+        map_prompt = self.op_config["_intermediates"]["map_prompt"]
+        reduce_key = self.op_config["_intermediates"]["reduce_key"]
+
+        if reduce_key is None:
+            raise ValueError(
+                "[yellow]Warning: No reduce key found in intermediates for synthesized resolve operation.[/yellow]"
+            )
+
+        # Default verdict: deduplication is needed unless a check below says otherwise.
+        dedup = True
+        explanation = "There is a reduce operation that does not follow a resolve operation. Consider adding a resolve operation to deduplicate the data."
+
+        if map_prompt:
+            # Analyze the map prompt
+            analysis, explanation = self._analyze_map_prompt_categorization(
+                map_prompt
+            )
+
+            # A truthy analysis clears the default verdict; the later checks
+            # only run while dedup is False.
+            if analysis:
+                dedup = False
+        else:
+            self.console.log(
+                "[yellow]No map prompt found in intermediates for analysis.[/yellow]"
+            )
+
+        # TODO: figure out why this would ever be the case
+        if not map_prompt:
+            map_prompt = "N/A"
+
+        # Second check: let the duplicate-key helper decide from the data.
+        if dedup is False:
+            dedup, explanation = self._determine_duplicate_keys(
+                input_data, reduce_key, map_prompt
+            )
+
+        # Now do the last attempt of pairwise comparisons
+        if dedup is False:
+            # Sample up to 20 random pairs of keys for duplicate analysis
+            sampled_pairs = self._sample_random_pairs(input_data, 20)
+
+            # Use LLM to check for duplicates
+            duplicates_found, explanation = self._check_duplicates_with_llm(
+                input_data, sampled_pairs, reduce_key, map_prompt
+            )
+
+            if duplicates_found:
+                dedup = True
+
+        return dedup, explanation
+
+    # Blocking settings are present and the operation is not marked empty.
+    return False, ""
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/api-reference/python/index.html b/api-reference/python/index.html new file mode 100644 index 00000000..49e44c0a --- /dev/null +++ b/api-reference/python/index.html @@ -0,0 +1,3457 @@ + + + + + + + + + + + + + + + + + + + + + + + Python API - docetl docs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + +
+ + + + + + + +
+ +
+ + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Python API

+

Operations

+ + +
+ + + +

+ docetl.schemas.MapOp = map.MapOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.ResolveOp = resolve.ResolveOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.ReduceOp = reduce.ReduceOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.ParallelMapOp = map.ParallelMapOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.FilterOp = filter.FilterOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.EquijoinOp = equijoin.EquijoinOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.SplitOp = split.SplitOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.GatherOp = gather.GatherOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.UnnestOp = unnest.UnnestOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.SampleOp = sample.SampleOperation.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.ClusterOp = cluster.ClusterOperation.schema + + + module-attribute + + +

+ + +
+
+ +

Dataset and Pipeline

+ + +
+ + + +

+ docetl.schemas.Dataset = dataset.Dataset.schema + + + module-attribute + + +

+ + +
+
+ +
+ +
+ + + +

+ docetl.schemas.ParsingTool + + +

+ + +
+

+ Bases: BaseModel

+ + +

Represents a parsing tool used for custom data parsing in the pipeline.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
name + str + +
+

The name of the parsing tool. This should be unique within the pipeline configuration.

+
+
function_code + str + +
+

The Python code defining the parsing function. This code will be executed + to parse the input data according to the specified logic. It should return a list of strings, where each string is its own document.

+
+
+ + +
+ Example +
parsing_tools:
+  - name: ocr_parser
+    function_code: |
+      import pytesseract
+      from pdf2image import convert_from_path
+      def ocr_parser(filename: str) -> List[str]:
+          images = convert_from_path(filename)
+          text = ""
+          for image in images:
+              text += pytesseract.image_to_string(image)
+          return [text]
+
+
+ + + + + +
+ Source code in docetl/base_schemas.py +
20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
class ParsingTool(BaseModel):
+    """
+    Represents a parsing tool used for custom data parsing in the pipeline.
+
+    Attributes:
+        name (str): The name of the parsing tool. This should be unique within the pipeline configuration.
+        function_code (str): The Python code defining the parsing function. This code will be executed
+                             to parse the input data according to the specified logic. It should return a list of strings, where each string is its own document.
+
+    Example:
+        ```yaml
+        parsing_tools:
+          - name: ocr_parser
+            function_code: |
+              import pytesseract
+              from pdf2image import convert_from_path
+              def ocr_parser(filename: str) -> List[str]:
+                  images = convert_from_path(filename)
+                  text = ""
+                  for image in images:
+                      text += pytesseract.image_to_string(image)
+                  return [text]
+        ```
+    """
+
+    # Identifier of the tool; expected to be unique within the pipeline config.
+    name: str
+    # Python source text of the parsing function (see the YAML example above).
+    function_code: str
+
+
+ + + +
+ + + + + + + + + + + +
+ +
+ +
+ +
+ + + +

+ docetl.schemas.PipelineStep + + +

+ + +
+

+ Bases: BaseModel

+ + +

Represents a step in the pipeline.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
name + str + +
+

The name of the step.

+
+
operations + List[Union[Dict[str, Any], str]] + +
+

A list of operations to be applied in this step. +Each operation can be either a string (the name of the operation) or a dictionary +(for more complex configurations).

+
+
input + Optional[str] + +
+

The input for this step. It can be either the name of a dataset +or the name of a previous step. If not provided, the step will use the output +of the previous step as its input.

+
+
+ + +
+ Example +
# Simple step with a single operation
+process_step = PipelineStep(
+    name="process_step",
+    input="my_dataset",
+    operations=["process"]
+)
+
+# Step with multiple operations
+summarize_step = PipelineStep(
+    name="summarize_step",
+    input="process_step",
+    operations=["summarize"]
+)
+
+# Step with a more complex operation configuration
+custom_step = PipelineStep(
+    name="custom_step",
+    input="previous_step",
+    operations=[
+        {
+            "custom_operation": {
+                "model": "gpt-4",
+                "prompt": "Perform a custom analysis on the following text:"
+            }
+        }
+    ]
+)
+
+

These examples show different ways to configure pipeline steps, from simple +single-operation steps to more complex configurations with custom parameters.

+ + + + + + +
+ Source code in docetl/base_schemas.py +
49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
class PipelineStep(BaseModel):
+    """
+    Represents a step in the pipeline.
+
+    Attributes:
+        name (str): The name of the step.
+        operations (List[Union[Dict[str, Any], str]]): A list of operations to be applied in this step.
+            Each operation can be either a string (the name of the operation) or a dictionary
+            (for more complex configurations).
+        input (Optional[str]): The input for this step. It can be either the name of a dataset
+            or the name of a previous step. If not provided, the step will use the output
+            of the previous step as its input.
+
+    Example:
+        ```python
+        # Simple step with a single operation
+        process_step = PipelineStep(
+            name="process_step",
+            input="my_dataset",
+            operations=["process"]
+        )
+
+        # Step with multiple operations
+        summarize_step = PipelineStep(
+            name="summarize_step",
+            input="process_step",
+            operations=["summarize"]
+        )
+
+        # Step with a more complex operation configuration
+        custom_step = PipelineStep(
+            name="custom_step",
+            input="previous_step",
+            operations=[
+                {
+                    "custom_operation": {
+                        "model": "gpt-4",
+                        "prompt": "Perform a custom analysis on the following text:"
+                    }
+                }
+            ]
+        )
+        ```
+
+    These examples show different ways to configure pipeline steps, from simple
+    single-operation steps to more complex configurations with custom parameters.
+    """
+
+    name: str
+    operations: List[Union[Dict[str, Any], str]]
+    input: Optional[str] = None
+
+
+ + + +
+ + + + + + + + + + + +
+ +
+ +
+ +
+ + + +

+ docetl.schemas.PipelineOutput + + +

+ + +
+

+ Bases: BaseModel

+ + +

Represents the output configuration for a pipeline.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
type + str + +
+

The type of output. This could be 'file', 'database', etc.

+
+
path + str + +
+

The path where the output will be stored. This could be a file path, + database connection string, etc., depending on the type.

+
+
intermediate_dir + Optional[str] + +
+

The directory to store intermediate results, + if applicable. Defaults to None.

+
+
+ + +
+ Example +
output = PipelineOutput(
+    type="file",
+    path="/path/to/output.json",
+    intermediate_dir="/path/to/intermediate/results"
+)
+
+
+ + + + + +
+ Source code in docetl/base_schemas.py +
102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
class PipelineOutput(BaseModel):
+    """
+    Represents the output configuration for a pipeline.
+
+    Attributes:
+        type (str): The type of output. This could be 'file', 'database', etc.
+        path (str): The path where the output will be stored. This could be a file path,
+                    database connection string, etc., depending on the type.
+        intermediate_dir (Optional[str]): The directory to store intermediate results,
+                                          if applicable. Defaults to None.
+
+    Example:
+        ```python
+        output = PipelineOutput(
+            type="file",
+            path="/path/to/output.json",
+            intermediate_dir="/path/to/intermediate/results"
+        )
+        ```
+    """
+
+    type: str
+    path: str
+    intermediate_dir: Optional[str] = None
+
+
+ + + +
+ + + + + + + + + + + +
+ +
+ +
+ +
+ + + +

+ docetl.api.Pipeline + + +

+ + +
+ + +

Represents a complete document processing pipeline.

+ + +

Attributes:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescription
name + str + +
+

The name of the pipeline.

+
+
datasets + Dict[str, Dataset] + +
+

A dictionary of datasets used in the pipeline, + where keys are dataset names and values are Dataset objects.

+
+
operations + List[OpType] + +
+

A list of operations to be performed in the pipeline.

+
+
steps + List[PipelineStep] + +
+

A list of steps that make up the pipeline.

+
+
output + PipelineOutput + +
+

The output configuration for the pipeline.

+
+
parsing_tools + List[ParsingTool] + +
+

A list of parsing tools used in the pipeline. + Defaults to an empty list.

+
+
default_model + Optional[str] + +
+

The default language model to use for operations + that require one. Defaults to None.

+
+
+ + +
+ Example +
def custom_parser(text: str) -> List[str]:
+    # this will convert the text in the column to uppercase
+    # You should return a list of strings, where each string is a separate document
+    return [text.upper()]
+
+pipeline = Pipeline(
+    name="document_processing_pipeline",
+    datasets={
+        "input_data": Dataset(type="file", path="/path/to/input.json", parsing=[{"name": "custom_parser", "input_key": "content", "output_key": "uppercase_content"}]),
+    },
+    parsing_tools=[custom_parser],
+    operations=[
+        MapOp(
+            name="process",
+            type="map",
+            prompt="Determine what type of document this is: {{ input.uppercase_content }}",
+            output={"schema": {"document_type": "string"}}
+        ),
+        ReduceOp(
+            name="summarize",
+            type="reduce",
+            reduce_key="document_type",
+            prompt="Summarize the processed contents: {% for item in inputs %}{{ item.uppercase_content }} {% endfor %}",
+            output={"schema": {"summary": "string"}}
+        )
+    ],
+    steps=[
+        PipelineStep(name="process_step", input="input_data", operations=["process"]),
+        PipelineStep(name="summarize_step", input="process_step", operations=["summarize"])
+    ],
+    output=PipelineOutput(type="file", path="/path/to/output.json"),
+    default_model="gpt-4o-mini"
+)
+
+

This example shows a complete pipeline configuration with datasets, operations, +steps, and output settings.

+ + + + + + +
+ Source code in docetl/api.py +
 80
+ 81
+ 82
+ 83
+ 84
+ 85
+ 86
+ 87
+ 88
+ 89
+ 90
+ 91
+ 92
+ 93
+ 94
+ 95
+ 96
+ 97
+ 98
+ 99
+100
+101
+102
+103
+104
+105
+106
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+131
+132
+133
+134
+135
+136
+137
+138
+139
+140
+141
+142
+143
+144
+145
+146
+147
+148
+149
+150
+151
+152
+153
+154
+155
+156
+157
+158
+159
+160
+161
+162
+163
+164
+165
+166
+167
+168
+169
+170
+171
+172
+173
+174
+175
+176
+177
+178
+179
+180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
+219
+220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
+239
+240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
+255
+256
+257
+258
+259
+260
+261
+262
+263
+264
+265
+266
+267
+268
+269
+270
+271
+272
+273
+274
+275
+276
+277
+278
+279
+280
+281
+282
+283
+284
+285
+286
+287
+288
+289
+290
+291
+292
+293
+294
+295
+296
+297
+298
+299
+300
+301
+302
+303
+304
+305
+306
+307
+308
+309
+310
+311
+312
+313
+314
+315
+316
+317
+318
+319
+320
+321
+322
+323
+324
+325
+326
+327
+328
+329
+330
+331
+332
+333
+334
+335
+336
+337
class Pipeline:
+    """
+    Represents a complete document processing pipeline.
+
+    Attributes:
+        name (str): The name of the pipeline.
+        datasets (Dict[str, Dataset]): A dictionary of datasets used in the pipeline,
+                                       where keys are dataset names and values are Dataset objects.
+        operations (List[OpType]): A list of operations to be performed in the pipeline.
+        steps (List[PipelineStep]): A list of steps that make up the pipeline.
+        output (PipelineOutput): The output configuration for the pipeline.
+        parsing_tools (List[ParsingTool]): A list of parsing tools used in the pipeline.
+                                           Defaults to an empty list.
+        default_model (Optional[str]): The default language model to use for operations
+                                       that require one. Defaults to None.
+
+    Example:
+        ```python
+        def custom_parser(text: str) -> List[str]:
+            # this will convert the text in the column to uppercase
+            # You should return a list of strings, where each string is a separate document
+            return [text.upper()]
+
+        pipeline = Pipeline(
+            name="document_processing_pipeline",
+            datasets={
+                "input_data": Dataset(type="file", path="/path/to/input.json", parsing=[{"name": "custom_parser", "input_key": "content", "output_key": "uppercase_content"}]),
+            },
+            parsing_tools=[custom_parser],
+            operations=[
+                MapOp(
+                    name="process",
+                    type="map",
+                    prompt="Determine what type of document this is: {{ input.uppercase_content }}",
+                    output={"schema": {"document_type": "string"}}
+                ),
+                ReduceOp(
+                    name="summarize",
+                    type="reduce",
+                    reduce_key="document_type",
+                    prompt="Summarize the processed contents: {% for item in inputs %}{{ item.uppercase_content }} {% endfor %}",
+                    output={"schema": {"summary": "string"}}
+                )
+            ],
+            steps=[
+                PipelineStep(name="process_step", input="input_data", operations=["process"]),
+                PipelineStep(name="summarize_step", input="process_step", operations=["summarize"])
+            ],
+            output=PipelineOutput(type="file", path="/path/to/output.json"),
+            default_model="gpt-4o-mini"
+        )
+        ```
+
+    This example shows a complete pipeline configuration with datasets, operations,
+    steps, and output settings.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        datasets: Dict[str, Dataset],
+        operations: List[OpType],
+        steps: List[PipelineStep],
+        output: PipelineOutput,
+        parsing_tools: List[Union[ParsingTool, Callable]] = [],
+        default_model: Optional[str] = None,
+        rate_limits: Optional[Dict[str, int]] = None,
+    ):
+        self.name = name
+        self.datasets = datasets
+        self.operations = operations
+        self.steps = steps
+        self.output = output
+        self.parsing_tools = [
+            (
+                tool
+                if isinstance(tool, ParsingTool)
+                else ParsingTool(
+                    name=tool.__name__, function_code=inspect.getsource(tool)
+                )
+            )
+            for tool in parsing_tools
+        ]
+        self.default_model = default_model
+        self.rate_limits = rate_limits
+        self._load_env()
+
+    def _load_env(self):
+        import os
+
+        from dotenv import load_dotenv
+
+        # Get the current working directory
+        cwd = os.getcwd()
+
+        # Load .env file from the current working directory if it exists
+        env_file = os.path.join(cwd, ".env")
+        if os.path.exists(env_file):
+            load_dotenv(env_file)
+
+    def optimize(
+        self,
+        max_threads: Optional[int] = None,
+        model: str = "gpt-4o",
+        resume: bool = False,
+        timeout: int = 60,
+    ) -> "Pipeline":
+        """
+        Optimize the pipeline using the Optimizer.
+
+        Args:
+            max_threads (Optional[int]): Maximum number of threads to use for optimization.
+            model (str): The model to use for optimization. Defaults to "gpt-4o".
+            resume (bool): Whether to resume optimization from a previous state. Defaults to False.
+            timeout (int): Timeout for optimization in seconds. Defaults to 60.
+
+        Returns:
+            Pipeline: An optimized version of the pipeline.
+        """
+        config = self._to_dict()
+        runner = DSLRunner(
+            config,
+            base_name=os.path.join(os.getcwd(), self.name),
+            yaml_file_suffix=self.name,
+            max_threads=max_threads,
+        )
+        optimized_config, _ = runner.optimize(return_pipeline=False)
+
+        updated_pipeline = Pipeline(
+            name=self.name,
+            datasets=self.datasets,
+            operations=self.operations,
+            steps=self.steps,
+            output=self.output,
+            default_model=self.default_model,
+            parsing_tools=self.parsing_tools,
+        )
+        updated_pipeline._update_from_dict(optimized_config)
+        return updated_pipeline
+
+    def run(self, max_threads: Optional[int] = None) -> float:
+        """
+        Run the pipeline using the DSLRunner.
+
+        Args:
+            max_threads (Optional[int]): Maximum number of threads to use for execution.
+
+        Returns:
+            float: The total cost of running the pipeline.
+        """
+        config = self._to_dict()
+        runner = DSLRunner(
+            config,
+            base_name=os.path.join(os.getcwd(), self.name),
+            yaml_file_suffix=self.name,
+            max_threads=max_threads,
+        )
+        result = runner.load_run_save()
+        return result
+
+    def to_yaml(self, path: str) -> None:
+        """
+        Convert the Pipeline object to a YAML string and save it to a file.
+
+        Args:
+            path (str): Path to save the YAML file.
+
+        Returns:
+            None
+        """
+        config = self._to_dict()
+        with open(path, "w") as f:
+            yaml.safe_dump(config, f)
+
+        print(f"[green]Pipeline saved to {path}[/green]")
+
+    def _to_dict(self) -> Dict[str, Any]:
+        """
+        Convert the Pipeline object to a dictionary representation.
+
+        Returns:
+            Dict[str, Any]: Dictionary representation of the Pipeline.
+        """
+        d = {
+            "datasets": {
+                name: dataset.dict() for name, dataset in self.datasets.items()
+            },
+            "operations": [
+                {k: v for k, v in op.dict().items() if v is not None}
+                for op in self.operations
+            ],
+            "pipeline": {
+                "steps": [
+                    {k: v for k, v in step.dict().items() if v is not None}
+                    for step in self.steps
+                ],
+                "output": self.output.dict(),
+            },
+            "default_model": self.default_model,
+            "parsing_tools": (
+                [tool.dict() for tool in self.parsing_tools]
+                if self.parsing_tools
+                else None
+            ),
+        }
+        if self.rate_limits:
+            d["rate_limits"] = self.rate_limits
+        return d
+
+    def _update_from_dict(self, config: Dict[str, Any]):
+        """
+        Update the Pipeline object from a dictionary representation.
+
+        Args:
+            config (Dict[str, Any]): Dictionary representation of the Pipeline.
+        """
+        self.datasets = {
+            name: Dataset(
+                type=dataset["type"],
+                source=dataset["source"],
+                path=dataset["path"],
+                parsing=dataset.get("parsing"),
+            )
+            for name, dataset in config["datasets"].items()
+        }
+        self.operations = []
+        for op in config["operations"]:
+            op_type = op.pop("type")
+            if op_type == "map":
+                self.operations.append(MapOp(**op, type=op_type))
+            elif op_type == "resolve":
+                self.operations.append(ResolveOp(**op, type=op_type))
+            elif op_type == "reduce":
+                self.operations.append(ReduceOp(**op, type=op_type))
+            elif op_type == "parallel_map":
+                self.operations.append(ParallelMapOp(**op, type=op_type))
+            elif op_type == "filter":
+                self.operations.append(FilterOp(**op, type=op_type))
+            elif op_type == "equijoin":
+                self.operations.append(EquijoinOp(**op, type=op_type))
+            elif op_type == "split":
+                self.operations.append(SplitOp(**op, type=op_type))
+            elif op_type == "gather":
+                self.operations.append(GatherOp(**op, type=op_type))
+            elif op_type == "unnest":
+                self.operations.append(UnnestOp(**op, type=op_type))
+            elif op_type == "cluster":
+                self.operations.append(ClusterOp(**op, type=op_type))
+            elif op_type == "sample":
+                self.operations.append(SampleOp(**op, type=op_type))
+        self.steps = [PipelineStep(**step) for step in config["pipeline"]["steps"]]
+        self.output = PipelineOutput(**config["pipeline"]["output"])
+        self.default_model = config.get("default_model")
+        self.parsing_tools = (
+            [ParsingTool(**tool) for tool in config.get("parsing_tools", [])]
+            if config.get("parsing_tools")
+            else []
+        )
+
+
+ + + +
+ + + + + + + + + +
+ + +

+ optimize(max_threads=None, model='gpt-4o', resume=False, timeout=60) + +

+ + +
+ +

Optimize the pipeline using the Optimizer.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ max_threads + + Optional[int] + +
+

Maximum number of threads to use for optimization.

+
+
+ None +
+ model + + str + +
+

The model to use for optimization. Defaults to "gpt-4o".

+
+
+ 'gpt-4o' +
+ resume + + bool + +
+

Whether to resume optimization from a previous state. Defaults to False.

+
+
+ False +
+ timeout + + int + +
+

Timeout for optimization in seconds. Defaults to 60.

+
+
+ 60 +
+ + +

Returns:

+ + + + + + + + + + + + + +
Name TypeDescription
Pipeline + Pipeline + +
+

An optimized version of the pipeline.

+
+
+ +
+ Source code in docetl/api.py +
180
+181
+182
+183
+184
+185
+186
+187
+188
+189
+190
+191
+192
+193
+194
+195
+196
+197
+198
+199
+200
+201
+202
+203
+204
+205
+206
+207
+208
+209
+210
+211
+212
+213
+214
+215
+216
+217
+218
def optimize(
+    self,
+    max_threads: Optional[int] = None,
+    model: str = "gpt-4o",
+    resume: bool = False,
+    timeout: int = 60,
+) -> "Pipeline":
+    """
+    Optimize the pipeline using the Optimizer.
+
+    Args:
+        max_threads (Optional[int]): Maximum number of threads to use for optimization.
+        model (str): The model to use for optimization. Defaults to "gpt-4o".
+        resume (bool): Whether to resume optimization from a previous state. Defaults to False.
+        timeout (int): Timeout for optimization in seconds. Defaults to 60.
+
+    Returns:
+        Pipeline: An optimized version of the pipeline.
+    """
+    config = self._to_dict()
+    runner = DSLRunner(
+        config,
+        base_name=os.path.join(os.getcwd(), self.name),
+        yaml_file_suffix=self.name,
+        max_threads=max_threads,
+    )
+    optimized_config, _ = runner.optimize(return_pipeline=False)
+
+    updated_pipeline = Pipeline(
+        name=self.name,
+        datasets=self.datasets,
+        operations=self.operations,
+        steps=self.steps,
+        output=self.output,
+        default_model=self.default_model,
+        parsing_tools=self.parsing_tools,
+    )
+    updated_pipeline._update_from_dict(optimized_config)
+    return updated_pipeline
+
+
+
+ +
+ +
+ + +

+ run(max_threads=None) + +

+ + +
+ +

Run the pipeline using the DSLRunner.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ max_threads + + Optional[int] + +
+

Maximum number of threads to use for execution.

+
+
+ None +
+ + +

Returns:

+ + + + + + + + + + + + + +
Name TypeDescription
float + float + +
+

The total cost of running the pipeline.

+
+
+ +
+ Source code in docetl/api.py +
220
+221
+222
+223
+224
+225
+226
+227
+228
+229
+230
+231
+232
+233
+234
+235
+236
+237
+238
def run(self, max_threads: Optional[int] = None) -> float:
+    """
+    Run the pipeline using the DSLRunner.
+
+    Args:
+        max_threads (Optional[int]): Maximum number of threads to use for execution.
+
+    Returns:
+        float: The total cost of running the pipeline.
+    """
+    config = self._to_dict()
+    runner = DSLRunner(
+        config,
+        base_name=os.path.join(os.getcwd(), self.name),
+        yaml_file_suffix=self.name,
+        max_threads=max_threads,
+    )
+    result = runner.load_run_save()
+    return result
+
+
+
+ +
+ +
+ + +

+ to_yaml(path) + +

+ + +
+ +

Convert the Pipeline object to a YAML string and save it to a file.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ path + + str + +
+

Path to save the YAML file.

+
+
+ required +
+ + +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ None + +
+

None

+
+
+ +
+ Source code in docetl/api.py +
240
+241
+242
+243
+244
+245
+246
+247
+248
+249
+250
+251
+252
+253
+254
def to_yaml(self, path: str) -> None:
+    """
+    Convert the Pipeline object to a YAML string and save it to a file.
+
+    Args:
+        path (str): Path to save the YAML file.
+
+    Returns:
+        None
+    """
+    config = self._to_dict()
+    with open(path, "w") as f:
+        yaml.safe_dump(config, f)
+
+    print(f"[green]Pipeline saved to {path}[/green]")
+
+
+
+ +
+ + + +
+ +
+ +
+ + + + + + + + + + + + + +
+
+ + + +
+ + + +
+ + + +
+
+
+
+ + + + + + + + + + \ No newline at end of file diff --git a/assets/_mkdocstrings.css b/assets/_mkdocstrings.css new file mode 100644 index 00000000..b500381b --- /dev/null +++ b/assets/_mkdocstrings.css @@ -0,0 +1,143 @@ + +/* Avoid breaking parameter names, etc. in table cells. */ +.doc-contents td code { + word-break: normal !important; +} + +/* No line break before first paragraph of descriptions. */ +.doc-md-description, +.doc-md-description>p:first-child { + display: inline; +} + +/* Max width for docstring sections tables. */ +.doc .md-typeset__table, +.doc .md-typeset__table table { + display: table !important; + width: 100%; +} + +.doc .md-typeset__table tr { + display: table-row; +} + +/* Defaults in Spacy table style. */ +.doc-param-default { + float: right; +} + +/* Parameter headings must be inline, not blocks. */ +.doc-heading-parameter { + display: inline; +} + +/* Prefer space on the right, not the left of parameter permalinks. */ +.doc-heading-parameter .headerlink { + margin-left: 0 !important; + margin-right: 0.2rem; +} + +/* Backward-compatibility: docstring section titles in bold. */ +.doc-section-title { + font-weight: bold; +} + +/* Symbols in Navigation and ToC. 
*/ +:root, :host, +[data-md-color-scheme="default"] { + --doc-symbol-parameter-fg-color: #df50af; + --doc-symbol-attribute-fg-color: #953800; + --doc-symbol-function-fg-color: #8250df; + --doc-symbol-method-fg-color: #8250df; + --doc-symbol-class-fg-color: #0550ae; + --doc-symbol-module-fg-color: #5cad0f; + + --doc-symbol-parameter-bg-color: #df50af1a; + --doc-symbol-attribute-bg-color: #9538001a; + --doc-symbol-function-bg-color: #8250df1a; + --doc-symbol-method-bg-color: #8250df1a; + --doc-symbol-class-bg-color: #0550ae1a; + --doc-symbol-module-bg-color: #5cad0f1a; +} + +[data-md-color-scheme="slate"] { + --doc-symbol-parameter-fg-color: #ffa8cc; + --doc-symbol-attribute-fg-color: #ffa657; + --doc-symbol-function-fg-color: #d2a8ff; + --doc-symbol-method-fg-color: #d2a8ff; + --doc-symbol-class-fg-color: #79c0ff; + --doc-symbol-module-fg-color: #baff79; + + --doc-symbol-parameter-bg-color: #ffa8cc1a; + --doc-symbol-attribute-bg-color: #ffa6571a; + --doc-symbol-function-bg-color: #d2a8ff1a; + --doc-symbol-method-bg-color: #d2a8ff1a; + --doc-symbol-class-bg-color: #79c0ff1a; + --doc-symbol-module-bg-color: #baff791a; +} + +code.doc-symbol { + border-radius: .1rem; + font-size: .85em; + padding: 0 .3em; + font-weight: bold; +} + +code.doc-symbol-parameter { + color: var(--doc-symbol-parameter-fg-color); + background-color: var(--doc-symbol-parameter-bg-color); +} + +code.doc-symbol-parameter::after { + content: "param"; +} + +code.doc-symbol-attribute { + color: var(--doc-symbol-attribute-fg-color); + background-color: var(--doc-symbol-attribute-bg-color); +} + +code.doc-symbol-attribute::after { + content: "attr"; +} + +code.doc-symbol-function { + color: var(--doc-symbol-function-fg-color); + background-color: var(--doc-symbol-function-bg-color); +} + +code.doc-symbol-function::after { + content: "func"; +} + +code.doc-symbol-method { + color: var(--doc-symbol-method-fg-color); + background-color: var(--doc-symbol-method-bg-color); +} + 
+code.doc-symbol-method::after { + content: "meth"; +} + +code.doc-symbol-class { + color: var(--doc-symbol-class-fg-color); + background-color: var(--doc-symbol-class-bg-color); +} + +code.doc-symbol-class::after { + content: "class"; +} + +code.doc-symbol-module { + color: var(--doc-symbol-module-fg-color); + background-color: var(--doc-symbol-module-bg-color); +} + +code.doc-symbol-module::after { + content: "mod"; +} + +.doc-signature .autorefs { + color: inherit; + border-bottom: 1px dotted currentcolor; +} diff --git a/assets/docetl-favicon-color.png b/assets/docetl-favicon-color.png new file mode 100644 index 00000000..51fe742f Binary files /dev/null and b/assets/docetl-favicon-color.png differ diff --git a/assets/headerdiagram.png b/assets/headerdiagram.png new file mode 100644 index 00000000..6f1e3f52 Binary files /dev/null and b/assets/headerdiagram.png differ diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 00000000..1cf13b9f Binary files /dev/null and b/assets/images/favicon.png differ diff --git a/assets/javascripts/bundle.88dd0f4e.min.js b/assets/javascripts/bundle.88dd0f4e.min.js new file mode 100644 index 00000000..fb8f3109 --- /dev/null +++ b/assets/javascripts/bundle.88dd0f4e.min.js @@ -0,0 +1,16 @@ +"use strict";(()=>{var Wi=Object.create;var gr=Object.defineProperty;var Di=Object.getOwnPropertyDescriptor;var Vi=Object.getOwnPropertyNames,Vt=Object.getOwnPropertySymbols,Ni=Object.getPrototypeOf,yr=Object.prototype.hasOwnProperty,ao=Object.prototype.propertyIsEnumerable;var io=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,$=(e,t)=>{for(var r in t||(t={}))yr.call(t,r)&&io(e,r,t[r]);if(Vt)for(var r of Vt(t))ao.call(t,r)&&io(e,r,t[r]);return e};var so=(e,t)=>{var r={};for(var o in e)yr.call(e,o)&&t.indexOf(o)<0&&(r[o]=e[o]);if(e!=null&&Vt)for(var o of Vt(e))t.indexOf(o)<0&&ao.call(e,o)&&(r[o]=e[o]);return r};var 
xr=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var zi=(e,t,r,o)=>{if(t&&typeof t=="object"||typeof t=="function")for(let n of Vi(t))!yr.call(e,n)&&n!==r&&gr(e,n,{get:()=>t[n],enumerable:!(o=Di(t,n))||o.enumerable});return e};var Mt=(e,t,r)=>(r=e!=null?Wi(Ni(e)):{},zi(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var co=(e,t,r)=>new Promise((o,n)=>{var i=p=>{try{s(r.next(p))}catch(c){n(c)}},a=p=>{try{s(r.throw(p))}catch(c){n(c)}},s=p=>p.done?o(p.value):Promise.resolve(p.value).then(i,a);s((r=r.apply(e,t)).next())});var lo=xr((Er,po)=>{(function(e,t){typeof Er=="object"&&typeof po!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(Er,function(){"use strict";function e(r){var o=!0,n=!1,i=null,a={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function s(k){return!!(k&&k!==document&&k.nodeName!=="HTML"&&k.nodeName!=="BODY"&&"classList"in k&&"contains"in k.classList)}function p(k){var ft=k.type,qe=k.tagName;return!!(qe==="INPUT"&&a[ft]&&!k.readOnly||qe==="TEXTAREA"&&!k.readOnly||k.isContentEditable)}function c(k){k.classList.contains("focus-visible")||(k.classList.add("focus-visible"),k.setAttribute("data-focus-visible-added",""))}function l(k){k.hasAttribute("data-focus-visible-added")&&(k.classList.remove("focus-visible"),k.removeAttribute("data-focus-visible-added"))}function f(k){k.metaKey||k.altKey||k.ctrlKey||(s(r.activeElement)&&c(r.activeElement),o=!0)}function u(k){o=!1}function d(k){s(k.target)&&(o||p(k.target))&&c(k.target)}function y(k){s(k.target)&&(k.target.classList.contains("focus-visible")||k.target.hasAttribute("data-focus-visible-added"))&&(n=!0,window.clearTimeout(i),i=window.setTimeout(function(){n=!1},100),l(k.target))}function L(k){document.visibilityState==="hidden"&&(n&&(o=!0),X())}function 
X(){document.addEventListener("mousemove",J),document.addEventListener("mousedown",J),document.addEventListener("mouseup",J),document.addEventListener("pointermove",J),document.addEventListener("pointerdown",J),document.addEventListener("pointerup",J),document.addEventListener("touchmove",J),document.addEventListener("touchstart",J),document.addEventListener("touchend",J)}function te(){document.removeEventListener("mousemove",J),document.removeEventListener("mousedown",J),document.removeEventListener("mouseup",J),document.removeEventListener("pointermove",J),document.removeEventListener("pointerdown",J),document.removeEventListener("pointerup",J),document.removeEventListener("touchmove",J),document.removeEventListener("touchstart",J),document.removeEventListener("touchend",J)}function J(k){k.target.nodeName&&k.target.nodeName.toLowerCase()==="html"||(o=!1,te())}document.addEventListener("keydown",f,!0),document.addEventListener("mousedown",u,!0),document.addEventListener("pointerdown",u,!0),document.addEventListener("touchstart",u,!0),document.addEventListener("visibilitychange",L,!0),X(),r.addEventListener("focus",d,!0),r.addEventListener("blur",y,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var qr=xr((hy,On)=>{"use strict";/*! 
+ * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var $a=/["'&<>]/;On.exports=Pa;function Pa(e){var t=""+e,r=$a.exec(t);if(!r)return t;var o,n="",i=0,a=0;for(i=r.index;i{/*! + * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof It=="object"&&typeof Yr=="object"?Yr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof It=="object"?It.ClipboardJS=r():t.ClipboardJS=r()})(It,function(){return function(){var e={686:function(o,n,i){"use strict";i.d(n,{default:function(){return Ui}});var a=i(279),s=i.n(a),p=i(370),c=i.n(p),l=i(817),f=i.n(l);function u(V){try{return document.execCommand(V)}catch(A){return!1}}var d=function(A){var M=f()(A);return u("cut"),M},y=d;function L(V){var A=document.documentElement.getAttribute("dir")==="rtl",M=document.createElement("textarea");M.style.fontSize="12pt",M.style.border="0",M.style.padding="0",M.style.margin="0",M.style.position="absolute",M.style[A?"right":"left"]="-9999px";var F=window.pageYOffset||document.documentElement.scrollTop;return M.style.top="".concat(F,"px"),M.setAttribute("readonly",""),M.value=V,M}var X=function(A,M){var F=L(A);M.container.appendChild(F);var D=f()(F);return u("copy"),F.remove(),D},te=function(A){var M=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},F="";return typeof A=="string"?F=X(A,M):A instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(A==null?void 0:A.type)?F=X(A.value,M):(F=f()(A),u("copy")),F},J=te;function k(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?k=function(M){return typeof M}:k=function(M){return M&&typeof Symbol=="function"&&M.constructor===Symbol&&M!==Symbol.prototype?"symbol":typeof M},k(V)}var ft=function(){var A=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},M=A.action,F=M===void 
0?"copy":M,D=A.container,Y=A.target,$e=A.text;if(F!=="copy"&&F!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(Y!==void 0)if(Y&&k(Y)==="object"&&Y.nodeType===1){if(F==="copy"&&Y.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');if(F==="cut"&&(Y.hasAttribute("readonly")||Y.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if($e)return J($e,{container:D});if(Y)return F==="cut"?y(Y):J(Y,{container:D})},qe=ft;function Fe(V){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?Fe=function(M){return typeof M}:Fe=function(M){return M&&typeof Symbol=="function"&&M.constructor===Symbol&&M!==Symbol.prototype?"symbol":typeof M},Fe(V)}function ki(V,A){if(!(V instanceof A))throw new TypeError("Cannot call a class as a function")}function no(V,A){for(var M=0;M0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof D.action=="function"?D.action:this.defaultAction,this.target=typeof D.target=="function"?D.target:this.defaultTarget,this.text=typeof D.text=="function"?D.text:this.defaultText,this.container=Fe(D.container)==="object"?D.container:document.body}},{key:"listenClick",value:function(D){var Y=this;this.listener=c()(D,"click",function($e){return Y.onClick($e)})}},{key:"onClick",value:function(D){var Y=D.delegateTarget||D.currentTarget,$e=this.action(Y)||"copy",Dt=qe({action:$e,container:this.container,target:this.target(Y),text:this.text(Y)});this.emit(Dt?"success":"error",{action:$e,text:Dt,trigger:Y,clearSelection:function(){Y&&Y.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(D){return vr("action",D)}},{key:"defaultTarget",value:function(D){var Y=vr("target",D);if(Y)return 
document.querySelector(Y)}},{key:"defaultText",value:function(D){return vr("text",D)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(D){var Y=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return J(D,Y)}},{key:"cut",value:function(D){return y(D)}},{key:"isSupported",value:function(){var D=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],Y=typeof D=="string"?[D]:D,$e=!!document.queryCommandSupported;return Y.forEach(function(Dt){$e=$e&&!!document.queryCommandSupported(Dt)}),$e}}]),M}(s()),Ui=Fi},828:function(o){var n=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function a(s,p){for(;s&&s.nodeType!==n;){if(typeof s.matches=="function"&&s.matches(p))return s;s=s.parentNode}}o.exports=a},438:function(o,n,i){var a=i(828);function s(l,f,u,d,y){var L=c.apply(this,arguments);return l.addEventListener(u,L,y),{destroy:function(){l.removeEventListener(u,L,y)}}}function p(l,f,u,d,y){return typeof l.addEventListener=="function"?s.apply(null,arguments):typeof u=="function"?s.bind(null,document).apply(null,arguments):(typeof l=="string"&&(l=document.querySelectorAll(l)),Array.prototype.map.call(l,function(L){return s(L,f,u,d,y)}))}function c(l,f,u,d){return function(y){y.delegateTarget=a(y.target,f),y.delegateTarget&&d.call(l,y)}}o.exports=p},879:function(o,n){n.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},n.nodeList=function(i){var a=Object.prototype.toString.call(i);return i!==void 0&&(a==="[object NodeList]"||a==="[object HTMLCollection]")&&"length"in i&&(i.length===0||n.node(i[0]))},n.string=function(i){return typeof i=="string"||i instanceof String},n.fn=function(i){var a=Object.prototype.toString.call(i);return a==="[object Function]"}},370:function(o,n,i){var a=i(879),s=i(438);function 
p(u,d,y){if(!u&&!d&&!y)throw new Error("Missing required arguments");if(!a.string(d))throw new TypeError("Second argument must be a String");if(!a.fn(y))throw new TypeError("Third argument must be a Function");if(a.node(u))return c(u,d,y);if(a.nodeList(u))return l(u,d,y);if(a.string(u))return f(u,d,y);throw new TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(u,d,y){return u.addEventListener(d,y),{destroy:function(){u.removeEventListener(d,y)}}}function l(u,d,y){return Array.prototype.forEach.call(u,function(L){L.addEventListener(d,y)}),{destroy:function(){Array.prototype.forEach.call(u,function(L){L.removeEventListener(d,y)})}}}function f(u,d,y){return s(document.body,u,d,y)}o.exports=p},817:function(o){function n(i){var a;if(i.nodeName==="SELECT")i.focus(),a=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var s=i.hasAttribute("readonly");s||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),s||i.removeAttribute("readonly"),a=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var p=window.getSelection(),c=document.createRange();c.selectNodeContents(i),p.removeAllRanges(),p.addRange(c),a=p.toString()}return a}o.exports=n},279:function(o){function n(){}n.prototype={on:function(i,a,s){var p=this.e||(this.e={});return(p[i]||(p[i]=[])).push({fn:a,ctx:s}),this},once:function(i,a,s){var p=this;function c(){p.off(i,c),a.apply(s,arguments)}return c._=a,this.on(i,c,s)},emit:function(i){var a=[].slice.call(arguments,1),s=((this.e||(this.e={}))[i]||[]).slice(),p=0,c=s.length;for(p;p0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[o++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function N(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var o=r.call(e),n,i=[],a;try{for(;(t===void 0||t-- 
>0)&&!(n=o.next()).done;)i.push(n.value)}catch(s){a={error:s}}finally{try{n&&!n.done&&(r=o.return)&&r.call(o)}finally{if(a)throw a.error}}return i}function q(e,t,r){if(r||arguments.length===2)for(var o=0,n=t.length,i;o1||p(d,L)})},y&&(n[d]=y(n[d])))}function p(d,y){try{c(o[d](y))}catch(L){u(i[0][3],L)}}function c(d){d.value instanceof nt?Promise.resolve(d.value.v).then(l,f):u(i[0][2],d)}function l(d){p("next",d)}function f(d){p("throw",d)}function u(d,y){d(y),i.shift(),i.length&&p(i[0][0],i[0][1])}}function uo(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof he=="function"?he(e):e[Symbol.iterator](),r={},o("next"),o("throw"),o("return"),r[Symbol.asyncIterator]=function(){return this},r);function o(i){r[i]=e[i]&&function(a){return new Promise(function(s,p){a=e[i](a),n(s,p,a.done,a.value)})}}function n(i,a,s,p){Promise.resolve(p).then(function(c){i({value:c,done:s})},a)}}function H(e){return typeof e=="function"}function ut(e){var t=function(o){Error.call(o),o.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var zt=ut(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(o,n){return n+1+") "+o.toString()}).join(` + `):"",this.name="UnsubscriptionError",this.errors=r}});function Qe(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ue=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,o,n,i;if(!this.closed){this.closed=!0;var a=this._parentage;if(a)if(this._parentage=null,Array.isArray(a))try{for(var s=he(a),p=s.next();!p.done;p=s.next()){var c=p.value;c.remove(this)}}catch(L){t={error:L}}finally{try{p&&!p.done&&(r=s.return)&&r.call(s)}finally{if(t)throw t.error}}else a.remove(this);var 
l=this.initialTeardown;if(H(l))try{l()}catch(L){i=L instanceof zt?L.errors:[L]}var f=this._finalizers;if(f){this._finalizers=null;try{for(var u=he(f),d=u.next();!d.done;d=u.next()){var y=d.value;try{ho(y)}catch(L){i=i!=null?i:[],L instanceof zt?i=q(q([],N(i)),N(L.errors)):i.push(L)}}}catch(L){o={error:L}}finally{try{d&&!d.done&&(n=u.return)&&n.call(u)}finally{if(o)throw o.error}}}if(i)throw new zt(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ho(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Qe(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Qe(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Tr=Ue.EMPTY;function qt(e){return e instanceof Ue||e&&"closed"in e&&H(e.remove)&&H(e.add)&&H(e.unsubscribe)}function ho(e){H(e)?e():e.unsubscribe()}var Pe={onUnhandledError:null,onStoppedNotification:null,Promise:void 0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var dt={setTimeout:function(e,t){for(var r=[],o=2;o0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var o=this,n=this,i=n.hasError,a=n.isStopped,s=n.observers;return i||a?Tr:(this.currentObservers=null,s.push(r),new Ue(function(){o.currentObservers=null,Qe(s,r)}))},t.prototype._checkFinalizedStatuses=function(r){var 
o=this,n=o.hasError,i=o.thrownError,a=o.isStopped;n?r.error(i):a&&r.complete()},t.prototype.asObservable=function(){var r=new j;return r.source=this,r},t.create=function(r,o){return new To(r,o)},t}(j);var To=function(e){oe(t,e);function t(r,o){var n=e.call(this)||this;return n.destination=r,n.source=o,n}return t.prototype.next=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.next)===null||n===void 0||n.call(o,r)},t.prototype.error=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.error)===null||n===void 0||n.call(o,r)},t.prototype.complete=function(){var r,o;(o=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||o===void 0||o.call(r)},t.prototype._subscribe=function(r){var o,n;return(n=(o=this.source)===null||o===void 0?void 0:o.subscribe(r))!==null&&n!==void 0?n:Tr},t}(g);var _r=function(e){oe(t,e);function t(r){var o=e.call(this)||this;return o._value=r,o}return Object.defineProperty(t.prototype,"value",{get:function(){return this.getValue()},enumerable:!1,configurable:!0}),t.prototype._subscribe=function(r){var o=e.prototype._subscribe.call(this,r);return!o.closed&&r.next(this._value),o},t.prototype.getValue=function(){var r=this,o=r.hasError,n=r.thrownError,i=r._value;if(o)throw n;return this._throwIfClosed(),i},t.prototype.next=function(r){e.prototype.next.call(this,this._value=r)},t}(g);var At={now:function(){return(At.delegate||Date).now()},delegate:void 0};var Ct=function(e){oe(t,e);function t(r,o,n){r===void 0&&(r=1/0),o===void 0&&(o=1/0),n===void 0&&(n=At);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=o,i._timestampProvider=n,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=o===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,o),i}return t.prototype.next=function(r){var 
o=this,n=o.isStopped,i=o._buffer,a=o._infiniteTimeWindow,s=o._timestampProvider,p=o._windowTime;n||(i.push(r),!a&&i.push(s.now()+p)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var o=this._innerSubscribe(r),n=this,i=n._infiniteTimeWindow,a=n._buffer,s=a.slice(),p=0;p0?e.prototype.schedule.call(this,r,o):(this.delay=o,this.state=r,this.scheduler.flush(this),this)},t.prototype.execute=function(r,o){return o>0||this.closed?e.prototype.execute.call(this,r,o):this._execute(r,o)},t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!=null&&n>0||n==null&&this.delay>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.flush(this),0)},t}(gt);var Lo=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t}(yt);var kr=new Lo(Oo);var Mo=function(e){oe(t,e);function t(r,o){var n=e.call(this,r,o)||this;return n.scheduler=r,n.work=o,n}return t.prototype.requestAsyncId=function(r,o,n){return n===void 0&&(n=0),n!==null&&n>0?e.prototype.requestAsyncId.call(this,r,o,n):(r.actions.push(this),r._scheduled||(r._scheduled=vt.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,o,n){var i;if(n===void 0&&(n=0),n!=null?n>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,o,n);var a=r.actions;o!=null&&((i=a[a.length-1])===null||i===void 0?void 0:i.id)!==o&&(vt.cancelAnimationFrame(o),r._scheduled=void 0)},t}(gt);var _o=function(e){oe(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var o=this._scheduled;this._scheduled=void 0;var n=this.actions,i;r=r||n.shift();do if(i=r.execute(r.state,r.delay))break;while((r=n[0])&&r.id===o&&n.shift());if(this._active=!1,i){for(;(r=n[0])&&r.id===o&&n.shift();)r.unsubscribe();throw i}},t}(yt);var me=new _o(Mo);var S=new j(function(e){return e.complete()});function Yt(e){return e&&H(e.schedule)}function 
Hr(e){return e[e.length-1]}function Xe(e){return H(Hr(e))?e.pop():void 0}function ke(e){return Yt(Hr(e))?e.pop():void 0}function Bt(e,t){return typeof Hr(e)=="number"?e.pop():t}var xt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Gt(e){return H(e==null?void 0:e.then)}function Jt(e){return H(e[bt])}function Xt(e){return Symbol.asyncIterator&&H(e==null?void 0:e[Symbol.asyncIterator])}function Zt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Zi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var er=Zi();function tr(e){return H(e==null?void 0:e[er])}function rr(e){return fo(this,arguments,function(){var r,o,n,i;return Nt(this,function(a){switch(a.label){case 0:r=e.getReader(),a.label=1;case 1:a.trys.push([1,,9,10]),a.label=2;case 2:return[4,nt(r.read())];case 3:return o=a.sent(),n=o.value,i=o.done,i?[4,nt(void 0)]:[3,5];case 4:return[2,a.sent()];case 5:return[4,nt(n)];case 6:return[4,a.sent()];case 7:return a.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function or(e){return H(e==null?void 0:e.getReader)}function U(e){if(e instanceof j)return e;if(e!=null){if(Jt(e))return ea(e);if(xt(e))return ta(e);if(Gt(e))return ra(e);if(Xt(e))return Ao(e);if(tr(e))return oa(e);if(or(e))return na(e)}throw Zt(e)}function ea(e){return new j(function(t){var r=e[bt]();if(H(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function ta(e){return new j(function(t){for(var r=0;r=2;return function(o){return o.pipe(e?b(function(n,i){return e(n,i,o)}):le,Te(1),r?De(t):Qo(function(){return new ir}))}}function jr(e){return e<=0?function(){return S}:E(function(t,r){var o=[];t.subscribe(T(r,function(n){o.push(n),e=2,!0))}function 
pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new g}:t,o=e.resetOnError,n=o===void 0?!0:o,i=e.resetOnComplete,a=i===void 0?!0:i,s=e.resetOnRefCountZero,p=s===void 0?!0:s;return function(c){var l,f,u,d=0,y=!1,L=!1,X=function(){f==null||f.unsubscribe(),f=void 0},te=function(){X(),l=u=void 0,y=L=!1},J=function(){var k=l;te(),k==null||k.unsubscribe()};return E(function(k,ft){d++,!L&&!y&&X();var qe=u=u!=null?u:r();ft.add(function(){d--,d===0&&!L&&!y&&(f=Ur(J,p))}),qe.subscribe(ft),!l&&d>0&&(l=new at({next:function(Fe){return qe.next(Fe)},error:function(Fe){L=!0,X(),f=Ur(te,n,Fe),qe.error(Fe)},complete:function(){y=!0,X(),f=Ur(te,a),qe.complete()}}),U(k).subscribe(l))})(c)}}function Ur(e,t){for(var r=[],o=2;oe.next(document)),e}function P(e,t=document){return Array.from(t.querySelectorAll(e))}function R(e,t=document){let r=fe(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function fe(e,t=document){return t.querySelector(e)||void 0}function Ie(){var e,t,r,o;return(o=(r=(t=(e=document.activeElement)==null?void 0:e.shadowRoot)==null?void 0:t.activeElement)!=null?r:document.activeElement)!=null?o:void 0}var wa=O(h(document.body,"focusin"),h(document.body,"focusout")).pipe(_e(1),Q(void 0),m(()=>Ie()||document.body),G(1));function et(e){return wa.pipe(m(t=>e.contains(t)),K())}function $t(e,t){return C(()=>O(h(e,"mouseenter").pipe(m(()=>!0)),h(e,"mouseleave").pipe(m(()=>!1))).pipe(t?Ht(r=>Le(+!r*t)):le,Q(e.matches(":hover"))))}function Jo(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)Jo(e,r)}function x(e,t,...r){let o=document.createElement(e);if(t)for(let n of Object.keys(t))typeof t[n]!="undefined"&&(typeof t[n]!="boolean"?o.setAttribute(n,t[n]):o.setAttribute(n,""));for(let n of r)Jo(o,n);return o}function sr(e){if(e>999){let 
t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function Tt(e){let t=x("script",{src:e});return C(()=>(document.head.appendChild(t),O(h(t,"load"),h(t,"error").pipe(v(()=>$r(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(m(()=>{}),_(()=>document.head.removeChild(t)),Te(1))))}var Xo=new g,Ta=C(()=>typeof ResizeObserver=="undefined"?Tt("https://unpkg.com/resize-observer-polyfill"):I(void 0)).pipe(m(()=>new ResizeObserver(e=>e.forEach(t=>Xo.next(t)))),v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function ce(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ge(e){let t=e;for(;t.clientWidth===0&&t.parentElement;)t=t.parentElement;return Ta.pipe(w(r=>r.observe(t)),v(r=>Xo.pipe(b(o=>o.target===t),_(()=>r.unobserve(t)))),m(()=>ce(e)),Q(ce(e)))}function St(e){return{width:e.scrollWidth,height:e.scrollHeight}}function cr(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}function Zo(e){let t=[],r=e.parentElement;for(;r;)(e.clientWidth>r.clientWidth||e.clientHeight>r.clientHeight)&&t.push(r),r=(e=r).parentElement;return t.length===0&&t.push(document.documentElement),t}function Ve(e){return{x:e.offsetLeft,y:e.offsetTop}}function en(e){let t=e.getBoundingClientRect();return{x:t.x+window.scrollX,y:t.y+window.scrollY}}function tn(e){return O(h(window,"load"),h(window,"resize")).pipe(Me(0,me),m(()=>Ve(e)),Q(Ve(e)))}function pr(e){return{x:e.scrollLeft,y:e.scrollTop}}function Ne(e){return O(h(e,"scroll"),h(window,"scroll"),h(window,"resize")).pipe(Me(0,me),m(()=>pr(e)),Q(pr(e)))}var rn=new g,Sa=C(()=>I(new IntersectionObserver(e=>{for(let t of e)rn.next(t)},{threshold:0}))).pipe(v(e=>O(Ye,I(e)).pipe(_(()=>e.disconnect()))),G(1));function tt(e){return Sa.pipe(w(t=>t.observe(e)),v(t=>rn.pipe(b(({target:r})=>r===e),_(()=>t.unobserve(e)),m(({isIntersecting:r})=>r))))}function on(e,t=16){return Ne(e).pipe(m(({y:r})=>{let 
o=ce(e),n=St(e);return r>=n.height-o.height-t}),K())}var lr={drawer:R("[data-md-toggle=drawer]"),search:R("[data-md-toggle=search]")};function nn(e){return lr[e].checked}function Je(e,t){lr[e].checked!==t&&lr[e].click()}function ze(e){let t=lr[e];return h(t,"change").pipe(m(()=>t.checked),Q(t.checked))}function Oa(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function La(){return O(h(window,"compositionstart").pipe(m(()=>!0)),h(window,"compositionend").pipe(m(()=>!1))).pipe(Q(!1))}function an(){let e=h(window,"keydown").pipe(b(t=>!(t.metaKey||t.ctrlKey)),m(t=>({mode:nn("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),b(({mode:t,type:r})=>{if(t==="global"){let o=Ie();if(typeof o!="undefined")return!Oa(o,r)}return!0}),pe());return La().pipe(v(t=>t?S:e))}function ye(){return new URL(location.href)}function lt(e,t=!1){if(B("navigation.instant")&&!t){let r=x("a",{href:e.href});document.body.appendChild(r),r.click(),r.remove()}else location.href=e.href}function sn(){return new g}function cn(){return location.hash.slice(1)}function pn(e){let t=x("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Ma(e){return O(h(window,"hashchange"),e).pipe(m(cn),Q(cn()),b(t=>t.length>0),G(1))}function ln(e){return Ma(e).pipe(m(t=>fe(`[id="${t}"]`)),b(t=>typeof t!="undefined"))}function Pt(e){let t=matchMedia(e);return ar(r=>t.addListener(()=>r(t.matches))).pipe(Q(t.matches))}function mn(){let e=matchMedia("print");return O(h(window,"beforeprint").pipe(m(()=>!0)),h(window,"afterprint").pipe(m(()=>!1))).pipe(Q(e.matches))}function Nr(e,t){return e.pipe(v(r=>r?t():S))}function zr(e,t){return new j(r=>{let o=new XMLHttpRequest;return o.open("GET",`${e}`),o.responseType="blob",o.addEventListener("load",()=>{o.status>=200&&o.status<300?(r.next(o.response),r.complete()):r.error(new 
Error(o.statusText))}),o.addEventListener("error",()=>{r.error(new Error("Network error"))}),o.addEventListener("abort",()=>{r.complete()}),typeof(t==null?void 0:t.progress$)!="undefined"&&(o.addEventListener("progress",n=>{var i;if(n.lengthComputable)t.progress$.next(n.loaded/n.total*100);else{let a=(i=o.getResponseHeader("Content-Length"))!=null?i:0;t.progress$.next(n.loaded/+a*100)}}),t.progress$.next(5)),o.send(),()=>o.abort()})}function je(e,t){return zr(e,t).pipe(v(r=>r.text()),m(r=>JSON.parse(r)),G(1))}function fn(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/html")),G(1))}function un(e,t){let r=new DOMParser;return zr(e,t).pipe(v(o=>o.text()),m(o=>r.parseFromString(o,"text/xml")),G(1))}function dn(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function hn(){return O(h(window,"scroll",{passive:!0}),h(window,"resize",{passive:!0})).pipe(m(dn),Q(dn()))}function bn(){return{width:innerWidth,height:innerHeight}}function vn(){return h(window,"resize",{passive:!0}).pipe(m(bn),Q(bn()))}function gn(){return z([hn(),vn()]).pipe(m(([e,t])=>({offset:e,size:t})),G(1))}function mr(e,{viewport$:t,header$:r}){let o=t.pipe(ee("size")),n=z([o,r]).pipe(m(()=>Ve(e)));return z([r,t,n]).pipe(m(([{height:i},{offset:a,size:s},{x:p,y:c}])=>({offset:{x:a.x-p,y:a.y-c+i},size:s})))}function _a(e){return h(e,"message",t=>t.data)}function Aa(e){let t=new g;return t.subscribe(r=>e.postMessage(r)),t}function yn(e,t=new Worker(e)){let r=_a(t),o=Aa(t),n=new g;n.subscribe(o);let i=o.pipe(Z(),ie(!0));return n.pipe(Z(),Re(r.pipe(W(i))),pe())}var Ca=R("#__config"),Ot=JSON.parse(Ca.textContent);Ot.base=`${new URL(Ot.base,ye())}`;function xe(){return Ot}function B(e){return Ot.features.includes(e)}function Ee(e,t){return typeof t!="undefined"?Ot.translations[e].replace("#",t.toString()):Ot.translations[e]}function Se(e,t=document){return R(`[data-md-component=${e}]`,t)}function ae(e,t=document){return P(`[data-md-component=${e}]`,t)}function 
ka(e){let t=R(".md-typeset > :first-child",e);return h(t,"click",{once:!0}).pipe(m(()=>R(".md-typeset",e)),m(r=>({hash:__md_hash(r.innerHTML)})))}function xn(e){if(!B("announce.dismiss")||!e.childElementCount)return S;if(!e.hidden){let t=R(".md-typeset",e);__md_hash(t.innerHTML)===__md_get("__announce")&&(e.hidden=!0)}return C(()=>{let t=new g;return t.subscribe(({hash:r})=>{e.hidden=!0,__md_set("__announce",r)}),ka(e).pipe(w(r=>t.next(r)),_(()=>t.complete()),m(r=>$({ref:e},r)))})}function Ha(e,{target$:t}){return t.pipe(m(r=>({hidden:r!==e})))}function En(e,t){let r=new g;return r.subscribe(({hidden:o})=>{e.hidden=o}),Ha(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))}function Rt(e,t){return t==="inline"?x("div",{class:"md-tooltip md-tooltip--inline",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"})):x("div",{class:"md-tooltip",id:e,role:"tooltip"},x("div",{class:"md-tooltip__inner md-typeset"}))}function wn(...e){return x("div",{class:"md-tooltip2",role:"tooltip"},x("div",{class:"md-tooltip2__inner md-typeset"},e))}function Tn(e,t){if(t=t?`${t}_annotation_${e}`:void 0,t){let r=t?`#${t}`:void 0;return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("a",{href:r,class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}else return x("aside",{class:"md-annotation",tabIndex:0},Rt(t),x("span",{class:"md-annotation__index",tabIndex:-1},x("span",{"data-md-annotation-id":e})))}function Sn(e){return x("button",{class:"md-clipboard md-icon",title:Ee("clipboard.copy"),"data-clipboard-target":`#${e} > code`})}var Ln=Mt(qr());function Qr(e,t){let r=t&2,o=t&1,n=Object.keys(e.terms).filter(p=>!e.terms[p]).reduce((p,c)=>[...p,x("del",null,(0,Ln.default)(c))," "],[]).slice(0,-1),i=xe(),a=new URL(e.location,i.base);B("search.highlight")&&a.searchParams.set("h",Object.entries(e.terms).filter(([,p])=>p).reduce((p,[c])=>`${p} ${c}`.trim(),""));let{tags:s}=xe();return 
x("a",{href:`${a}`,class:"md-search-result__link",tabIndex:-1},x("article",{class:"md-search-result__article md-typeset","data-md-score":e.score.toFixed(2)},r>0&&x("div",{class:"md-search-result__icon md-icon"}),r>0&&x("h1",null,e.title),r<=0&&x("h2",null,e.title),o>0&&e.text.length>0&&e.text,e.tags&&x("nav",{class:"md-tags"},e.tags.map(p=>{let c=s?p in s?`md-tag-icon md-tag--${s[p]}`:"md-tag-icon":"";return x("span",{class:`md-tag ${c}`},p)})),o>0&&n.length>0&&x("p",{class:"md-search-result__terms"},Ee("search.result.term.missing"),": ",...n)))}function Mn(e){let t=e[0].score,r=[...e],o=xe(),n=r.findIndex(l=>!`${new URL(l.location,o.base)}`.includes("#")),[i]=r.splice(n,1),a=r.findIndex(l=>l.scoreQr(l,1)),...p.length?[x("details",{class:"md-search-result__more"},x("summary",{tabIndex:-1},x("div",null,p.length>0&&p.length===1?Ee("search.result.more.one"):Ee("search.result.more.other",p.length))),...p.map(l=>Qr(l,1)))]:[]];return x("li",{class:"md-search-result__item"},c)}function _n(e){return x("ul",{class:"md-source__facts"},Object.entries(e).map(([t,r])=>x("li",{class:`md-source__fact md-source__fact--${t}`},typeof r=="number"?sr(r):r)))}function Kr(e){let t=`tabbed-control tabbed-control--${e}`;return x("div",{class:t,hidden:!0},x("button",{class:"tabbed-button",tabIndex:-1,"aria-hidden":"true"}))}function An(e){return x("div",{class:"md-typeset__scrollwrap"},x("div",{class:"md-typeset__table"},e))}function Ra(e){var o;let t=xe(),r=new URL(`../${e.version}/`,t.base);return x("li",{class:"md-version__item"},x("a",{href:`${r}`,class:"md-version__link"},e.title,((o=t.version)==null?void 0:o.alias)&&e.aliases.length>0&&x("span",{class:"md-version__alias"},e.aliases[0])))}function Cn(e,t){var o;let r=xe();return e=e.filter(n=>{var i;return!((i=n.properties)!=null&&i.hidden)}),x("div",{class:"md-version"},x("button",{class:"md-version__current","aria-label":Ee("select.version")},t.title,((o=r.version)==null?void 
0:o.alias)&&t.aliases.length>0&&x("span",{class:"md-version__alias"},t.aliases[0])),x("ul",{class:"md-version__list"},e.map(Ra)))}var Ia=0;function ja(e){let t=z([et(e),$t(e)]).pipe(m(([o,n])=>o||n),K()),r=C(()=>Zo(e)).pipe(ne(Ne),pt(1),He(t),m(()=>en(e)));return t.pipe(Ae(o=>o),v(()=>z([t,r])),m(([o,n])=>({active:o,offset:n})),pe())}function Fa(e,t){let{content$:r,viewport$:o}=t,n=`__tooltip2_${Ia++}`;return C(()=>{let i=new g,a=new _r(!1);i.pipe(Z(),ie(!1)).subscribe(a);let s=a.pipe(Ht(c=>Le(+!c*250,kr)),K(),v(c=>c?r:S),w(c=>c.id=n),pe());z([i.pipe(m(({active:c})=>c)),s.pipe(v(c=>$t(c,250)),Q(!1))]).pipe(m(c=>c.some(l=>l))).subscribe(a);let p=a.pipe(b(c=>c),re(s,o),m(([c,l,{size:f}])=>{let u=e.getBoundingClientRect(),d=u.width/2;if(l.role==="tooltip")return{x:d,y:8+u.height};if(u.y>=f.height/2){let{height:y}=ce(l);return{x:d,y:-16-y}}else return{x:d,y:16+u.height}}));return z([s,i,p]).subscribe(([c,{offset:l},f])=>{c.style.setProperty("--md-tooltip-host-x",`${l.x}px`),c.style.setProperty("--md-tooltip-host-y",`${l.y}px`),c.style.setProperty("--md-tooltip-x",`${f.x}px`),c.style.setProperty("--md-tooltip-y",`${f.y}px`),c.classList.toggle("md-tooltip2--top",f.y<0),c.classList.toggle("md-tooltip2--bottom",f.y>=0)}),a.pipe(b(c=>c),re(s,(c,l)=>l),b(c=>c.role==="tooltip")).subscribe(c=>{let l=ce(R(":scope > *",c));c.style.setProperty("--md-tooltip-width",`${l.width}px`),c.style.setProperty("--md-tooltip-tail","0px")}),a.pipe(K(),ve(me),re(s)).subscribe(([c,l])=>{l.classList.toggle("md-tooltip2--active",c)}),z([a.pipe(b(c=>c)),s]).subscribe(([c,l])=>{l.role==="dialog"?(e.setAttribute("aria-controls",n),e.setAttribute("aria-haspopup","dialog")):e.setAttribute("aria-describedby",n)}),a.pipe(b(c=>!c)).subscribe(()=>{e.removeAttribute("aria-controls"),e.removeAttribute("aria-describedby"),e.removeAttribute("aria-haspopup")}),ja(e).pipe(w(c=>i.next(c)),_(()=>i.complete()),m(c=>$({ref:e},c)))})}function mt(e,{viewport$:t},r=document.body){return Fa(e,{content$:new j(o=>{let 
n=e.title,i=wn(n);return o.next(i),e.removeAttribute("title"),r.append(i),()=>{i.remove(),e.setAttribute("title",n)}}),viewport$:t})}function Ua(e,t){let r=C(()=>z([tn(e),Ne(t)])).pipe(m(([{x:o,y:n},i])=>{let{width:a,height:s}=ce(e);return{x:o-i.x+a/2,y:n-i.y+s/2}}));return et(e).pipe(v(o=>r.pipe(m(n=>({active:o,offset:n})),Te(+!o||1/0))))}function kn(e,t,{target$:r}){let[o,n]=Array.from(e.children);return C(()=>{let i=new g,a=i.pipe(Z(),ie(!0));return i.subscribe({next({offset:s}){e.style.setProperty("--md-tooltip-x",`${s.x}px`),e.style.setProperty("--md-tooltip-y",`${s.y}px`)},complete(){e.style.removeProperty("--md-tooltip-x"),e.style.removeProperty("--md-tooltip-y")}}),tt(e).pipe(W(a)).subscribe(s=>{e.toggleAttribute("data-md-visible",s)}),O(i.pipe(b(({active:s})=>s)),i.pipe(_e(250),b(({active:s})=>!s))).subscribe({next({active:s}){s?e.prepend(o):o.remove()},complete(){e.prepend(o)}}),i.pipe(Me(16,me)).subscribe(({active:s})=>{o.classList.toggle("md-tooltip--active",s)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:s})=>s)).subscribe({next(s){s?e.style.setProperty("--md-tooltip-0",`${-s}px`):e.style.removeProperty("--md-tooltip-0")},complete(){e.style.removeProperty("--md-tooltip-0")}}),h(n,"click").pipe(W(a),b(s=>!(s.metaKey||s.ctrlKey))).subscribe(s=>{s.stopPropagation(),s.preventDefault()}),h(n,"mousedown").pipe(W(a),re(i)).subscribe(([s,{active:p}])=>{var c;if(s.button!==0||s.metaKey||s.ctrlKey)s.preventDefault();else if(p){s.preventDefault();let l=e.parentElement.closest(".md-annotation");l instanceof HTMLElement?l.focus():(c=Ie())==null||c.blur()}}),r.pipe(W(a),b(s=>s===o),Ge(125)).subscribe(()=>e.focus()),Ua(e,t).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function Wa(e){return e.tagName==="CODE"?P(".c, .c1, .cm",e):[e]}function Da(e){let t=[];for(let r of Wa(e)){let o=[],n=document.createNodeIterator(r,NodeFilter.SHOW_TEXT);for(let i=n.nextNode();i;i=n.nextNode())o.push(i);for(let i of 
o){let a;for(;a=/(\(\d+\))(!)?/.exec(i.textContent);){let[,s,p]=a;if(typeof p=="undefined"){let c=i.splitText(a.index);i=c.splitText(s.length),t.push(c)}else{i.textContent=s,t.push(i);break}}}}return t}function Hn(e,t){t.append(...Array.from(e.childNodes))}function fr(e,t,{target$:r,print$:o}){let n=t.closest("[id]"),i=n==null?void 0:n.id,a=new Map;for(let s of Da(t)){let[,p]=s.textContent.match(/\((\d+)\)/);fe(`:scope > li:nth-child(${p})`,e)&&(a.set(p,Tn(p,i)),s.replaceWith(a.get(p)))}return a.size===0?S:C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=[];for(let[l,f]of a)c.push([R(".md-typeset",f),R(`:scope > li:nth-child(${l})`,e)]);return o.pipe(W(p)).subscribe(l=>{e.hidden=!l,e.classList.toggle("md-annotation-list",l);for(let[f,u]of c)l?Hn(f,u):Hn(u,f)}),O(...[...a].map(([,l])=>kn(l,t,{target$:r}))).pipe(_(()=>s.complete()),pe())})}function $n(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return $n(t)}}function Pn(e,t){return C(()=>{let r=$n(e);return typeof r!="undefined"?fr(r,e,t):S})}var Rn=Mt(Br());var Va=0;function In(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return In(t)}}function Na(e){return ge(e).pipe(m(({width:t})=>({scrollable:St(e).width>t})),ee("scrollable"))}function jn(e,t){let{matches:r}=matchMedia("(hover)"),o=C(()=>{let n=new g,i=n.pipe(jr(1));n.subscribe(({scrollable:c})=>{c&&r?e.setAttribute("tabindex","0"):e.removeAttribute("tabindex")});let a=[];if(Rn.default.isSupported()&&(e.closest(".copy")||B("content.code.copy")&&!e.closest(".no-copy"))){let c=e.closest("pre");c.id=`__code_${Va++}`;let l=Sn(c.id);c.insertBefore(l,e),B("content.tooltips")&&a.push(mt(l,{viewport$}))}let s=e.closest(".highlight");if(s instanceof HTMLElement){let c=In(s);if(typeof c!="undefined"&&(s.classList.contains("annotate")||B("content.code.annotate"))){let 
l=fr(c,e,t);a.push(ge(s).pipe(W(i),m(({width:f,height:u})=>f&&u),K(),v(f=>f?l:S)))}}return P(":scope > span[id]",e).length&&e.classList.add("md-code__content"),Na(e).pipe(w(c=>n.next(c)),_(()=>n.complete()),m(c=>$({ref:e},c)),Re(...a))});return B("content.lazy")?tt(e).pipe(b(n=>n),Te(1),v(()=>o)):o}function za(e,{target$:t,print$:r}){let o=!0;return O(t.pipe(m(n=>n.closest("details:not([open])")),b(n=>e===n),m(()=>({action:"open",reveal:!0}))),r.pipe(b(n=>n||!o),w(()=>o=e.open),m(n=>({action:n?"open":"close"}))))}function Fn(e,t){return C(()=>{let r=new g;return r.subscribe(({action:o,reveal:n})=>{e.toggleAttribute("open",o==="open"),n&&e.scrollIntoView()}),za(e,t).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}var Un=".node circle,.node ellipse,.node path,.node polygon,.node rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}marker{fill:var(--md-mermaid-edge-color)!important}.edgeLabel .label rect{fill:#0000}.flowchartTitleText{fill:var(--md-mermaid-label-fg-color)}.label{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.label foreignObject{line-height:normal;overflow:visible}.label div .edgeLabel{color:var(--md-mermaid-label-fg-color)}.edgeLabel,.edgeLabel p,.label div .edgeLabel{background-color:var(--md-mermaid-label-bg-color)}.edgeLabel,.edgeLabel p{fill:var(--md-mermaid-label-bg-color);color:var(--md-mermaid-edge-color)}.edgePath .path,.flowchart-link{stroke:var(--md-mermaid-edge-color);stroke-width:.05rem}.edgePath .arrowheadPath{fill:var(--md-mermaid-edge-color);stroke:none}.cluster rect{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}.cluster span{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}g #flowchart-circleEnd,g #flowchart-circleStart,g #flowchart-crossEnd,g #flowchart-crossStart,g #flowchart-pointEnd,g 
#flowchart-pointStart{stroke:none}.classDiagramTitleText{fill:var(--md-mermaid-label-fg-color)}g.classGroup line,g.classGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.classGroup text{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.classLabel .box{fill:var(--md-mermaid-label-bg-color);background-color:var(--md-mermaid-label-bg-color);opacity:1}.classLabel .label{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node .divider{stroke:var(--md-mermaid-node-fg-color)}.relation{stroke:var(--md-mermaid-edge-color)}.cardinality{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.cardinality text{fill:inherit!important}defs #classDiagram-compositionEnd,defs #classDiagram-compositionStart,defs #classDiagram-dependencyEnd,defs #classDiagram-dependencyStart,defs #classDiagram-extensionEnd,defs #classDiagram-extensionStart{fill:var(--md-mermaid-edge-color)!important;stroke:var(--md-mermaid-edge-color)!important}defs #classDiagram-aggregationEnd,defs #classDiagram-aggregationStart{fill:var(--md-mermaid-label-bg-color)!important;stroke:var(--md-mermaid-edge-color)!important}.statediagramTitleText{fill:var(--md-mermaid-label-fg-color)}g.stateGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.stateGroup .state-title{fill:var(--md-mermaid-label-fg-color)!important;font-family:var(--md-mermaid-font-family)}g.stateGroup .composit{fill:var(--md-mermaid-label-bg-color)}.nodeLabel,.nodeLabel p{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}a .nodeLabel{text-decoration:underline}.node circle.state-end,.node circle.state-start,.start-state{fill:var(--md-mermaid-edge-color);stroke:none}.end-state-inner,.end-state-outer{fill:var(--md-mermaid-edge-color)}.end-state-inner,.node 
circle.state-end{stroke:var(--md-mermaid-label-bg-color)}.transition{stroke:var(--md-mermaid-edge-color)}[id^=state-fork] rect,[id^=state-join] rect{fill:var(--md-mermaid-edge-color)!important;stroke:none!important}.statediagram-cluster.statediagram-cluster .inner{fill:var(--md-default-bg-color)}.statediagram-cluster rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.statediagram-state rect.divider{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}defs #statediagram-barbEnd{stroke:var(--md-mermaid-edge-color)}.entityTitleText{fill:var(--md-mermaid-label-fg-color)}.attributeBoxEven,.attributeBoxOdd{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityBox{fill:var(--md-mermaid-label-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityLabel{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.relationshipLabelBox{fill:var(--md-mermaid-label-bg-color);fill-opacity:1;background-color:var(--md-mermaid-label-bg-color);opacity:1}.relationshipLabel{fill:var(--md-mermaid-label-fg-color)}.relationshipLine{stroke:var(--md-mermaid-edge-color)}defs #ONE_OR_MORE_END *,defs #ONE_OR_MORE_START *,defs #ONLY_ONE_END *,defs #ONLY_ONE_START *,defs #ZERO_OR_MORE_END *,defs #ZERO_OR_MORE_START *,defs #ZERO_OR_ONE_END *,defs #ZERO_OR_ONE_START *{stroke:var(--md-mermaid-edge-color)!important}defs #ZERO_OR_MORE_END circle,defs #ZERO_OR_MORE_START circle{fill:var(--md-mermaid-label-bg-color)}text:not([class]):last-child{fill:var(--md-mermaid-label-fg-color)}.actor{fill:var(--md-mermaid-sequence-actor-bg-color);stroke:var(--md-mermaid-sequence-actor-border-color)}text.actor>tspan{fill:var(--md-mermaid-sequence-actor-fg-color);font-family:var(--md-mermaid-font-family)}line{stroke:var(--md-mermaid-sequence-actor-line-color)}.actor-man circle,.actor-man 
line{fill:var(--md-mermaid-sequence-actorman-bg-color);stroke:var(--md-mermaid-sequence-actorman-line-color)}.messageLine0,.messageLine1{stroke:var(--md-mermaid-sequence-message-line-color)}.note{fill:var(--md-mermaid-sequence-note-bg-color);stroke:var(--md-mermaid-sequence-note-border-color)}.loopText,.loopText>tspan,.messageText,.noteText>tspan{stroke:none;font-family:var(--md-mermaid-font-family)!important}.messageText{fill:var(--md-mermaid-sequence-message-fg-color)}.loopText,.loopText>tspan{fill:var(--md-mermaid-sequence-loop-fg-color)}.noteText>tspan{fill:var(--md-mermaid-sequence-note-fg-color)}#arrowhead path{fill:var(--md-mermaid-sequence-message-line-color);stroke:none}.loopLine{fill:var(--md-mermaid-sequence-loop-bg-color);stroke:var(--md-mermaid-sequence-loop-border-color)}.labelBox{fill:var(--md-mermaid-sequence-label-bg-color);stroke:none}.labelText,.labelText>span{fill:var(--md-mermaid-sequence-label-fg-color);font-family:var(--md-mermaid-font-family)}.sequenceNumber{fill:var(--md-mermaid-sequence-number-fg-color)}rect.rect{fill:var(--md-mermaid-sequence-box-bg-color);stroke:none}rect.rect+text.text{fill:var(--md-mermaid-sequence-box-fg-color)}defs #sequencenumber{fill:var(--md-mermaid-sequence-number-bg-color)!important}";var Gr,Qa=0;function Ka(){return typeof mermaid=="undefined"||mermaid instanceof Element?Tt("https://unpkg.com/mermaid@11/dist/mermaid.min.js"):I(void 0)}function Wn(e){return e.classList.remove("mermaid"),Gr||(Gr=Ka().pipe(w(()=>mermaid.initialize({startOnLoad:!1,themeCSS:Un,sequence:{actorFontSize:"16px",messageFontSize:"16px",noteFontSize:"16px"}})),m(()=>{}),G(1))),Gr.subscribe(()=>co(this,null,function*(){e.classList.add("mermaid");let t=`__mermaid_${Qa++}`,r=x("div",{class:"mermaid"}),o=e.textContent,{svg:n,fn:i}=yield mermaid.render(t,o),a=r.attachShadow({mode:"closed"});a.innerHTML=n,e.replaceWith(r),i==null||i(a)})),Gr.pipe(m(()=>({ref:e})))}var Dn=x("table");function Vn(e){return 
e.replaceWith(Dn),Dn.replaceWith(An(e)),I({ref:e})}function Ya(e){let t=e.find(r=>r.checked)||e[0];return O(...e.map(r=>h(r,"change").pipe(m(()=>R(`label[for="${r.id}"]`))))).pipe(Q(R(`label[for="${t.id}"]`)),m(r=>({active:r})))}function Nn(e,{viewport$:t,target$:r}){let o=R(".tabbed-labels",e),n=P(":scope > input",e),i=Kr("prev");e.append(i);let a=Kr("next");return e.append(a),C(()=>{let s=new g,p=s.pipe(Z(),ie(!0));z([s,ge(e),tt(e)]).pipe(W(p),Me(1,me)).subscribe({next([{active:c},l]){let f=Ve(c),{width:u}=ce(c);e.style.setProperty("--md-indicator-x",`${f.x}px`),e.style.setProperty("--md-indicator-width",`${u}px`);let d=pr(o);(f.xd.x+l.width)&&o.scrollTo({left:Math.max(0,f.x-16),behavior:"smooth"})},complete(){e.style.removeProperty("--md-indicator-x"),e.style.removeProperty("--md-indicator-width")}}),z([Ne(o),ge(o)]).pipe(W(p)).subscribe(([c,l])=>{let f=St(o);i.hidden=c.x<16,a.hidden=c.x>f.width-l.width-16}),O(h(i,"click").pipe(m(()=>-1)),h(a,"click").pipe(m(()=>1))).pipe(W(p)).subscribe(c=>{let{width:l}=ce(o);o.scrollBy({left:l*c,behavior:"smooth"})}),r.pipe(W(p),b(c=>n.includes(c))).subscribe(c=>c.click()),o.classList.add("tabbed-labels--linked");for(let c of n){let l=R(`label[for="${c.id}"]`);l.replaceChildren(x("a",{href:`#${l.htmlFor}`,tabIndex:-1},...Array.from(l.childNodes))),h(l.firstElementChild,"click").pipe(W(p),b(f=>!(f.metaKey||f.ctrlKey)),w(f=>{f.preventDefault(),f.stopPropagation()})).subscribe(()=>{history.replaceState({},"",`#${l.htmlFor}`),l.click()})}return B("content.tabs.link")&&s.pipe(Ce(1),re(t)).subscribe(([{active:c},{offset:l}])=>{let f=c.innerText.trim();if(c.hasAttribute("data-md-switching"))c.removeAttribute("data-md-switching");else{let u=e.offsetTop-l.y;for(let y of P("[data-tabs]"))for(let L of P(":scope > input",y)){let X=R(`label[for="${L.id}"]`);if(X!==c&&X.innerText.trim()===f){X.setAttribute("data-md-switching",""),L.click();break}}window.scrollTo({top:e.offsetTop-u});let d=__md_get("__tabs")||[];__md_set("__tabs",[...new 
Set([f,...d])])}}),s.pipe(W(p)).subscribe(()=>{for(let c of P("audio, video",e))c.pause()}),Ya(n).pipe(w(c=>s.next(c)),_(()=>s.complete()),m(c=>$({ref:e},c)))}).pipe(Ke(se))}function zn(e,{viewport$:t,target$:r,print$:o}){return O(...P(".annotate:not(.highlight)",e).map(n=>Pn(n,{target$:r,print$:o})),...P("pre:not(.mermaid) > code",e).map(n=>jn(n,{target$:r,print$:o})),...P("pre.mermaid",e).map(n=>Wn(n)),...P("table:not([class])",e).map(n=>Vn(n)),...P("details",e).map(n=>Fn(n,{target$:r,print$:o})),...P("[data-tabs]",e).map(n=>Nn(n,{viewport$:t,target$:r})),...P("[title]",e).filter(()=>B("content.tooltips")).map(n=>mt(n,{viewport$:t})))}function Ba(e,{alert$:t}){return t.pipe(v(r=>O(I(!0),I(!1).pipe(Ge(2e3))).pipe(m(o=>({message:r,active:o})))))}function qn(e,t){let r=R(".md-typeset",e);return C(()=>{let o=new g;return o.subscribe(({message:n,active:i})=>{e.classList.toggle("md-dialog--active",i),r.textContent=n}),Ba(e,t).pipe(w(n=>o.next(n)),_(()=>o.complete()),m(n=>$({ref:e},n)))})}var Ga=0;function Ja(e,t){document.body.append(e);let{width:r}=ce(e);e.style.setProperty("--md-tooltip-width",`${r}px`),e.remove();let o=cr(t),n=typeof o!="undefined"?Ne(o):I({x:0,y:0}),i=O(et(t),$t(t)).pipe(K());return z([i,n]).pipe(m(([a,s])=>{let{x:p,y:c}=Ve(t),l=ce(t),f=t.closest("table");return f&&t.parentElement&&(p+=f.offsetLeft+t.parentElement.offsetLeft,c+=f.offsetTop+t.parentElement.offsetTop),{active:a,offset:{x:p-s.x+l.width/2-r/2,y:c-s.y+l.height+8}}}))}function Qn(e){let t=e.title;if(!t.length)return S;let r=`__tooltip_${Ga++}`,o=Rt(r,"inline"),n=R(".md-typeset",o);return n.innerHTML=t,C(()=>{let i=new g;return 
i.subscribe({next({offset:a}){o.style.setProperty("--md-tooltip-x",`${a.x}px`),o.style.setProperty("--md-tooltip-y",`${a.y}px`)},complete(){o.style.removeProperty("--md-tooltip-x"),o.style.removeProperty("--md-tooltip-y")}}),O(i.pipe(b(({active:a})=>a)),i.pipe(_e(250),b(({active:a})=>!a))).subscribe({next({active:a}){a?(e.insertAdjacentElement("afterend",o),e.setAttribute("aria-describedby",r),e.removeAttribute("title")):(o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t))},complete(){o.remove(),e.removeAttribute("aria-describedby"),e.setAttribute("title",t)}}),i.pipe(Me(16,me)).subscribe(({active:a})=>{o.classList.toggle("md-tooltip--active",a)}),i.pipe(pt(125,me),b(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:a})=>a)).subscribe({next(a){a?o.style.setProperty("--md-tooltip-0",`${-a}px`):o.style.removeProperty("--md-tooltip-0")},complete(){o.style.removeProperty("--md-tooltip-0")}}),Ja(o,e).pipe(w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))}).pipe(Ke(se))}function Xa({viewport$:e}){if(!B("header.autohide"))return I(!1);let t=e.pipe(m(({offset:{y:n}})=>n),Be(2,1),m(([n,i])=>[nMath.abs(i-n.y)>100),m(([,[n]])=>n),K()),o=ze("search");return z([e,o]).pipe(m(([{offset:n},i])=>n.y>400&&!i),K(),v(n=>n?r:I(!1)),Q(!1))}function Kn(e,t){return C(()=>z([ge(e),Xa(t)])).pipe(m(([{height:r},o])=>({height:r,hidden:o})),K((r,o)=>r.height===o.height&&r.hidden===o.hidden),G(1))}function Yn(e,{header$:t,main$:r}){return C(()=>{let o=new g,n=o.pipe(Z(),ie(!0));o.pipe(ee("active"),He(t)).subscribe(([{active:a},{hidden:s}])=>{e.classList.toggle("md-header--shadow",a&&!s),e.hidden=s});let i=ue(P("[title]",e)).pipe(b(()=>B("content.tooltips")),ne(a=>Qn(a)));return r.subscribe(o),t.pipe(W(n),m(a=>$({ref:e},a)),Re(i.pipe(W(n))))})}function Za(e,{viewport$:t,header$:r}){return mr(e,{viewport$:t,header$:r}).pipe(m(({offset:{y:o}})=>{let{height:n}=ce(e);return{active:o>=n}}),ee("active"))}function Bn(e,t){return C(()=>{let r=new 
g;r.subscribe({next({active:n}){e.classList.toggle("md-header__title--active",n)},complete(){e.classList.remove("md-header__title--active")}});let o=fe(".md-content h1");return typeof o=="undefined"?S:Za(o,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))})}function Gn(e,{viewport$:t,header$:r}){let o=r.pipe(m(({height:i})=>i),K()),n=o.pipe(v(()=>ge(e).pipe(m(({height:i})=>({top:e.offsetTop,bottom:e.offsetTop+i})),ee("bottom"))));return z([o,n,t]).pipe(m(([i,{top:a,bottom:s},{offset:{y:p},size:{height:c}}])=>(c=Math.max(0,c-Math.max(0,a-p,i)-Math.max(0,c+p-s)),{offset:a-i,height:c,active:a-i<=p})),K((i,a)=>i.offset===a.offset&&i.height===a.height&&i.active===a.active))}function es(e){let t=__md_get("__palette")||{index:e.findIndex(o=>matchMedia(o.getAttribute("data-md-color-media")).matches)},r=Math.max(0,Math.min(t.index,e.length-1));return I(...e).pipe(ne(o=>h(o,"change").pipe(m(()=>o))),Q(e[r]),m(o=>({index:e.indexOf(o),color:{media:o.getAttribute("data-md-color-media"),scheme:o.getAttribute("data-md-color-scheme"),primary:o.getAttribute("data-md-color-primary"),accent:o.getAttribute("data-md-color-accent")}})),G(1))}function Jn(e){let t=P("input",e),r=x("meta",{name:"theme-color"});document.head.appendChild(r);let o=x("meta",{name:"color-scheme"});document.head.appendChild(o);let n=Pt("(prefers-color-scheme: light)");return C(()=>{let i=new g;return i.subscribe(a=>{if(document.body.setAttribute("data-md-color-switching",""),a.color.media==="(prefers-color-scheme)"){let s=matchMedia("(prefers-color-scheme: light)"),p=document.querySelector(s.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");a.color.scheme=p.getAttribute("data-md-color-scheme"),a.color.primary=p.getAttribute("data-md-color-primary"),a.color.accent=p.getAttribute("data-md-color-accent")}for(let[s,p]of Object.entries(a.color))document.body.setAttribute(`data-md-color-${s}`,p);for(let 
s=0;sa.key==="Enter"),re(i,(a,s)=>s)).subscribe(({index:a})=>{a=(a+1)%t.length,t[a].click(),t[a].focus()}),i.pipe(m(()=>{let a=Se("header"),s=window.getComputedStyle(a);return o.content=s.colorScheme,s.backgroundColor.match(/\d+/g).map(p=>(+p).toString(16).padStart(2,"0")).join("")})).subscribe(a=>r.content=`#${a}`),i.pipe(ve(se)).subscribe(()=>{document.body.removeAttribute("data-md-color-switching")}),es(t).pipe(W(n.pipe(Ce(1))),ct(),w(a=>i.next(a)),_(()=>i.complete()),m(a=>$({ref:e},a)))})}function Xn(e,{progress$:t}){return C(()=>{let r=new g;return r.subscribe(({value:o})=>{e.style.setProperty("--md-progress-value",`${o}`)}),t.pipe(w(o=>r.next({value:o})),_(()=>r.complete()),m(o=>({ref:e,value:o})))})}var Jr=Mt(Br());function ts(e){e.setAttribute("data-md-copying","");let t=e.closest("[data-copy]"),r=t?t.getAttribute("data-copy"):e.innerText;return e.removeAttribute("data-md-copying"),r.trimEnd()}function Zn({alert$:e}){Jr.default.isSupported()&&new j(t=>{new Jr.default("[data-clipboard-target], [data-clipboard-text]",{text:r=>r.getAttribute("data-clipboard-text")||ts(R(r.getAttribute("data-clipboard-target")))}).on("success",r=>t.next(r))}).pipe(w(t=>{t.trigger.focus()}),m(()=>Ee("clipboard.copied"))).subscribe(e)}function ei(e,t){return e.protocol=t.protocol,e.hostname=t.hostname,e}function rs(e,t){let r=new Map;for(let o of P("url",e)){let n=R("loc",o),i=[ei(new URL(n.textContent),t)];r.set(`${i[0]}`,i);for(let a of P("[rel=alternate]",o)){let s=a.getAttribute("href");s!=null&&i.push(ei(new URL(s),t))}}return r}function ur(e){return un(new URL("sitemap.xml",e)).pipe(m(t=>rs(t,new URL(e))),de(()=>I(new Map)))}function os(e,t){if(!(e.target instanceof Element))return S;let r=e.target.closest("a");if(r===null)return S;if(r.target||e.metaKey||e.ctrlKey)return S;let o=new URL(r.href);return o.search=o.hash="",t.has(`${o}`)?(e.preventDefault(),I(new URL(r.href))):S}function ti(e){let t=new Map;for(let r of P(":scope > *",e.head))t.set(r.outerHTML,r);return 
t}function ri(e){for(let t of P("[href], [src]",e))for(let r of["href","src"]){let o=t.getAttribute(r);if(o&&!/^(?:[a-z]+:)?\/\//i.test(o)){t[r]=t[r];break}}return I(e)}function ns(e){for(let o of["[data-md-component=announce]","[data-md-component=container]","[data-md-component=header-topic]","[data-md-component=outdated]","[data-md-component=logo]","[data-md-component=skip]",...B("navigation.tabs.sticky")?["[data-md-component=tabs]"]:[]]){let n=fe(o),i=fe(o,e);typeof n!="undefined"&&typeof i!="undefined"&&n.replaceWith(i)}let t=ti(document);for(let[o,n]of ti(e))t.has(o)?t.delete(o):document.head.appendChild(n);for(let o of t.values()){let n=o.getAttribute("name");n!=="theme-color"&&n!=="color-scheme"&&o.remove()}let r=Se("container");return We(P("script",r)).pipe(v(o=>{let n=e.createElement("script");if(o.src){for(let i of o.getAttributeNames())n.setAttribute(i,o.getAttribute(i));return o.replaceWith(n),new j(i=>{n.onload=()=>i.complete()})}else return n.textContent=o.textContent,o.replaceWith(n),S}),Z(),ie(document))}function oi({location$:e,viewport$:t,progress$:r}){let o=xe();if(location.protocol==="file:")return S;let n=ur(o.base);I(document).subscribe(ri);let i=h(document.body,"click").pipe(He(n),v(([p,c])=>os(p,c)),pe()),a=h(window,"popstate").pipe(m(ye),pe());i.pipe(re(t)).subscribe(([p,{offset:c}])=>{history.replaceState(c,""),history.pushState(null,"",p)}),O(i,a).subscribe(e);let s=e.pipe(ee("pathname"),v(p=>fn(p,{progress$:r}).pipe(de(()=>(lt(p,!0),S)))),v(ri),v(ns),pe());return O(s.pipe(re(e,(p,c)=>c)),s.pipe(v(()=>e),ee("pathname"),v(()=>e),ee("hash")),e.pipe(K((p,c)=>p.pathname===c.pathname&&p.hash===c.hash),v(()=>i),w(()=>history.back()))).subscribe(p=>{var c,l;history.state!==null||!p.hash?window.scrollTo(0,(l=(c=history.state)==null?void 
0:c.y)!=null?l:0):(history.scrollRestoration="auto",pn(p.hash),history.scrollRestoration="manual")}),e.subscribe(()=>{history.scrollRestoration="manual"}),h(window,"beforeunload").subscribe(()=>{history.scrollRestoration="auto"}),t.pipe(ee("offset"),_e(100)).subscribe(({offset:p})=>{history.replaceState(p,"")}),s}var ni=Mt(qr());function ii(e){let t=e.separator.split("|").map(n=>n.replace(/(\(\?[!=<][^)]+\))/g,"").length===0?"\uFFFD":n).join("|"),r=new RegExp(t,"img"),o=(n,i,a)=>`${i}${a}`;return n=>{n=n.replace(/[\s*+\-:~^]+/g," ").trim();let i=new RegExp(`(^|${e.separator}|)(${n.replace(/[|\\{}()[\]^$+*?.-]/g,"\\$&").replace(r,"|")})`,"img");return a=>(0,ni.default)(a).replace(i,o).replace(/<\/mark>(\s+)]*>/img,"$1")}}function jt(e){return e.type===1}function dr(e){return e.type===3}function ai(e,t){let r=yn(e);return O(I(location.protocol!=="file:"),ze("search")).pipe(Ae(o=>o),v(()=>t)).subscribe(({config:o,docs:n})=>r.next({type:0,data:{config:o,docs:n,options:{suggest:B("search.suggest")}}})),r}function si(e){var l;let{selectedVersionSitemap:t,selectedVersionBaseURL:r,currentLocation:o,currentBaseURL:n}=e,i=(l=Xr(n))==null?void 0:l.pathname;if(i===void 0)return;let a=ss(o.pathname,i);if(a===void 0)return;let s=ps(t.keys());if(!t.has(s))return;let p=Xr(a,s);if(!p||!t.has(p.href))return;let c=Xr(a,r);if(c)return c.hash=o.hash,c.search=o.search,c}function Xr(e,t){try{return new URL(e,t)}catch(r){return}}function ss(e,t){if(e.startsWith(t))return e.slice(t.length)}function cs(e,t){let r=Math.min(e.length,t.length),o;for(o=0;oS)),o=r.pipe(m(n=>{let[,i]=t.base.match(/([^/]+)\/?$/);return n.find(({version:a,aliases:s})=>a===i||s.includes(i))||n[0]}));r.pipe(m(n=>new Map(n.map(i=>[`${new URL(`../${i.version}/`,t.base)}`,i]))),v(n=>h(document.body,"click").pipe(b(i=>!i.metaKey&&!i.ctrlKey),re(o),v(([i,a])=>{if(i.target instanceof Element){let s=i.target.closest("a");if(s&&!s.target&&n.has(s.href)){let 
p=s.href;return!i.target.closest(".md-version")&&n.get(p)===a?S:(i.preventDefault(),I(new URL(p)))}}return S}),v(i=>ur(i).pipe(m(a=>{var s;return(s=si({selectedVersionSitemap:a,selectedVersionBaseURL:i,currentLocation:ye(),currentBaseURL:t.base}))!=null?s:i})))))).subscribe(n=>lt(n,!0)),z([r,o]).subscribe(([n,i])=>{R(".md-header__topic").appendChild(Cn(n,i))}),e.pipe(v(()=>o)).subscribe(n=>{var a;let i=__md_get("__outdated",sessionStorage);if(i===null){i=!0;let s=((a=t.version)==null?void 0:a.default)||"latest";Array.isArray(s)||(s=[s]);e:for(let p of s)for(let c of n.aliases.concat(n.version))if(new RegExp(p,"i").test(c)){i=!1;break e}__md_set("__outdated",i,sessionStorage)}if(i)for(let s of ae("outdated"))s.hidden=!1})}function ls(e,{worker$:t}){let{searchParams:r}=ye();r.has("q")&&(Je("search",!0),e.value=r.get("q"),e.focus(),ze("search").pipe(Ae(i=>!i)).subscribe(()=>{let i=ye();i.searchParams.delete("q"),history.replaceState({},"",`${i}`)}));let o=et(e),n=O(t.pipe(Ae(jt)),h(e,"keyup"),o).pipe(m(()=>e.value),K());return z([n,o]).pipe(m(([i,a])=>({value:i,focus:a})),G(1))}function pi(e,{worker$:t}){let r=new g,o=r.pipe(Z(),ie(!0));z([t.pipe(Ae(jt)),r],(i,a)=>a).pipe(ee("value")).subscribe(({value:i})=>t.next({type:2,data:i})),r.pipe(ee("focus")).subscribe(({focus:i})=>{i&&Je("search",i)}),h(e.form,"reset").pipe(W(o)).subscribe(()=>e.focus());let n=R("header [for=__search]");return h(n,"click").subscribe(()=>e.focus()),ls(e,{worker$:t}).pipe(w(i=>r.next(i)),_(()=>r.complete()),m(i=>$({ref:e},i)),G(1))}function li(e,{worker$:t,query$:r}){let o=new g,n=on(e.parentElement).pipe(b(Boolean)),i=e.parentElement,a=R(":scope > :first-child",e),s=R(":scope > :last-child",e);ze("search").subscribe(l=>s.setAttribute("role",l?"list":"presentation")),o.pipe(re(r),Wr(t.pipe(Ae(jt)))).subscribe(([{items:l},{value:f}])=>{switch(l.length){case 0:a.textContent=f.length?Ee("search.result.none"):Ee("search.result.placeholder");break;case 
1:a.textContent=Ee("search.result.one");break;default:let u=sr(l.length);a.textContent=Ee("search.result.other",u)}});let p=o.pipe(w(()=>s.innerHTML=""),v(({items:l})=>O(I(...l.slice(0,10)),I(...l.slice(10)).pipe(Be(4),Vr(n),v(([f])=>f)))),m(Mn),pe());return p.subscribe(l=>s.appendChild(l)),p.pipe(ne(l=>{let f=fe("details",l);return typeof f=="undefined"?S:h(f,"toggle").pipe(W(o),m(()=>f))})).subscribe(l=>{l.open===!1&&l.offsetTop<=i.scrollTop&&i.scrollTo({top:l.offsetTop})}),t.pipe(b(dr),m(({data:l})=>l)).pipe(w(l=>o.next(l)),_(()=>o.complete()),m(l=>$({ref:e},l)))}function ms(e,{query$:t}){return t.pipe(m(({value:r})=>{let o=ye();return o.hash="",r=r.replace(/\s+/g,"+").replace(/&/g,"%26").replace(/=/g,"%3D"),o.search=`q=${r}`,{url:o}}))}function mi(e,t){let r=new g,o=r.pipe(Z(),ie(!0));return r.subscribe(({url:n})=>{e.setAttribute("data-clipboard-text",e.href),e.href=`${n}`}),h(e,"click").pipe(W(o)).subscribe(n=>n.preventDefault()),ms(e,t).pipe(w(n=>r.next(n)),_(()=>r.complete()),m(n=>$({ref:e},n)))}function fi(e,{worker$:t,keyboard$:r}){let o=new g,n=Se("search-query"),i=O(h(n,"keydown"),h(n,"focus")).pipe(ve(se),m(()=>n.value),K());return o.pipe(He(i),m(([{suggest:s},p])=>{let c=p.split(/([\s-]+)/);if(s!=null&&s.length&&c[c.length-1]){let l=s[s.length-1];l.startsWith(c[c.length-1])&&(c[c.length-1]=l)}else c.length=0;return c})).subscribe(s=>e.innerHTML=s.join("").replace(/\s/g," ")),r.pipe(b(({mode:s})=>s==="search")).subscribe(s=>{switch(s.type){case"ArrowRight":e.innerText.length&&n.selectionStart===n.value.length&&(n.value=e.innerText);break}}),t.pipe(b(dr),m(({data:s})=>s)).pipe(w(s=>o.next(s)),_(()=>o.complete()),m(()=>({ref:e})))}function ui(e,{index$:t,keyboard$:r}){let o=xe();try{let n=ai(o.search,t),i=Se("search-query",e),a=Se("search-result",e);h(e,"click").pipe(b(({target:p})=>p instanceof Element&&!!p.closest("a"))).subscribe(()=>Je("search",!1)),r.pipe(b(({mode:p})=>p==="search")).subscribe(p=>{let c=Ie();switch(p.type){case"Enter":if(c===i){let 
l=new Map;for(let f of P(":first-child [href]",a)){let u=f.firstElementChild;l.set(f,parseFloat(u.getAttribute("data-md-score")))}if(l.size){let[[f]]=[...l].sort(([,u],[,d])=>d-u);f.click()}p.claim()}break;case"Escape":case"Tab":Je("search",!1),i.blur();break;case"ArrowUp":case"ArrowDown":if(typeof c=="undefined")i.focus();else{let l=[i,...P(":not(details) > [href], summary, details[open] [href]",a)],f=Math.max(0,(Math.max(0,l.indexOf(c))+l.length+(p.type==="ArrowUp"?-1:1))%l.length);l[f].focus()}p.claim();break;default:i!==Ie()&&i.focus()}}),r.pipe(b(({mode:p})=>p==="global")).subscribe(p=>{switch(p.type){case"f":case"s":case"/":i.focus(),i.select(),p.claim();break}});let s=pi(i,{worker$:n});return O(s,li(a,{worker$:n,query$:s})).pipe(Re(...ae("search-share",e).map(p=>mi(p,{query$:s})),...ae("search-suggest",e).map(p=>fi(p,{worker$:n,keyboard$:r}))))}catch(n){return e.hidden=!0,Ye}}function di(e,{index$:t,location$:r}){return z([t,r.pipe(Q(ye()),b(o=>!!o.searchParams.get("h")))]).pipe(m(([o,n])=>ii(o.config)(n.searchParams.get("h"))),m(o=>{var a;let n=new Map,i=document.createNodeIterator(e,NodeFilter.SHOW_TEXT);for(let s=i.nextNode();s;s=i.nextNode())if((a=s.parentElement)!=null&&a.offsetHeight){let p=s.textContent,c=o(p);c.length>p.length&&n.set(s,c)}for(let[s,p]of n){let{childNodes:c}=x("span",null,p);s.replaceWith(...Array.from(c))}return{ref:e,nodes:n}}))}function fs(e,{viewport$:t,main$:r}){let o=e.closest(".md-grid"),n=o.offsetTop-o.parentElement.offsetTop;return z([r,t]).pipe(m(([{offset:i,height:a},{offset:{y:s}}])=>(a=a+Math.min(n,Math.max(0,s-i))-n,{height:a,locked:s>=i+n})),K((i,a)=>i.height===a.height&&i.locked===a.locked))}function Zr(e,o){var n=o,{header$:t}=n,r=so(n,["header$"]);let i=R(".md-sidebar__scrollwrap",e),{y:a}=Ve(i);return C(()=>{let s=new g,p=s.pipe(Z(),ie(!0)),c=s.pipe(Me(0,me));return 
c.pipe(re(t)).subscribe({next([{height:l},{height:f}]){i.style.height=`${l-2*a}px`,e.style.top=`${f}px`},complete(){i.style.height="",e.style.top=""}}),c.pipe(Ae()).subscribe(()=>{for(let l of P(".md-nav__link--active[href]",e)){if(!l.clientHeight)continue;let f=l.closest(".md-sidebar__scrollwrap");if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2})}}}),ue(P("label[tabindex]",e)).pipe(ne(l=>h(l,"click").pipe(ve(se),m(()=>l),W(p)))).subscribe(l=>{let f=R(`[id="${l.htmlFor}"]`);R(`[aria-labelledby="${l.id}"]`).setAttribute("aria-expanded",`${f.checked}`)}),fs(e,r).pipe(w(l=>s.next(l)),_(()=>s.complete()),m(l=>$({ref:e},l)))})}function hi(e,t){if(typeof t!="undefined"){let r=`https://api.github.com/repos/${e}/${t}`;return st(je(`${r}/releases/latest`).pipe(de(()=>S),m(o=>({version:o.tag_name})),De({})),je(r).pipe(de(()=>S),m(o=>({stars:o.stargazers_count,forks:o.forks_count})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}else{let r=`https://api.github.com/users/${e}`;return je(r).pipe(m(o=>({repositories:o.public_repos})),De({}))}}function bi(e,t){let r=`https://${e}/api/v4/projects/${encodeURIComponent(t)}`;return st(je(`${r}/releases/permalink/latest`).pipe(de(()=>S),m(({tag_name:o})=>({version:o})),De({})),je(r).pipe(de(()=>S),m(({star_count:o,forks_count:n})=>({stars:o,forks:n})),De({}))).pipe(m(([o,n])=>$($({},o),n)))}function vi(e){let t=e.match(/^.+github\.com\/([^/]+)\/?([^/]+)?/i);if(t){let[,r,o]=t;return hi(r,o)}if(t=e.match(/^.+?([^/]*gitlab[^/]+)\/(.+?)\/?$/i),t){let[,r,o]=t;return bi(r,o)}return S}var us;function ds(e){return us||(us=C(()=>{let t=__md_get("__source",sessionStorage);if(t)return I(t);if(ae("consent").length){let o=__md_get("__consent");if(!(o&&o.github))return S}return vi(e.href).pipe(w(o=>__md_set("__source",o,sessionStorage)))}).pipe(de(()=>S),b(t=>Object.keys(t).length>0),m(t=>({facts:t})),G(1)))}function gi(e){let t=R(":scope > :last-child",e);return C(()=>{let r=new g;return 
r.subscribe(({facts:o})=>{t.appendChild(_n(o)),t.classList.add("md-source__repository--active")}),ds(e).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function hs(e,{viewport$:t,header$:r}){return ge(document.body).pipe(v(()=>mr(e,{header$:r,viewport$:t})),m(({offset:{y:o}})=>({hidden:o>=10})),ee("hidden"))}function yi(e,t){return C(()=>{let r=new g;return r.subscribe({next({hidden:o}){e.hidden=o},complete(){e.hidden=!1}}),(B("navigation.tabs.sticky")?I({hidden:!1}):hs(e,t)).pipe(w(o=>r.next(o)),_(()=>r.complete()),m(o=>$({ref:e},o)))})}function bs(e,{viewport$:t,header$:r}){let o=new Map,n=P(".md-nav__link",e);for(let s of n){let p=decodeURIComponent(s.hash.substring(1)),c=fe(`[id="${p}"]`);typeof c!="undefined"&&o.set(s,c)}let i=r.pipe(ee("height"),m(({height:s})=>{let p=Se("main"),c=R(":scope > :first-child",p);return s+.8*(c.offsetTop-p.offsetTop)}),pe());return ge(document.body).pipe(ee("height"),v(s=>C(()=>{let p=[];return I([...o].reduce((c,[l,f])=>{for(;p.length&&o.get(p[p.length-1]).tagName>=f.tagName;)p.pop();let u=f.offsetTop;for(;!u&&f.parentElement;)f=f.parentElement,u=f.offsetTop;let d=f.offsetParent;for(;d;d=d.offsetParent)u+=d.offsetTop;return c.set([...p=[...p,l]].reverse(),u)},new Map))}).pipe(m(p=>new Map([...p].sort(([,c],[,l])=>c-l))),He(i),v(([p,c])=>t.pipe(Fr(([l,f],{offset:{y:u},size:d})=>{let y=u+d.height>=Math.floor(s.height);for(;f.length;){let[,L]=f[0];if(L-c=u&&!y)f=[l.pop(),...f];else break}return[l,f]},[[],[...p]]),K((l,f)=>l[0]===f[0]&&l[1]===f[1])))))).pipe(m(([s,p])=>({prev:s.map(([c])=>c),next:p.map(([c])=>c)})),Q({prev:[],next:[]}),Be(2,1),m(([s,p])=>s.prev.length{let i=new g,a=i.pipe(Z(),ie(!0));if(i.subscribe(({prev:s,next:p})=>{for(let[c]of p)c.classList.remove("md-nav__link--passed"),c.classList.remove("md-nav__link--active");for(let[c,[l]]of s.entries())l.classList.add("md-nav__link--passed"),l.classList.toggle("md-nav__link--active",c===s.length-1)}),B("toc.follow")){let 
s=O(t.pipe(_e(1),m(()=>{})),t.pipe(_e(250),m(()=>"smooth")));i.pipe(b(({prev:p})=>p.length>0),He(o.pipe(ve(se))),re(s)).subscribe(([[{prev:p}],c])=>{let[l]=p[p.length-1];if(l.offsetHeight){let f=cr(l);if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=ce(f);f.scrollTo({top:u-d/2,behavior:c})}}})}return B("navigation.tracking")&&t.pipe(W(a),ee("offset"),_e(250),Ce(1),W(n.pipe(Ce(1))),ct({delay:250}),re(i)).subscribe(([,{prev:s}])=>{let p=ye(),c=s[s.length-1];if(c&&c.length){let[l]=c,{hash:f}=new URL(l.href);p.hash!==f&&(p.hash=f,history.replaceState({},"",`${p}`))}else p.hash="",history.replaceState({},"",`${p}`)}),bs(e,{viewport$:t,header$:r}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))})}function vs(e,{viewport$:t,main$:r,target$:o}){let n=t.pipe(m(({offset:{y:a}})=>a),Be(2,1),m(([a,s])=>a>s&&s>0),K()),i=r.pipe(m(({active:a})=>a));return z([i,n]).pipe(m(([a,s])=>!(a&&s)),K(),W(o.pipe(Ce(1))),ie(!0),ct({delay:250}),m(a=>({hidden:a})))}function Ei(e,{viewport$:t,header$:r,main$:o,target$:n}){let i=new g,a=i.pipe(Z(),ie(!0));return i.subscribe({next({hidden:s}){e.hidden=s,s?(e.setAttribute("tabindex","-1"),e.blur()):e.removeAttribute("tabindex")},complete(){e.style.top="",e.hidden=!0,e.removeAttribute("tabindex")}}),r.pipe(W(a),ee("height")).subscribe(({height:s})=>{e.style.top=`${s+16}px`}),h(e,"click").subscribe(s=>{s.preventDefault(),window.scrollTo({top:0})}),vs(e,{viewport$:t,main$:o,target$:n}).pipe(w(s=>i.next(s)),_(()=>i.complete()),m(s=>$({ref:e},s)))}function wi({document$:e,viewport$:t}){e.pipe(v(()=>P(".md-ellipsis")),ne(r=>tt(r).pipe(W(e.pipe(Ce(1))),b(o=>o),m(()=>r),Te(1))),b(r=>r.offsetWidth{let o=r.innerText,n=r.closest("a")||r;return n.title=o,B("content.tooltips")?mt(n,{viewport$:t}).pipe(W(e.pipe(Ce(1))),_(()=>n.removeAttribute("title"))):S})).subscribe(),B("content.tooltips")&&e.pipe(v(()=>P(".md-status")),ne(r=>mt(r,{viewport$:t}))).subscribe()}function 
Ti({document$:e,tablet$:t}){e.pipe(v(()=>P(".md-toggle--indeterminate")),w(r=>{r.indeterminate=!0,r.checked=!1}),ne(r=>h(r,"change").pipe(Dr(()=>r.classList.contains("md-toggle--indeterminate")),m(()=>r))),re(t)).subscribe(([r,o])=>{r.classList.remove("md-toggle--indeterminate"),o&&(r.checked=!1)})}function gs(){return/(iPad|iPhone|iPod)/.test(navigator.userAgent)}function Si({document$:e}){e.pipe(v(()=>P("[data-md-scrollfix]")),w(t=>t.removeAttribute("data-md-scrollfix")),b(gs),ne(t=>h(t,"touchstart").pipe(m(()=>t)))).subscribe(t=>{let r=t.scrollTop;r===0?t.scrollTop=1:r+t.offsetHeight===t.scrollHeight&&(t.scrollTop=r-1)})}function Oi({viewport$:e,tablet$:t}){z([ze("search"),t]).pipe(m(([r,o])=>r&&!o),v(r=>I(r).pipe(Ge(r?400:100))),re(e)).subscribe(([r,{offset:{y:o}}])=>{if(r)document.body.setAttribute("data-md-scrolllock",""),document.body.style.top=`-${o}px`;else{let n=-1*parseInt(document.body.style.top,10);document.body.removeAttribute("data-md-scrolllock"),document.body.style.top="",n&&window.scrollTo(0,n)}})}Object.entries||(Object.entries=function(e){let t=[];for(let r of Object.keys(e))t.push([r,e[r]]);return t});Object.values||(Object.values=function(e){let t=[];for(let r of Object.keys(e))t.push(e[r]);return t});typeof Element!="undefined"&&(Element.prototype.scrollTo||(Element.prototype.scrollTo=function(e,t){typeof e=="object"?(this.scrollLeft=e.left,this.scrollTop=e.top):(this.scrollLeft=e,this.scrollTop=t)}),Element.prototype.replaceWith||(Element.prototype.replaceWith=function(...e){let t=this.parentNode;if(t){e.length===0&&t.removeChild(this);for(let r=e.length-1;r>=0;r--){let o=e[r];typeof o=="string"?o=document.createTextNode(o):o.parentNode&&o.parentNode.removeChild(o),r?t.insertBefore(this.previousSibling,o):t.replaceChild(o,this)}}}));function ys(){return location.protocol==="file:"?Tt(`${new URL("search/search_index.js",eo.base)}`).pipe(m(()=>__index),G(1)):je(new 
URL("search/search_index.json",eo.base))}document.documentElement.classList.remove("no-js");document.documentElement.classList.add("js");var ot=Go(),Ut=sn(),Lt=ln(Ut),to=an(),Oe=gn(),hr=Pt("(min-width: 960px)"),Mi=Pt("(min-width: 1220px)"),_i=mn(),eo=xe(),Ai=document.forms.namedItem("search")?ys():Ye,ro=new g;Zn({alert$:ro});var oo=new g;B("navigation.instant")&&oi({location$:Ut,viewport$:Oe,progress$:oo}).subscribe(ot);var Li;((Li=eo.version)==null?void 0:Li.provider)==="mike"&&ci({document$:ot});O(Ut,Lt).pipe(Ge(125)).subscribe(()=>{Je("drawer",!1),Je("search",!1)});to.pipe(b(({mode:e})=>e==="global")).subscribe(e=>{switch(e.type){case"p":case",":let t=fe("link[rel=prev]");typeof t!="undefined"&<(t);break;case"n":case".":let r=fe("link[rel=next]");typeof r!="undefined"&<(r);break;case"Enter":let o=Ie();o instanceof HTMLLabelElement&&o.click()}});wi({viewport$:Oe,document$:ot});Ti({document$:ot,tablet$:hr});Si({document$:ot});Oi({viewport$:Oe,tablet$:hr});var rt=Kn(Se("header"),{viewport$:Oe}),Ft=ot.pipe(m(()=>Se("main")),v(e=>Gn(e,{viewport$:Oe,header$:rt})),G(1)),xs=O(...ae("consent").map(e=>En(e,{target$:Lt})),...ae("dialog").map(e=>qn(e,{alert$:ro})),...ae("palette").map(e=>Jn(e)),...ae("progress").map(e=>Xn(e,{progress$:oo})),...ae("search").map(e=>ui(e,{index$:Ai,keyboard$:to})),...ae("source").map(e=>gi(e))),Es=C(()=>O(...ae("announce").map(e=>xn(e)),...ae("content").map(e=>zn(e,{viewport$:Oe,target$:Lt,print$:_i})),...ae("content").map(e=>B("search.highlight")?di(e,{index$:Ai,location$:Ut}):S),...ae("header").map(e=>Yn(e,{viewport$:Oe,header$:rt,main$:Ft})),...ae("header-title").map(e=>Bn(e,{viewport$:Oe,header$:rt})),...ae("sidebar").map(e=>e.getAttribute("data-md-type")==="navigation"?Nr(Mi,()=>Zr(e,{viewport$:Oe,header$:rt,main$:Ft})):Nr(hr,()=>Zr(e,{viewport$:Oe,header$:rt,main$:Ft}))),...ae("tabs").map(e=>yi(e,{viewport$:Oe,header$:rt})),...ae("toc").map(e=>xi(e,{viewport$:Oe,header$:rt,main$:Ft,target$:Lt})),...ae("top").map(e=>Ei(e,{viewport$:Oe,head
er$:rt,main$:Ft,target$:Lt})))),Ci=ot.pipe(v(()=>Es),Re(xs),G(1));Ci.subscribe();window.document$=ot;window.location$=Ut;window.target$=Lt;window.keyboard$=to;window.viewport$=Oe;window.tablet$=hr;window.screen$=Mi;window.print$=_i;window.alert$=ro;window.progress$=oo;window.component$=Ci;})(); +//# sourceMappingURL=bundle.88dd0f4e.min.js.map + diff --git a/assets/javascripts/bundle.88dd0f4e.min.js.map b/assets/javascripts/bundle.88dd0f4e.min.js.map new file mode 100644 index 00000000..dab2a875 --- /dev/null +++ b/assets/javascripts/bundle.88dd0f4e.min.js.map @@ -0,0 +1,7 @@ +{ + "version": 3, + "sources": ["node_modules/focus-visible/dist/focus-visible.js", "node_modules/escape-html/index.js", "node_modules/clipboard/dist/clipboard.js", "src/templates/assets/javascripts/bundle.ts", "node_modules/tslib/tslib.es6.mjs", "node_modules/rxjs/src/internal/util/isFunction.ts", "node_modules/rxjs/src/internal/util/createErrorClass.ts", "node_modules/rxjs/src/internal/util/UnsubscriptionError.ts", "node_modules/rxjs/src/internal/util/arrRemove.ts", "node_modules/rxjs/src/internal/Subscription.ts", "node_modules/rxjs/src/internal/config.ts", "node_modules/rxjs/src/internal/scheduler/timeoutProvider.ts", "node_modules/rxjs/src/internal/util/reportUnhandledError.ts", "node_modules/rxjs/src/internal/util/noop.ts", "node_modules/rxjs/src/internal/NotificationFactories.ts", "node_modules/rxjs/src/internal/util/errorContext.ts", "node_modules/rxjs/src/internal/Subscriber.ts", "node_modules/rxjs/src/internal/symbol/observable.ts", "node_modules/rxjs/src/internal/util/identity.ts", "node_modules/rxjs/src/internal/util/pipe.ts", "node_modules/rxjs/src/internal/Observable.ts", "node_modules/rxjs/src/internal/util/lift.ts", "node_modules/rxjs/src/internal/operators/OperatorSubscriber.ts", "node_modules/rxjs/src/internal/scheduler/animationFrameProvider.ts", "node_modules/rxjs/src/internal/util/ObjectUnsubscribedError.ts", "node_modules/rxjs/src/internal/Subject.ts", 
"node_modules/rxjs/src/internal/BehaviorSubject.ts", "node_modules/rxjs/src/internal/scheduler/dateTimestampProvider.ts", "node_modules/rxjs/src/internal/ReplaySubject.ts", "node_modules/rxjs/src/internal/scheduler/Action.ts", "node_modules/rxjs/src/internal/scheduler/intervalProvider.ts", "node_modules/rxjs/src/internal/scheduler/AsyncAction.ts", "node_modules/rxjs/src/internal/Scheduler.ts", "node_modules/rxjs/src/internal/scheduler/AsyncScheduler.ts", "node_modules/rxjs/src/internal/scheduler/async.ts", "node_modules/rxjs/src/internal/scheduler/QueueAction.ts", "node_modules/rxjs/src/internal/scheduler/QueueScheduler.ts", "node_modules/rxjs/src/internal/scheduler/queue.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameAction.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameScheduler.ts", "node_modules/rxjs/src/internal/scheduler/animationFrame.ts", "node_modules/rxjs/src/internal/observable/empty.ts", "node_modules/rxjs/src/internal/util/isScheduler.ts", "node_modules/rxjs/src/internal/util/args.ts", "node_modules/rxjs/src/internal/util/isArrayLike.ts", "node_modules/rxjs/src/internal/util/isPromise.ts", "node_modules/rxjs/src/internal/util/isInteropObservable.ts", "node_modules/rxjs/src/internal/util/isAsyncIterable.ts", "node_modules/rxjs/src/internal/util/throwUnobservableError.ts", "node_modules/rxjs/src/internal/symbol/iterator.ts", "node_modules/rxjs/src/internal/util/isIterable.ts", "node_modules/rxjs/src/internal/util/isReadableStreamLike.ts", "node_modules/rxjs/src/internal/observable/innerFrom.ts", "node_modules/rxjs/src/internal/util/executeSchedule.ts", "node_modules/rxjs/src/internal/operators/observeOn.ts", "node_modules/rxjs/src/internal/operators/subscribeOn.ts", "node_modules/rxjs/src/internal/scheduled/scheduleObservable.ts", "node_modules/rxjs/src/internal/scheduled/schedulePromise.ts", "node_modules/rxjs/src/internal/scheduled/scheduleArray.ts", "node_modules/rxjs/src/internal/scheduled/scheduleIterable.ts", 
"node_modules/rxjs/src/internal/scheduled/scheduleAsyncIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleReadableStreamLike.ts", "node_modules/rxjs/src/internal/scheduled/scheduled.ts", "node_modules/rxjs/src/internal/observable/from.ts", "node_modules/rxjs/src/internal/observable/of.ts", "node_modules/rxjs/src/internal/observable/throwError.ts", "node_modules/rxjs/src/internal/util/EmptyError.ts", "node_modules/rxjs/src/internal/util/isDate.ts", "node_modules/rxjs/src/internal/operators/map.ts", "node_modules/rxjs/src/internal/util/mapOneOrManyArgs.ts", "node_modules/rxjs/src/internal/util/argsArgArrayOrObject.ts", "node_modules/rxjs/src/internal/util/createObject.ts", "node_modules/rxjs/src/internal/observable/combineLatest.ts", "node_modules/rxjs/src/internal/operators/mergeInternals.ts", "node_modules/rxjs/src/internal/operators/mergeMap.ts", "node_modules/rxjs/src/internal/operators/mergeAll.ts", "node_modules/rxjs/src/internal/operators/concatAll.ts", "node_modules/rxjs/src/internal/observable/concat.ts", "node_modules/rxjs/src/internal/observable/defer.ts", "node_modules/rxjs/src/internal/observable/fromEvent.ts", "node_modules/rxjs/src/internal/observable/fromEventPattern.ts", "node_modules/rxjs/src/internal/observable/timer.ts", "node_modules/rxjs/src/internal/observable/merge.ts", "node_modules/rxjs/src/internal/observable/never.ts", "node_modules/rxjs/src/internal/util/argsOrArgArray.ts", "node_modules/rxjs/src/internal/operators/filter.ts", "node_modules/rxjs/src/internal/observable/zip.ts", "node_modules/rxjs/src/internal/operators/audit.ts", "node_modules/rxjs/src/internal/operators/auditTime.ts", "node_modules/rxjs/src/internal/operators/bufferCount.ts", "node_modules/rxjs/src/internal/operators/catchError.ts", "node_modules/rxjs/src/internal/operators/scanInternals.ts", "node_modules/rxjs/src/internal/operators/combineLatest.ts", "node_modules/rxjs/src/internal/operators/combineLatestWith.ts", 
"node_modules/rxjs/src/internal/operators/debounce.ts", "node_modules/rxjs/src/internal/operators/debounceTime.ts", "node_modules/rxjs/src/internal/operators/defaultIfEmpty.ts", "node_modules/rxjs/src/internal/operators/take.ts", "node_modules/rxjs/src/internal/operators/ignoreElements.ts", "node_modules/rxjs/src/internal/operators/mapTo.ts", "node_modules/rxjs/src/internal/operators/delayWhen.ts", "node_modules/rxjs/src/internal/operators/delay.ts", "node_modules/rxjs/src/internal/operators/distinctUntilChanged.ts", "node_modules/rxjs/src/internal/operators/distinctUntilKeyChanged.ts", "node_modules/rxjs/src/internal/operators/throwIfEmpty.ts", "node_modules/rxjs/src/internal/operators/endWith.ts", "node_modules/rxjs/src/internal/operators/finalize.ts", "node_modules/rxjs/src/internal/operators/first.ts", "node_modules/rxjs/src/internal/operators/takeLast.ts", "node_modules/rxjs/src/internal/operators/merge.ts", "node_modules/rxjs/src/internal/operators/mergeWith.ts", "node_modules/rxjs/src/internal/operators/repeat.ts", "node_modules/rxjs/src/internal/operators/scan.ts", "node_modules/rxjs/src/internal/operators/share.ts", "node_modules/rxjs/src/internal/operators/shareReplay.ts", "node_modules/rxjs/src/internal/operators/skip.ts", "node_modules/rxjs/src/internal/operators/skipUntil.ts", "node_modules/rxjs/src/internal/operators/startWith.ts", "node_modules/rxjs/src/internal/operators/switchMap.ts", "node_modules/rxjs/src/internal/operators/takeUntil.ts", "node_modules/rxjs/src/internal/operators/takeWhile.ts", "node_modules/rxjs/src/internal/operators/tap.ts", "node_modules/rxjs/src/internal/operators/throttle.ts", "node_modules/rxjs/src/internal/operators/throttleTime.ts", "node_modules/rxjs/src/internal/operators/withLatestFrom.ts", "node_modules/rxjs/src/internal/operators/zip.ts", "node_modules/rxjs/src/internal/operators/zipWith.ts", "src/templates/assets/javascripts/browser/document/index.ts", "src/templates/assets/javascripts/browser/element/_/index.ts", 
"src/templates/assets/javascripts/browser/element/focus/index.ts", "src/templates/assets/javascripts/browser/element/hover/index.ts", "src/templates/assets/javascripts/utilities/h/index.ts", "src/templates/assets/javascripts/utilities/round/index.ts", "src/templates/assets/javascripts/browser/script/index.ts", "src/templates/assets/javascripts/browser/element/size/_/index.ts", "src/templates/assets/javascripts/browser/element/size/content/index.ts", "src/templates/assets/javascripts/browser/element/offset/_/index.ts", "src/templates/assets/javascripts/browser/element/offset/content/index.ts", "src/templates/assets/javascripts/browser/element/visibility/index.ts", "src/templates/assets/javascripts/browser/toggle/index.ts", "src/templates/assets/javascripts/browser/keyboard/index.ts", "src/templates/assets/javascripts/browser/location/_/index.ts", "src/templates/assets/javascripts/browser/location/hash/index.ts", "src/templates/assets/javascripts/browser/media/index.ts", "src/templates/assets/javascripts/browser/request/index.ts", "src/templates/assets/javascripts/browser/viewport/offset/index.ts", "src/templates/assets/javascripts/browser/viewport/size/index.ts", "src/templates/assets/javascripts/browser/viewport/_/index.ts", "src/templates/assets/javascripts/browser/viewport/at/index.ts", "src/templates/assets/javascripts/browser/worker/index.ts", "src/templates/assets/javascripts/_/index.ts", "src/templates/assets/javascripts/components/_/index.ts", "src/templates/assets/javascripts/components/announce/index.ts", "src/templates/assets/javascripts/components/consent/index.ts", "src/templates/assets/javascripts/templates/tooltip/index.tsx", "src/templates/assets/javascripts/templates/annotation/index.tsx", "src/templates/assets/javascripts/templates/clipboard/index.tsx", "src/templates/assets/javascripts/templates/search/index.tsx", "src/templates/assets/javascripts/templates/source/index.tsx", "src/templates/assets/javascripts/templates/tabbed/index.tsx", 
"src/templates/assets/javascripts/templates/table/index.tsx", "src/templates/assets/javascripts/templates/version/index.tsx", "src/templates/assets/javascripts/components/tooltip2/index.ts", "src/templates/assets/javascripts/components/content/annotation/_/index.ts", "src/templates/assets/javascripts/components/content/annotation/list/index.ts", "src/templates/assets/javascripts/components/content/annotation/block/index.ts", "src/templates/assets/javascripts/components/content/code/_/index.ts", "src/templates/assets/javascripts/components/content/details/index.ts", "src/templates/assets/javascripts/components/content/mermaid/index.css", "src/templates/assets/javascripts/components/content/mermaid/index.ts", "src/templates/assets/javascripts/components/content/table/index.ts", "src/templates/assets/javascripts/components/content/tabs/index.ts", "src/templates/assets/javascripts/components/content/_/index.ts", "src/templates/assets/javascripts/components/dialog/index.ts", "src/templates/assets/javascripts/components/tooltip/index.ts", "src/templates/assets/javascripts/components/header/_/index.ts", "src/templates/assets/javascripts/components/header/title/index.ts", "src/templates/assets/javascripts/components/main/index.ts", "src/templates/assets/javascripts/components/palette/index.ts", "src/templates/assets/javascripts/components/progress/index.ts", "src/templates/assets/javascripts/integrations/clipboard/index.ts", "src/templates/assets/javascripts/integrations/sitemap/index.ts", "src/templates/assets/javascripts/integrations/instant/index.ts", "src/templates/assets/javascripts/integrations/search/highlighter/index.ts", "src/templates/assets/javascripts/integrations/search/worker/message/index.ts", "src/templates/assets/javascripts/integrations/search/worker/_/index.ts", "src/templates/assets/javascripts/integrations/version/findurl/index.ts", "src/templates/assets/javascripts/integrations/version/index.ts", 
"src/templates/assets/javascripts/components/search/query/index.ts", "src/templates/assets/javascripts/components/search/result/index.ts", "src/templates/assets/javascripts/components/search/share/index.ts", "src/templates/assets/javascripts/components/search/suggest/index.ts", "src/templates/assets/javascripts/components/search/_/index.ts", "src/templates/assets/javascripts/components/search/highlight/index.ts", "src/templates/assets/javascripts/components/sidebar/index.ts", "src/templates/assets/javascripts/components/source/facts/github/index.ts", "src/templates/assets/javascripts/components/source/facts/gitlab/index.ts", "src/templates/assets/javascripts/components/source/facts/_/index.ts", "src/templates/assets/javascripts/components/source/_/index.ts", "src/templates/assets/javascripts/components/tabs/index.ts", "src/templates/assets/javascripts/components/toc/index.ts", "src/templates/assets/javascripts/components/top/index.ts", "src/templates/assets/javascripts/patches/ellipsis/index.ts", "src/templates/assets/javascripts/patches/indeterminate/index.ts", "src/templates/assets/javascripts/patches/scrollfix/index.ts", "src/templates/assets/javascripts/patches/scrolllock/index.ts", "src/templates/assets/javascripts/polyfills/index.ts"], + "sourcesContent": ["(function (global, factory) {\n typeof exports === 'object' && typeof module !== 'undefined' ? factory() :\n typeof define === 'function' && define.amd ? 
define(factory) :\n (factory());\n}(this, (function () { 'use strict';\n\n /**\n * Applies the :focus-visible polyfill at the given scope.\n * A scope in this case is either the top-level Document or a Shadow Root.\n *\n * @param {(Document|ShadowRoot)} scope\n * @see https://github.com/WICG/focus-visible\n */\n function applyFocusVisiblePolyfill(scope) {\n var hadKeyboardEvent = true;\n var hadFocusVisibleRecently = false;\n var hadFocusVisibleRecentlyTimeout = null;\n\n var inputTypesAllowlist = {\n text: true,\n search: true,\n url: true,\n tel: true,\n email: true,\n password: true,\n number: true,\n date: true,\n month: true,\n week: true,\n time: true,\n datetime: true,\n 'datetime-local': true\n };\n\n /**\n * Helper function for legacy browsers and iframes which sometimes focus\n * elements like document, body, and non-interactive SVG.\n * @param {Element} el\n */\n function isValidFocusTarget(el) {\n if (\n el &&\n el !== document &&\n el.nodeName !== 'HTML' &&\n el.nodeName !== 'BODY' &&\n 'classList' in el &&\n 'contains' in el.classList\n ) {\n return true;\n }\n return false;\n }\n\n /**\n * Computes whether the given element should automatically trigger the\n * `focus-visible` class being added, i.e. 
whether it should always match\n * `:focus-visible` when focused.\n * @param {Element} el\n * @return {boolean}\n */\n function focusTriggersKeyboardModality(el) {\n var type = el.type;\n var tagName = el.tagName;\n\n if (tagName === 'INPUT' && inputTypesAllowlist[type] && !el.readOnly) {\n return true;\n }\n\n if (tagName === 'TEXTAREA' && !el.readOnly) {\n return true;\n }\n\n if (el.isContentEditable) {\n return true;\n }\n\n return false;\n }\n\n /**\n * Add the `focus-visible` class to the given element if it was not added by\n * the author.\n * @param {Element} el\n */\n function addFocusVisibleClass(el) {\n if (el.classList.contains('focus-visible')) {\n return;\n }\n el.classList.add('focus-visible');\n el.setAttribute('data-focus-visible-added', '');\n }\n\n /**\n * Remove the `focus-visible` class from the given element if it was not\n * originally added by the author.\n * @param {Element} el\n */\n function removeFocusVisibleClass(el) {\n if (!el.hasAttribute('data-focus-visible-added')) {\n return;\n }\n el.classList.remove('focus-visible');\n el.removeAttribute('data-focus-visible-added');\n }\n\n /**\n * If the most recent user interaction was via the keyboard;\n * and the key press did not include a meta, alt/option, or control key;\n * then the modality is keyboard. 
Otherwise, the modality is not keyboard.\n * Apply `focus-visible` to any current active element and keep track\n * of our keyboard modality state with `hadKeyboardEvent`.\n * @param {KeyboardEvent} e\n */\n function onKeyDown(e) {\n if (e.metaKey || e.altKey || e.ctrlKey) {\n return;\n }\n\n if (isValidFocusTarget(scope.activeElement)) {\n addFocusVisibleClass(scope.activeElement);\n }\n\n hadKeyboardEvent = true;\n }\n\n /**\n * If at any point a user clicks with a pointing device, ensure that we change\n * the modality away from keyboard.\n * This avoids the situation where a user presses a key on an already focused\n * element, and then clicks on a different element, focusing it with a\n * pointing device, while we still think we're in keyboard modality.\n * @param {Event} e\n */\n function onPointerDown(e) {\n hadKeyboardEvent = false;\n }\n\n /**\n * On `focus`, add the `focus-visible` class to the target if:\n * - the target received focus as a result of keyboard navigation, or\n * - the event target is an element that will likely require interaction\n * via the keyboard (e.g. 
a text box)\n * @param {Event} e\n */\n function onFocus(e) {\n // Prevent IE from focusing the document or HTML element.\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (hadKeyboardEvent || focusTriggersKeyboardModality(e.target)) {\n addFocusVisibleClass(e.target);\n }\n }\n\n /**\n * On `blur`, remove the `focus-visible` class from the target.\n * @param {Event} e\n */\n function onBlur(e) {\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (\n e.target.classList.contains('focus-visible') ||\n e.target.hasAttribute('data-focus-visible-added')\n ) {\n // To detect a tab/window switch, we look for a blur event followed\n // rapidly by a visibility change.\n // If we don't see a visibility change within 100ms, it's probably a\n // regular focus change.\n hadFocusVisibleRecently = true;\n window.clearTimeout(hadFocusVisibleRecentlyTimeout);\n hadFocusVisibleRecentlyTimeout = window.setTimeout(function() {\n hadFocusVisibleRecently = false;\n }, 100);\n removeFocusVisibleClass(e.target);\n }\n }\n\n /**\n * If the user changes tabs, keep track of whether or not the previously\n * focused element had .focus-visible.\n * @param {Event} e\n */\n function onVisibilityChange(e) {\n if (document.visibilityState === 'hidden') {\n // If the tab becomes active again, the browser will handle calling focus\n // on the element (Safari actually calls it twice).\n // If this tab change caused a blur on an element with focus-visible,\n // re-apply the class when the user switches back to the tab.\n if (hadFocusVisibleRecently) {\n hadKeyboardEvent = true;\n }\n addInitialPointerMoveListeners();\n }\n }\n\n /**\n * Add a group of listeners to detect usage of any pointing devices.\n * These listeners will be added when the polyfill first loads, and anytime\n * the window is blurred, so that they are active when the window regains\n * focus.\n */\n function addInitialPointerMoveListeners() {\n document.addEventListener('mousemove', onInitialPointerMove);\n 
document.addEventListener('mousedown', onInitialPointerMove);\n document.addEventListener('mouseup', onInitialPointerMove);\n document.addEventListener('pointermove', onInitialPointerMove);\n document.addEventListener('pointerdown', onInitialPointerMove);\n document.addEventListener('pointerup', onInitialPointerMove);\n document.addEventListener('touchmove', onInitialPointerMove);\n document.addEventListener('touchstart', onInitialPointerMove);\n document.addEventListener('touchend', onInitialPointerMove);\n }\n\n function removeInitialPointerMoveListeners() {\n document.removeEventListener('mousemove', onInitialPointerMove);\n document.removeEventListener('mousedown', onInitialPointerMove);\n document.removeEventListener('mouseup', onInitialPointerMove);\n document.removeEventListener('pointermove', onInitialPointerMove);\n document.removeEventListener('pointerdown', onInitialPointerMove);\n document.removeEventListener('pointerup', onInitialPointerMove);\n document.removeEventListener('touchmove', onInitialPointerMove);\n document.removeEventListener('touchstart', onInitialPointerMove);\n document.removeEventListener('touchend', onInitialPointerMove);\n }\n\n /**\n * When the polfyill first loads, assume the user is in keyboard modality.\n * If any event is received from a pointing device (e.g. mouse, pointer,\n * touch), turn off keyboard modality.\n * This accounts for situations where focus enters the page from the URL bar.\n * @param {Event} e\n */\n function onInitialPointerMove(e) {\n // Work around a Safari quirk that fires a mousemove on whenever the\n // window blurs, even if you're tabbing out of the page. \u00AF\\_(\u30C4)_/\u00AF\n if (e.target.nodeName && e.target.nodeName.toLowerCase() === 'html') {\n return;\n }\n\n hadKeyboardEvent = false;\n removeInitialPointerMoveListeners();\n }\n\n // For some kinds of state, we are interested in changes at the global scope\n // only. 
For example, global pointer input, global key presses and global\n // visibility change should affect the state at every scope:\n document.addEventListener('keydown', onKeyDown, true);\n document.addEventListener('mousedown', onPointerDown, true);\n document.addEventListener('pointerdown', onPointerDown, true);\n document.addEventListener('touchstart', onPointerDown, true);\n document.addEventListener('visibilitychange', onVisibilityChange, true);\n\n addInitialPointerMoveListeners();\n\n // For focus and blur, we specifically care about state changes in the local\n // scope. This is because focus / blur events that originate from within a\n // shadow root are not re-dispatched from the host element if it was already\n // the active element in its own scope:\n scope.addEventListener('focus', onFocus, true);\n scope.addEventListener('blur', onBlur, true);\n\n // We detect that a node is a ShadowRoot by ensuring that it is a\n // DocumentFragment and also has a host property. This check covers native\n // implementation and polyfill implementation transparently. If we only cared\n // about the native implementation, we could just check if the scope was\n // an instance of a ShadowRoot.\n if (scope.nodeType === Node.DOCUMENT_FRAGMENT_NODE && scope.host) {\n // Since a ShadowRoot is a special kind of DocumentFragment, it does not\n // have a root element to add a class to. 
So, we add this attribute to the\n // host element instead:\n scope.host.setAttribute('data-js-focus-visible', '');\n } else if (scope.nodeType === Node.DOCUMENT_NODE) {\n document.documentElement.classList.add('js-focus-visible');\n document.documentElement.setAttribute('data-js-focus-visible', '');\n }\n }\n\n // It is important to wrap all references to global window and document in\n // these checks to support server-side rendering use cases\n // @see https://github.com/WICG/focus-visible/issues/199\n if (typeof window !== 'undefined' && typeof document !== 'undefined') {\n // Make the polyfill helper globally available. This can be used as a signal\n // to interested libraries that wish to coordinate with the polyfill for e.g.,\n // applying the polyfill to a shadow root:\n window.applyFocusVisiblePolyfill = applyFocusVisiblePolyfill;\n\n // Notify interested libraries of the polyfill's presence, in case the\n // polyfill was loaded lazily:\n var event;\n\n try {\n event = new CustomEvent('focus-visible-polyfill-ready');\n } catch (error) {\n // IE11 does not support using CustomEvent as a constructor directly:\n event = document.createEvent('CustomEvent');\n event.initCustomEvent('focus-visible-polyfill-ready', false, false, {});\n }\n\n window.dispatchEvent(event);\n }\n\n if (typeof document !== 'undefined') {\n // Apply the polyfill to the global document, so that no JavaScript\n // coordination is required to use the polyfill in the top-level document:\n applyFocusVisiblePolyfill(document);\n }\n\n})));\n", "/*!\n * escape-html\n * Copyright(c) 2012-2013 TJ Holowaychuk\n * Copyright(c) 2015 Andreas Lubbe\n * Copyright(c) 2015 Tiancheng \"Timothy\" Gu\n * MIT Licensed\n */\n\n'use strict';\n\n/**\n * Module variables.\n * @private\n */\n\nvar matchHtmlRegExp = /[\"'&<>]/;\n\n/**\n * Module exports.\n * @public\n */\n\nmodule.exports = escapeHtml;\n\n/**\n * Escape special characters in the given string of html.\n *\n * @param {string} string The string to 
escape for inserting into HTML\n * @return {string}\n * @public\n */\n\nfunction escapeHtml(string) {\n var str = '' + string;\n var match = matchHtmlRegExp.exec(str);\n\n if (!match) {\n return str;\n }\n\n var escape;\n var html = '';\n var index = 0;\n var lastIndex = 0;\n\n for (index = match.index; index < str.length; index++) {\n switch (str.charCodeAt(index)) {\n case 34: // \"\n escape = '"';\n break;\n case 38: // &\n escape = '&';\n break;\n case 39: // '\n escape = ''';\n break;\n case 60: // <\n escape = '<';\n break;\n case 62: // >\n escape = '>';\n break;\n default:\n continue;\n }\n\n if (lastIndex !== index) {\n html += str.substring(lastIndex, index);\n }\n\n lastIndex = index + 1;\n html += escape;\n }\n\n return lastIndex !== index\n ? html + str.substring(lastIndex, index)\n : html;\n}\n", "/*!\n * clipboard.js v2.0.11\n * https://clipboardjs.com/\n *\n * Licensed MIT \u00A9 Zeno Rocha\n */\n(function webpackUniversalModuleDefinition(root, factory) {\n\tif(typeof exports === 'object' && typeof module === 'object')\n\t\tmodule.exports = factory();\n\telse if(typeof define === 'function' && define.amd)\n\t\tdefine([], factory);\n\telse if(typeof exports === 'object')\n\t\texports[\"ClipboardJS\"] = factory();\n\telse\n\t\troot[\"ClipboardJS\"] = factory();\n})(this, function() {\nreturn /******/ (function() { // webpackBootstrap\n/******/ \tvar __webpack_modules__ = ({\n\n/***/ 686:\n/***/ (function(__unused_webpack_module, __webpack_exports__, __webpack_require__) {\n\n\"use strict\";\n\n// EXPORTS\n__webpack_require__.d(__webpack_exports__, {\n \"default\": function() { return /* binding */ clipboard; }\n});\n\n// EXTERNAL MODULE: ./node_modules/tiny-emitter/index.js\nvar tiny_emitter = __webpack_require__(279);\nvar tiny_emitter_default = /*#__PURE__*/__webpack_require__.n(tiny_emitter);\n// EXTERNAL MODULE: ./node_modules/good-listener/src/listen.js\nvar listen = __webpack_require__(370);\nvar listen_default = 
/*#__PURE__*/__webpack_require__.n(listen);\n// EXTERNAL MODULE: ./node_modules/select/src/select.js\nvar src_select = __webpack_require__(817);\nvar select_default = /*#__PURE__*/__webpack_require__.n(src_select);\n;// CONCATENATED MODULE: ./src/common/command.js\n/**\n * Executes a given operation type.\n * @param {String} type\n * @return {Boolean}\n */\nfunction command(type) {\n try {\n return document.execCommand(type);\n } catch (err) {\n return false;\n }\n}\n;// CONCATENATED MODULE: ./src/actions/cut.js\n\n\n/**\n * Cut action wrapper.\n * @param {String|HTMLElement} target\n * @return {String}\n */\n\nvar ClipboardActionCut = function ClipboardActionCut(target) {\n var selectedText = select_default()(target);\n command('cut');\n return selectedText;\n};\n\n/* harmony default export */ var actions_cut = (ClipboardActionCut);\n;// CONCATENATED MODULE: ./src/common/create-fake-element.js\n/**\n * Creates a fake textarea element with a value.\n * @param {String} value\n * @return {HTMLElement}\n */\nfunction createFakeElement(value) {\n var isRTL = document.documentElement.getAttribute('dir') === 'rtl';\n var fakeElement = document.createElement('textarea'); // Prevent zooming on iOS\n\n fakeElement.style.fontSize = '12pt'; // Reset box model\n\n fakeElement.style.border = '0';\n fakeElement.style.padding = '0';\n fakeElement.style.margin = '0'; // Move element out of screen horizontally\n\n fakeElement.style.position = 'absolute';\n fakeElement.style[isRTL ? 
'right' : 'left'] = '-9999px'; // Move element to the same position vertically\n\n var yPosition = window.pageYOffset || document.documentElement.scrollTop;\n fakeElement.style.top = \"\".concat(yPosition, \"px\");\n fakeElement.setAttribute('readonly', '');\n fakeElement.value = value;\n return fakeElement;\n}\n;// CONCATENATED MODULE: ./src/actions/copy.js\n\n\n\n/**\n * Create fake copy action wrapper using a fake element.\n * @param {String} target\n * @param {Object} options\n * @return {String}\n */\n\nvar fakeCopyAction = function fakeCopyAction(value, options) {\n var fakeElement = createFakeElement(value);\n options.container.appendChild(fakeElement);\n var selectedText = select_default()(fakeElement);\n command('copy');\n fakeElement.remove();\n return selectedText;\n};\n/**\n * Copy action wrapper.\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @return {String}\n */\n\n\nvar ClipboardActionCopy = function ClipboardActionCopy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n var selectedText = '';\n\n if (typeof target === 'string') {\n selectedText = fakeCopyAction(target, options);\n } else if (target instanceof HTMLInputElement && !['text', 'search', 'url', 'tel', 'password'].includes(target === null || target === void 0 ? void 0 : target.type)) {\n // If input type doesn't support `setSelectionRange`. Simulate it. 
https://developer.mozilla.org/en-US/docs/Web/API/HTMLInputElement/setSelectionRange\n selectedText = fakeCopyAction(target.value, options);\n } else {\n selectedText = select_default()(target);\n command('copy');\n }\n\n return selectedText;\n};\n\n/* harmony default export */ var actions_copy = (ClipboardActionCopy);\n;// CONCATENATED MODULE: ./src/actions/default.js\nfunction _typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? \"symbol\" : typeof obj; }; } return _typeof(obj); }\n\n\n\n/**\n * Inner function which performs selection from either `text` or `target`\n * properties and then executes copy or cut operations.\n * @param {Object} options\n */\n\nvar ClipboardActionDefault = function ClipboardActionDefault() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};\n // Defines base properties passed from constructor.\n var _options$action = options.action,\n action = _options$action === void 0 ? 'copy' : _options$action,\n container = options.container,\n target = options.target,\n text = options.text; // Sets the `action` to be performed which can be either 'copy' or 'cut'.\n\n if (action !== 'copy' && action !== 'cut') {\n throw new Error('Invalid \"action\" value, use either \"copy\" or \"cut\"');\n } // Sets the `target` property using an element that will be have its content copied.\n\n\n if (target !== undefined) {\n if (target && _typeof(target) === 'object' && target.nodeType === 1) {\n if (action === 'copy' && target.hasAttribute('disabled')) {\n throw new Error('Invalid \"target\" attribute. 
Please use \"readonly\" instead of \"disabled\" attribute');\n }\n\n if (action === 'cut' && (target.hasAttribute('readonly') || target.hasAttribute('disabled'))) {\n throw new Error('Invalid \"target\" attribute. You can\\'t cut text from elements with \"readonly\" or \"disabled\" attributes');\n }\n } else {\n throw new Error('Invalid \"target\" value, use a valid Element');\n }\n } // Define selection strategy based on `text` property.\n\n\n if (text) {\n return actions_copy(text, {\n container: container\n });\n } // Defines which selection strategy based on `target` property.\n\n\n if (target) {\n return action === 'cut' ? actions_cut(target) : actions_copy(target, {\n container: container\n });\n }\n};\n\n/* harmony default export */ var actions_default = (ClipboardActionDefault);\n;// CONCATENATED MODULE: ./src/clipboard.js\nfunction clipboard_typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { clipboard_typeof = function _typeof(obj) { return typeof obj; }; } else { clipboard_typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? 
\"symbol\" : typeof obj; }; } return clipboard_typeof(obj); }\n\nfunction _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError(\"Cannot call a class as a function\"); } }\n\nfunction _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if (\"value\" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } }\n\nfunction _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; }\n\nfunction _inherits(subClass, superClass) { if (typeof superClass !== \"function\" && superClass !== null) { throw new TypeError(\"Super expression must either be null or a function\"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); if (superClass) _setPrototypeOf(subClass, superClass); }\n\nfunction _setPrototypeOf(o, p) { _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) { o.__proto__ = p; return o; }; return _setPrototypeOf(o, p); }\n\nfunction _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; }\n\nfunction _possibleConstructorReturn(self, call) { if (call && (clipboard_typeof(call) === \"object\" || typeof call === \"function\")) { return call; } return _assertThisInitialized(self); }\n\nfunction _assertThisInitialized(self) { if 
(self === void 0) { throw new ReferenceError(\"this hasn't been initialised - super() hasn't been called\"); } return self; }\n\nfunction _isNativeReflectConstruct() { if (typeof Reflect === \"undefined\" || !Reflect.construct) return false; if (Reflect.construct.sham) return false; if (typeof Proxy === \"function\") return true; try { Date.prototype.toString.call(Reflect.construct(Date, [], function () {})); return true; } catch (e) { return false; } }\n\nfunction _getPrototypeOf(o) { _getPrototypeOf = Object.setPrototypeOf ? Object.getPrototypeOf : function _getPrototypeOf(o) { return o.__proto__ || Object.getPrototypeOf(o); }; return _getPrototypeOf(o); }\n\n\n\n\n\n\n/**\n * Helper function to retrieve attribute value.\n * @param {String} suffix\n * @param {Element} element\n */\n\nfunction getAttributeValue(suffix, element) {\n var attribute = \"data-clipboard-\".concat(suffix);\n\n if (!element.hasAttribute(attribute)) {\n return;\n }\n\n return element.getAttribute(attribute);\n}\n/**\n * Base class which takes one or more elements, adds event listeners to them,\n * and instantiates a new `ClipboardAction` on each click.\n */\n\n\nvar Clipboard = /*#__PURE__*/function (_Emitter) {\n _inherits(Clipboard, _Emitter);\n\n var _super = _createSuper(Clipboard);\n\n /**\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n * @param {Object} options\n */\n function Clipboard(trigger, options) {\n var _this;\n\n _classCallCheck(this, Clipboard);\n\n _this = _super.call(this);\n\n _this.resolveOptions(options);\n\n _this.listenClick(trigger);\n\n return _this;\n }\n /**\n * Defines if attributes would be resolved using internal setter functions\n * or custom functions that were passed in the constructor.\n * @param {Object} options\n */\n\n\n _createClass(Clipboard, [{\n key: \"resolveOptions\",\n value: function resolveOptions() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : {};\n this.action = typeof options.action === 'function' ? options.action : this.defaultAction;\n this.target = typeof options.target === 'function' ? options.target : this.defaultTarget;\n this.text = typeof options.text === 'function' ? options.text : this.defaultText;\n this.container = clipboard_typeof(options.container) === 'object' ? options.container : document.body;\n }\n /**\n * Adds a click event listener to the passed trigger.\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n */\n\n }, {\n key: \"listenClick\",\n value: function listenClick(trigger) {\n var _this2 = this;\n\n this.listener = listen_default()(trigger, 'click', function (e) {\n return _this2.onClick(e);\n });\n }\n /**\n * Defines a new `ClipboardAction` on each click event.\n * @param {Event} e\n */\n\n }, {\n key: \"onClick\",\n value: function onClick(e) {\n var trigger = e.delegateTarget || e.currentTarget;\n var action = this.action(trigger) || 'copy';\n var text = actions_default({\n action: action,\n container: this.container,\n target: this.target(trigger),\n text: this.text(trigger)\n }); // Fires an event based on the copy operation result.\n\n this.emit(text ? 
'success' : 'error', {\n action: action,\n text: text,\n trigger: trigger,\n clearSelection: function clearSelection() {\n if (trigger) {\n trigger.focus();\n }\n\n window.getSelection().removeAllRanges();\n }\n });\n }\n /**\n * Default `action` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultAction\",\n value: function defaultAction(trigger) {\n return getAttributeValue('action', trigger);\n }\n /**\n * Default `target` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultTarget\",\n value: function defaultTarget(trigger) {\n var selector = getAttributeValue('target', trigger);\n\n if (selector) {\n return document.querySelector(selector);\n }\n }\n /**\n * Allow fire programmatically a copy action\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @returns Text copied.\n */\n\n }, {\n key: \"defaultText\",\n\n /**\n * Default `text` lookup function.\n * @param {Element} trigger\n */\n value: function defaultText(trigger) {\n return getAttributeValue('text', trigger);\n }\n /**\n * Destroy lifecycle.\n */\n\n }, {\n key: \"destroy\",\n value: function destroy() {\n this.listener.destroy();\n }\n }], [{\n key: \"copy\",\n value: function copy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n return actions_copy(target, options);\n }\n /**\n * Allow fire programmatically a cut action\n * @param {String|HTMLElement} target\n * @returns Text cutted.\n */\n\n }, {\n key: \"cut\",\n value: function cut(target) {\n return actions_cut(target);\n }\n /**\n * Returns the support of the given action, or all actions if no action is\n * given.\n * @param {String} [action]\n */\n\n }, {\n key: \"isSupported\",\n value: function isSupported() {\n var action = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ['copy', 'cut'];\n var actions = typeof action === 'string' ? 
[action] : action;\n var support = !!document.queryCommandSupported;\n actions.forEach(function (action) {\n support = support && !!document.queryCommandSupported(action);\n });\n return support;\n }\n }]);\n\n return Clipboard;\n}((tiny_emitter_default()));\n\n/* harmony default export */ var clipboard = (Clipboard);\n\n/***/ }),\n\n/***/ 828:\n/***/ (function(module) {\n\nvar DOCUMENT_NODE_TYPE = 9;\n\n/**\n * A polyfill for Element.matches()\n */\nif (typeof Element !== 'undefined' && !Element.prototype.matches) {\n var proto = Element.prototype;\n\n proto.matches = proto.matchesSelector ||\n proto.mozMatchesSelector ||\n proto.msMatchesSelector ||\n proto.oMatchesSelector ||\n proto.webkitMatchesSelector;\n}\n\n/**\n * Finds the closest parent that matches a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @return {Function}\n */\nfunction closest (element, selector) {\n while (element && element.nodeType !== DOCUMENT_NODE_TYPE) {\n if (typeof element.matches === 'function' &&\n element.matches(selector)) {\n return element;\n }\n element = element.parentNode;\n }\n}\n\nmodule.exports = closest;\n\n\n/***/ }),\n\n/***/ 438:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar closest = __webpack_require__(828);\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction _delegate(element, selector, type, callback, useCapture) {\n var listenerFn = listener.apply(this, arguments);\n\n element.addEventListener(type, listenerFn, useCapture);\n\n return {\n destroy: function() {\n element.removeEventListener(type, listenerFn, useCapture);\n }\n }\n}\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element|String|Array} [elements]\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} 
useCapture\n * @return {Object}\n */\nfunction delegate(elements, selector, type, callback, useCapture) {\n // Handle the regular Element usage\n if (typeof elements.addEventListener === 'function') {\n return _delegate.apply(null, arguments);\n }\n\n // Handle Element-less usage, it defaults to global delegation\n if (typeof type === 'function') {\n // Use `document` as the first parameter, then apply arguments\n // This is a short way to .unshift `arguments` without running into deoptimizations\n return _delegate.bind(null, document).apply(null, arguments);\n }\n\n // Handle Selector-based usage\n if (typeof elements === 'string') {\n elements = document.querySelectorAll(elements);\n }\n\n // Handle Array-like based usage\n return Array.prototype.map.call(elements, function (element) {\n return _delegate(element, selector, type, callback, useCapture);\n });\n}\n\n/**\n * Finds closest match and invokes callback.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Function}\n */\nfunction listener(element, selector, type, callback) {\n return function(e) {\n e.delegateTarget = closest(e.target, selector);\n\n if (e.delegateTarget) {\n callback.call(element, e);\n }\n }\n}\n\nmodule.exports = delegate;\n\n\n/***/ }),\n\n/***/ 879:\n/***/ (function(__unused_webpack_module, exports) {\n\n/**\n * Check if argument is a HTML element.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.node = function(value) {\n return value !== undefined\n && value instanceof HTMLElement\n && value.nodeType === 1;\n};\n\n/**\n * Check if argument is a list of HTML elements.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.nodeList = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return value !== undefined\n && (type === '[object NodeList]' || type === '[object HTMLCollection]')\n && ('length' in value)\n && (value.length === 0 || 
exports.node(value[0]));\n};\n\n/**\n * Check if argument is a string.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.string = function(value) {\n return typeof value === 'string'\n || value instanceof String;\n};\n\n/**\n * Check if argument is a function.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.fn = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return type === '[object Function]';\n};\n\n\n/***/ }),\n\n/***/ 370:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar is = __webpack_require__(879);\nvar delegate = __webpack_require__(438);\n\n/**\n * Validates all params and calls the right\n * listener function based on its target type.\n *\n * @param {String|HTMLElement|HTMLCollection|NodeList} target\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listen(target, type, callback) {\n if (!target && !type && !callback) {\n throw new Error('Missing required arguments');\n }\n\n if (!is.string(type)) {\n throw new TypeError('Second argument must be a String');\n }\n\n if (!is.fn(callback)) {\n throw new TypeError('Third argument must be a Function');\n }\n\n if (is.node(target)) {\n return listenNode(target, type, callback);\n }\n else if (is.nodeList(target)) {\n return listenNodeList(target, type, callback);\n }\n else if (is.string(target)) {\n return listenSelector(target, type, callback);\n }\n else {\n throw new TypeError('First argument must be a String, HTMLElement, HTMLCollection, or NodeList');\n }\n}\n\n/**\n * Adds an event listener to a HTML element\n * and returns a remove listener function.\n *\n * @param {HTMLElement} node\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNode(node, type, callback) {\n node.addEventListener(type, callback);\n\n return {\n destroy: function() {\n node.removeEventListener(type, callback);\n }\n }\n}\n\n/**\n * Add an event listener 
to a list of HTML elements\n * and returns a remove listener function.\n *\n * @param {NodeList|HTMLCollection} nodeList\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNodeList(nodeList, type, callback) {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.addEventListener(type, callback);\n });\n\n return {\n destroy: function() {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.removeEventListener(type, callback);\n });\n }\n }\n}\n\n/**\n * Add an event listener to a selector\n * and returns a remove listener function.\n *\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenSelector(selector, type, callback) {\n return delegate(document.body, selector, type, callback);\n}\n\nmodule.exports = listen;\n\n\n/***/ }),\n\n/***/ 817:\n/***/ (function(module) {\n\nfunction select(element) {\n var selectedText;\n\n if (element.nodeName === 'SELECT') {\n element.focus();\n\n selectedText = element.value;\n }\n else if (element.nodeName === 'INPUT' || element.nodeName === 'TEXTAREA') {\n var isReadOnly = element.hasAttribute('readonly');\n\n if (!isReadOnly) {\n element.setAttribute('readonly', '');\n }\n\n element.select();\n element.setSelectionRange(0, element.value.length);\n\n if (!isReadOnly) {\n element.removeAttribute('readonly');\n }\n\n selectedText = element.value;\n }\n else {\n if (element.hasAttribute('contenteditable')) {\n element.focus();\n }\n\n var selection = window.getSelection();\n var range = document.createRange();\n\n range.selectNodeContents(element);\n selection.removeAllRanges();\n selection.addRange(range);\n\n selectedText = selection.toString();\n }\n\n return selectedText;\n}\n\nmodule.exports = select;\n\n\n/***/ }),\n\n/***/ 279:\n/***/ (function(module) {\n\nfunction E () {\n // Keep this empty so it's easier to inherit from\n // (via https://github.com/lipsmack from 
https://github.com/scottcorgan/tiny-emitter/issues/3)\n}\n\nE.prototype = {\n on: function (name, callback, ctx) {\n var e = this.e || (this.e = {});\n\n (e[name] || (e[name] = [])).push({\n fn: callback,\n ctx: ctx\n });\n\n return this;\n },\n\n once: function (name, callback, ctx) {\n var self = this;\n function listener () {\n self.off(name, listener);\n callback.apply(ctx, arguments);\n };\n\n listener._ = callback\n return this.on(name, listener, ctx);\n },\n\n emit: function (name) {\n var data = [].slice.call(arguments, 1);\n var evtArr = ((this.e || (this.e = {}))[name] || []).slice();\n var i = 0;\n var len = evtArr.length;\n\n for (i; i < len; i++) {\n evtArr[i].fn.apply(evtArr[i].ctx, data);\n }\n\n return this;\n },\n\n off: function (name, callback) {\n var e = this.e || (this.e = {});\n var evts = e[name];\n var liveEvents = [];\n\n if (evts && callback) {\n for (var i = 0, len = evts.length; i < len; i++) {\n if (evts[i].fn !== callback && evts[i].fn._ !== callback)\n liveEvents.push(evts[i]);\n }\n }\n\n // Remove event from queue to prevent memory leak\n // Suggested by https://github.com/lazd\n // Ref: https://github.com/scottcorgan/tiny-emitter/commit/c6ebfaa9bc973b33d110a84a307742b7cf94c953#commitcomment-5024910\n\n (liveEvents.length)\n ? 
e[name] = liveEvents\n : delete e[name];\n\n return this;\n }\n};\n\nmodule.exports = E;\nmodule.exports.TinyEmitter = E;\n\n\n/***/ })\n\n/******/ \t});\n/************************************************************************/\n/******/ \t// The module cache\n/******/ \tvar __webpack_module_cache__ = {};\n/******/ \t\n/******/ \t// The require function\n/******/ \tfunction __webpack_require__(moduleId) {\n/******/ \t\t// Check if module is in cache\n/******/ \t\tif(__webpack_module_cache__[moduleId]) {\n/******/ \t\t\treturn __webpack_module_cache__[moduleId].exports;\n/******/ \t\t}\n/******/ \t\t// Create a new module (and put it into the cache)\n/******/ \t\tvar module = __webpack_module_cache__[moduleId] = {\n/******/ \t\t\t// no module.id needed\n/******/ \t\t\t// no module.loaded needed\n/******/ \t\t\texports: {}\n/******/ \t\t};\n/******/ \t\n/******/ \t\t// Execute the module function\n/******/ \t\t__webpack_modules__[moduleId](module, module.exports, __webpack_require__);\n/******/ \t\n/******/ \t\t// Return the exports of the module\n/******/ \t\treturn module.exports;\n/******/ \t}\n/******/ \t\n/************************************************************************/\n/******/ \t/* webpack/runtime/compat get default export */\n/******/ \t!function() {\n/******/ \t\t// getDefaultExport function for compatibility with non-harmony modules\n/******/ \t\t__webpack_require__.n = function(module) {\n/******/ \t\t\tvar getter = module && module.__esModule ?\n/******/ \t\t\t\tfunction() { return module['default']; } :\n/******/ \t\t\t\tfunction() { return module; };\n/******/ \t\t\t__webpack_require__.d(getter, { a: getter });\n/******/ \t\t\treturn getter;\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/define property getters */\n/******/ \t!function() {\n/******/ \t\t// define getter functions for harmony exports\n/******/ \t\t__webpack_require__.d = function(exports, definition) {\n/******/ \t\t\tfor(var key in definition) 
{\n/******/ \t\t\t\tif(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n/******/ \t\t\t\t\tObject.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n/******/ \t\t\t\t}\n/******/ \t\t\t}\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/hasOwnProperty shorthand */\n/******/ \t!function() {\n/******/ \t\t__webpack_require__.o = function(obj, prop) { return Object.prototype.hasOwnProperty.call(obj, prop); }\n/******/ \t}();\n/******/ \t\n/************************************************************************/\n/******/ \t// module exports must be returned from runtime so entry inlining is disabled\n/******/ \t// startup\n/******/ \t// Load entry module and return exports\n/******/ \treturn __webpack_require__(686);\n/******/ })()\n.default;\n});", "/*\n * Copyright (c) 2016-2024 Martin Donath \n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport \"focus-visible\"\n\nimport {\n EMPTY,\n NEVER,\n Observable,\n Subject,\n defer,\n delay,\n filter,\n map,\n merge,\n mergeWith,\n shareReplay,\n switchMap\n} from \"rxjs\"\n\nimport { configuration, feature } from \"./_\"\nimport {\n at,\n getActiveElement,\n getOptionalElement,\n requestJSON,\n setLocation,\n setToggle,\n watchDocument,\n watchKeyboard,\n watchLocation,\n watchLocationTarget,\n watchMedia,\n watchPrint,\n watchScript,\n watchViewport\n} from \"./browser\"\nimport {\n getComponentElement,\n getComponentElements,\n mountAnnounce,\n mountBackToTop,\n mountConsent,\n mountContent,\n mountDialog,\n mountHeader,\n mountHeaderTitle,\n mountPalette,\n mountProgress,\n mountSearch,\n mountSearchHiglight,\n mountSidebar,\n mountSource,\n mountTableOfContents,\n mountTabs,\n watchHeader,\n watchMain\n} from \"./components\"\nimport {\n SearchIndex,\n setupClipboardJS,\n setupInstantNavigation,\n setupVersionSelector\n} from \"./integrations\"\nimport {\n patchEllipsis,\n patchIndeterminate,\n patchScrollfix,\n patchScrolllock\n} from \"./patches\"\nimport \"./polyfills\"\n\n/* ----------------------------------------------------------------------------\n * Functions - @todo refactor\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch search index\n *\n * @returns Search index observable\n */\nfunction fetchSearchIndex(): Observable {\n if (location.protocol === \"file:\") {\n return watchScript(\n `${new URL(\"search/search_index.js\", config.base)}`\n )\n .pipe(\n // @ts-ignore - @todo fix typings\n map(() => __index),\n shareReplay(1)\n )\n } else {\n return requestJSON(\n new URL(\"search/search_index.json\", config.base)\n 
)\n }\n}\n\n/* ----------------------------------------------------------------------------\n * Application\n * ------------------------------------------------------------------------- */\n\n/* Yay, JavaScript is available */\ndocument.documentElement.classList.remove(\"no-js\")\ndocument.documentElement.classList.add(\"js\")\n\n/* Set up navigation observables and subjects */\nconst document$ = watchDocument()\nconst location$ = watchLocation()\nconst target$ = watchLocationTarget(location$)\nconst keyboard$ = watchKeyboard()\n\n/* Set up media observables */\nconst viewport$ = watchViewport()\nconst tablet$ = watchMedia(\"(min-width: 960px)\")\nconst screen$ = watchMedia(\"(min-width: 1220px)\")\nconst print$ = watchPrint()\n\n/* Retrieve search index, if search is enabled */\nconst config = configuration()\nconst index$ = document.forms.namedItem(\"search\")\n ? fetchSearchIndex()\n : NEVER\n\n/* Set up Clipboard.js integration */\nconst alert$ = new Subject()\nsetupClipboardJS({ alert$ })\n\n/* Set up progress indicator */\nconst progress$ = new Subject()\n\n/* Set up instant navigation, if enabled */\nif (feature(\"navigation.instant\"))\n setupInstantNavigation({ location$, viewport$, progress$ })\n .subscribe(document$)\n\n/* Set up version selector */\nif (config.version?.provider === \"mike\")\n setupVersionSelector({ document$ })\n\n/* Always close drawer and search on navigation */\nmerge(location$, target$)\n .pipe(\n delay(125)\n )\n .subscribe(() => {\n setToggle(\"drawer\", false)\n setToggle(\"search\", false)\n })\n\n/* Set up global keyboard handlers */\nkeyboard$\n .pipe(\n filter(({ mode }) => mode === \"global\")\n )\n .subscribe(key => {\n switch (key.type) {\n\n /* Go to previous page */\n case \"p\":\n case \",\":\n const prev = getOptionalElement(\"link[rel=prev]\")\n if (typeof prev !== \"undefined\")\n setLocation(prev)\n break\n\n /* Go to next page */\n case \"n\":\n case \".\":\n const next = getOptionalElement(\"link[rel=next]\")\n 
if (typeof next !== \"undefined\")\n setLocation(next)\n break\n\n /* Expand navigation, see https://bit.ly/3ZjG5io */\n case \"Enter\":\n const active = getActiveElement()\n if (active instanceof HTMLLabelElement)\n active.click()\n }\n })\n\n/* Set up patches */\npatchEllipsis({ viewport$, document$ })\npatchIndeterminate({ document$, tablet$ })\npatchScrollfix({ document$ })\npatchScrolllock({ viewport$, tablet$ })\n\n/* Set up header and main area observable */\nconst header$ = watchHeader(getComponentElement(\"header\"), { viewport$ })\nconst main$ = document$\n .pipe(\n map(() => getComponentElement(\"main\")),\n switchMap(el => watchMain(el, { viewport$, header$ })),\n shareReplay(1)\n )\n\n/* Set up control component observables */\nconst control$ = merge(\n\n /* Consent */\n ...getComponentElements(\"consent\")\n .map(el => mountConsent(el, { target$ })),\n\n /* Dialog */\n ...getComponentElements(\"dialog\")\n .map(el => mountDialog(el, { alert$ })),\n\n /* Color palette */\n ...getComponentElements(\"palette\")\n .map(el => mountPalette(el)),\n\n /* Progress bar */\n ...getComponentElements(\"progress\")\n .map(el => mountProgress(el, { progress$ })),\n\n /* Search */\n ...getComponentElements(\"search\")\n .map(el => mountSearch(el, { index$, keyboard$ })),\n\n /* Repository information */\n ...getComponentElements(\"source\")\n .map(el => mountSource(el))\n)\n\n/* Set up content component observables */\nconst content$ = defer(() => merge(\n\n /* Announcement bar */\n ...getComponentElements(\"announce\")\n .map(el => mountAnnounce(el)),\n\n /* Content */\n ...getComponentElements(\"content\")\n .map(el => mountContent(el, { viewport$, target$, print$ })),\n\n /* Search highlighting */\n ...getComponentElements(\"content\")\n .map(el => feature(\"search.highlight\")\n ? 
mountSearchHiglight(el, { index$, location$ })\n : EMPTY\n ),\n\n /* Header */\n ...getComponentElements(\"header\")\n .map(el => mountHeader(el, { viewport$, header$, main$ })),\n\n /* Header title */\n ...getComponentElements(\"header-title\")\n .map(el => mountHeaderTitle(el, { viewport$, header$ })),\n\n /* Sidebar */\n ...getComponentElements(\"sidebar\")\n .map(el => el.getAttribute(\"data-md-type\") === \"navigation\"\n ? at(screen$, () => mountSidebar(el, { viewport$, header$, main$ }))\n : at(tablet$, () => mountSidebar(el, { viewport$, header$, main$ }))\n ),\n\n /* Navigation tabs */\n ...getComponentElements(\"tabs\")\n .map(el => mountTabs(el, { viewport$, header$ })),\n\n /* Table of contents */\n ...getComponentElements(\"toc\")\n .map(el => mountTableOfContents(el, {\n viewport$, header$, main$, target$\n })),\n\n /* Back-to-top button */\n ...getComponentElements(\"top\")\n .map(el => mountBackToTop(el, { viewport$, header$, main$, target$ }))\n))\n\n/* Set up component observables */\nconst component$ = document$\n .pipe(\n switchMap(() => content$),\n mergeWith(control$),\n shareReplay(1)\n )\n\n/* Subscribe to all components */\ncomponent$.subscribe()\n\n/* ----------------------------------------------------------------------------\n * Exports\n * ------------------------------------------------------------------------- */\n\nwindow.document$ = document$ /* Document observable */\nwindow.location$ = location$ /* Location subject */\nwindow.target$ = target$ /* Location target observable */\nwindow.keyboard$ = keyboard$ /* Keyboard observable */\nwindow.viewport$ = viewport$ /* Viewport observable */\nwindow.tablet$ = tablet$ /* Media tablet observable */\nwindow.screen$ = screen$ /* Media screen observable */\nwindow.print$ = print$ /* Media print observable */\nwindow.alert$ = alert$ /* Alert subject */\nwindow.progress$ = progress$ /* Progress indicator subject */\nwindow.component$ = component$ /* Component observable */\n", 
"/******************************************************************************\nCopyright (c) Microsoft Corporation.\n\nPermission to use, copy, modify, and/or distribute this software for any\npurpose with or without fee is hereby granted.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH\nREGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY\nAND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,\nINDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM\nLOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR\nOTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR\nPERFORMANCE OF THIS SOFTWARE.\n***************************************************************************** */\n/* global Reflect, Promise, SuppressedError, Symbol, Iterator */\n\nvar extendStatics = function(d, b) {\n extendStatics = Object.setPrototypeOf ||\n ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||\n function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };\n return extendStatics(d, b);\n};\n\nexport function __extends(d, b) {\n if (typeof b !== \"function\" && b !== null)\n throw new TypeError(\"Class extends value \" + String(b) + \" is not a constructor or null\");\n extendStatics(d, b);\n function __() { this.constructor = d; }\n d.prototype = b === null ? 
Object.create(b) : (__.prototype = b.prototype, new __());\n}\n\nexport var __assign = function() {\n __assign = Object.assign || function __assign(t) {\n for (var s, i = 1, n = arguments.length; i < n; i++) {\n s = arguments[i];\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p];\n }\n return t;\n }\n return __assign.apply(this, arguments);\n}\n\nexport function __rest(s, e) {\n var t = {};\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)\n t[p] = s[p];\n if (s != null && typeof Object.getOwnPropertySymbols === \"function\")\n for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {\n if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))\n t[p[i]] = s[p[i]];\n }\n return t;\n}\n\nexport function __decorate(decorators, target, key, desc) {\n var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;\n if (typeof Reflect === \"object\" && typeof Reflect.decorate === \"function\") r = Reflect.decorate(decorators, target, key, desc);\n else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;\n return c > 3 && r && Object.defineProperty(target, key, r), r;\n}\n\nexport function __param(paramIndex, decorator) {\n return function (target, key) { decorator(target, key, paramIndex); }\n}\n\nexport function __esDecorate(ctor, descriptorIn, decorators, contextIn, initializers, extraInitializers) {\n function accept(f) { if (f !== void 0 && typeof f !== \"function\") throw new TypeError(\"Function expected\"); return f; }\n var kind = contextIn.kind, key = kind === \"getter\" ? \"get\" : kind === \"setter\" ? \"set\" : \"value\";\n var target = !descriptorIn && ctor ? contextIn[\"static\"] ? ctor : ctor.prototype : null;\n var descriptor = descriptorIn || (target ? 
Object.getOwnPropertyDescriptor(target, contextIn.name) : {});\n var _, done = false;\n for (var i = decorators.length - 1; i >= 0; i--) {\n var context = {};\n for (var p in contextIn) context[p] = p === \"access\" ? {} : contextIn[p];\n for (var p in contextIn.access) context.access[p] = contextIn.access[p];\n context.addInitializer = function (f) { if (done) throw new TypeError(\"Cannot add initializers after decoration has completed\"); extraInitializers.push(accept(f || null)); };\n var result = (0, decorators[i])(kind === \"accessor\" ? { get: descriptor.get, set: descriptor.set } : descriptor[key], context);\n if (kind === \"accessor\") {\n if (result === void 0) continue;\n if (result === null || typeof result !== \"object\") throw new TypeError(\"Object expected\");\n if (_ = accept(result.get)) descriptor.get = _;\n if (_ = accept(result.set)) descriptor.set = _;\n if (_ = accept(result.init)) initializers.unshift(_);\n }\n else if (_ = accept(result)) {\n if (kind === \"field\") initializers.unshift(_);\n else descriptor[key] = _;\n }\n }\n if (target) Object.defineProperty(target, contextIn.name, descriptor);\n done = true;\n};\n\nexport function __runInitializers(thisArg, initializers, value) {\n var useValue = arguments.length > 2;\n for (var i = 0; i < initializers.length; i++) {\n value = useValue ? initializers[i].call(thisArg, value) : initializers[i].call(thisArg);\n }\n return useValue ? value : void 0;\n};\n\nexport function __propKey(x) {\n return typeof x === \"symbol\" ? x : \"\".concat(x);\n};\n\nexport function __setFunctionName(f, name, prefix) {\n if (typeof name === \"symbol\") name = name.description ? \"[\".concat(name.description, \"]\") : \"\";\n return Object.defineProperty(f, \"name\", { configurable: true, value: prefix ? 
\"\".concat(prefix, \" \", name) : name });\n};\n\nexport function __metadata(metadataKey, metadataValue) {\n if (typeof Reflect === \"object\" && typeof Reflect.metadata === \"function\") return Reflect.metadata(metadataKey, metadataValue);\n}\n\nexport function __awaiter(thisArg, _arguments, P, generator) {\n function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }\n return new (P || (P = Promise))(function (resolve, reject) {\n function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }\n function rejected(value) { try { step(generator[\"throw\"](value)); } catch (e) { reject(e); } }\n function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }\n step((generator = generator.apply(thisArg, _arguments || [])).next());\n });\n}\n\nexport function __generator(thisArg, body) {\n var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === \"function\" ? Iterator : Object).prototype);\n return g.next = verb(0), g[\"throw\"] = verb(1), g[\"return\"] = verb(2), typeof Symbol === \"function\" && (g[Symbol.iterator] = function() { return this; }), g;\n function verb(n) { return function (v) { return step([n, v]); }; }\n function step(op) {\n if (f) throw new TypeError(\"Generator is already executing.\");\n while (g && (g = 0, op[0] && (_ = 0)), _) try {\n if (f = 1, y && (t = op[0] & 2 ? y[\"return\"] : op[0] ? 
y[\"throw\"] || ((t = y[\"return\"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;\n if (y = 0, t) op = [op[0] & 2, t.value];\n switch (op[0]) {\n case 0: case 1: t = op; break;\n case 4: _.label++; return { value: op[1], done: false };\n case 5: _.label++; y = op[1]; op = [0]; continue;\n case 7: op = _.ops.pop(); _.trys.pop(); continue;\n default:\n if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }\n if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }\n if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }\n if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }\n if (t[2]) _.ops.pop();\n _.trys.pop(); continue;\n }\n op = body.call(thisArg, _);\n } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }\n if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };\n }\n}\n\nexport var __createBinding = Object.create ? (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n var desc = Object.getOwnPropertyDescriptor(m, k);\n if (!desc || (\"get\" in desc ? !m.__esModule : desc.writable || desc.configurable)) {\n desc = { enumerable: true, get: function() { return m[k]; } };\n }\n Object.defineProperty(o, k2, desc);\n}) : (function(o, m, k, k2) {\n if (k2 === undefined) k2 = k;\n o[k2] = m[k];\n});\n\nexport function __exportStar(m, o) {\n for (var p in m) if (p !== \"default\" && !Object.prototype.hasOwnProperty.call(o, p)) __createBinding(o, m, p);\n}\n\nexport function __values(o) {\n var s = typeof Symbol === \"function\" && Symbol.iterator, m = s && o[s], i = 0;\n if (m) return m.call(o);\n if (o && typeof o.length === \"number\") return {\n next: function () {\n if (o && i >= o.length) o = void 0;\n return { value: o && o[i++], done: !o };\n }\n };\n throw new TypeError(s ? 
\"Object is not iterable.\" : \"Symbol.iterator is not defined.\");\n}\n\nexport function __read(o, n) {\n var m = typeof Symbol === \"function\" && o[Symbol.iterator];\n if (!m) return o;\n var i = m.call(o), r, ar = [], e;\n try {\n while ((n === void 0 || n-- > 0) && !(r = i.next()).done) ar.push(r.value);\n }\n catch (error) { e = { error: error }; }\n finally {\n try {\n if (r && !r.done && (m = i[\"return\"])) m.call(i);\n }\n finally { if (e) throw e.error; }\n }\n return ar;\n}\n\n/** @deprecated */\nexport function __spread() {\n for (var ar = [], i = 0; i < arguments.length; i++)\n ar = ar.concat(__read(arguments[i]));\n return ar;\n}\n\n/** @deprecated */\nexport function __spreadArrays() {\n for (var s = 0, i = 0, il = arguments.length; i < il; i++) s += arguments[i].length;\n for (var r = Array(s), k = 0, i = 0; i < il; i++)\n for (var a = arguments[i], j = 0, jl = a.length; j < jl; j++, k++)\n r[k] = a[j];\n return r;\n}\n\nexport function __spreadArray(to, from, pack) {\n if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {\n if (ar || !(i in from)) {\n if (!ar) ar = Array.prototype.slice.call(from, 0, i);\n ar[i] = from[i];\n }\n }\n return to.concat(ar || Array.prototype.slice.call(from));\n}\n\nexport function __await(v) {\n return this instanceof __await ? (this.v = v, this) : new __await(v);\n}\n\nexport function __asyncGenerator(thisArg, _arguments, generator) {\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\n var g = generator.apply(thisArg, _arguments || []), i, q = [];\n return i = Object.create((typeof AsyncIterator === \"function\" ? 
AsyncIterator : Object).prototype), verb(\"next\"), verb(\"throw\"), verb(\"return\", awaitReturn), i[Symbol.asyncIterator] = function () { return this; }, i;\n function awaitReturn(f) { return function (v) { return Promise.resolve(v).then(f, reject); }; }\n function verb(n, f) { if (g[n]) { i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; if (f) i[n] = f(i[n]); } }\n function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }\n function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }\n function fulfill(value) { resume(\"next\", value); }\n function reject(value) { resume(\"throw\", value); }\n function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }\n}\n\nexport function __asyncDelegator(o) {\n var i, p;\n return i = {}, verb(\"next\"), verb(\"throw\", function (e) { throw e; }), verb(\"return\"), i[Symbol.iterator] = function () { return this; }, i;\n function verb(n, f) { i[n] = o[n] ? function (v) { return (p = !p) ? { value: __await(o[n](v)), done: false } : f ? f(v) : v; } : f; }\n}\n\nexport function __asyncValues(o) {\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\n var m = o[Symbol.asyncIterator], i;\n return m ? m.call(o) : (o = typeof __values === \"function\" ? 
__values(o) : o[Symbol.iterator](), i = {}, verb(\"next\"), verb(\"throw\"), verb(\"return\"), i[Symbol.asyncIterator] = function () { return this; }, i);\n function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }\n function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }\n}\n\nexport function __makeTemplateObject(cooked, raw) {\n if (Object.defineProperty) { Object.defineProperty(cooked, \"raw\", { value: raw }); } else { cooked.raw = raw; }\n return cooked;\n};\n\nvar __setModuleDefault = Object.create ? (function(o, v) {\n Object.defineProperty(o, \"default\", { enumerable: true, value: v });\n}) : function(o, v) {\n o[\"default\"] = v;\n};\n\nexport function __importStar(mod) {\n if (mod && mod.__esModule) return mod;\n var result = {};\n if (mod != null) for (var k in mod) if (k !== \"default\" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);\n __setModuleDefault(result, mod);\n return result;\n}\n\nexport function __importDefault(mod) {\n return (mod && mod.__esModule) ? mod : { default: mod };\n}\n\nexport function __classPrivateFieldGet(receiver, state, kind, f) {\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a getter\");\n if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot read private member from an object whose class did not declare it\");\n return kind === \"m\" ? f : kind === \"a\" ? f.call(receiver) : f ? f.value : state.get(receiver);\n}\n\nexport function __classPrivateFieldSet(receiver, state, value, kind, f) {\n if (kind === \"m\") throw new TypeError(\"Private method is not writable\");\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a setter\");\n if (typeof state === \"function\" ? 
receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot write private member to an object whose class did not declare it\");\n return (kind === \"a\" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;\n}\n\nexport function __classPrivateFieldIn(state, receiver) {\n if (receiver === null || (typeof receiver !== \"object\" && typeof receiver !== \"function\")) throw new TypeError(\"Cannot use 'in' operator on non-object\");\n return typeof state === \"function\" ? receiver === state : state.has(receiver);\n}\n\nexport function __addDisposableResource(env, value, async) {\n if (value !== null && value !== void 0) {\n if (typeof value !== \"object\" && typeof value !== \"function\") throw new TypeError(\"Object expected.\");\n var dispose, inner;\n if (async) {\n if (!Symbol.asyncDispose) throw new TypeError(\"Symbol.asyncDispose is not defined.\");\n dispose = value[Symbol.asyncDispose];\n }\n if (dispose === void 0) {\n if (!Symbol.dispose) throw new TypeError(\"Symbol.dispose is not defined.\");\n dispose = value[Symbol.dispose];\n if (async) inner = dispose;\n }\n if (typeof dispose !== \"function\") throw new TypeError(\"Object not disposable.\");\n if (inner) dispose = function() { try { inner.call(this); } catch (e) { return Promise.reject(e); } };\n env.stack.push({ value: value, dispose: dispose, async: async });\n }\n else if (async) {\n env.stack.push({ async: true });\n }\n return value;\n}\n\nvar _SuppressedError = typeof SuppressedError === \"function\" ? SuppressedError : function (error, suppressed, message) {\n var e = new Error(message);\n return e.name = \"SuppressedError\", e.error = error, e.suppressed = suppressed, e;\n};\n\nexport function __disposeResources(env) {\n function fail(e) {\n env.error = env.hasError ? 
new _SuppressedError(e, env.error, \"An error was suppressed during disposal.\") : e;\n env.hasError = true;\n }\n var r, s = 0;\n function next() {\n while (r = env.stack.pop()) {\n try {\n if (!r.async && s === 1) return s = 0, env.stack.push(r), Promise.resolve().then(next);\n if (r.dispose) {\n var result = r.dispose.call(r.value);\n if (r.async) return s |= 2, Promise.resolve(result).then(next, function(e) { fail(e); return next(); });\n }\n else s |= 1;\n }\n catch (e) {\n fail(e);\n }\n }\n if (s === 1) return env.hasError ? Promise.reject(env.error) : Promise.resolve();\n if (env.hasError) throw env.error;\n }\n return next();\n}\n\nexport default {\n __extends,\n __assign,\n __rest,\n __decorate,\n __param,\n __metadata,\n __awaiter,\n __generator,\n __createBinding,\n __exportStar,\n __values,\n __read,\n __spread,\n __spreadArrays,\n __spreadArray,\n __await,\n __asyncGenerator,\n __asyncDelegator,\n __asyncValues,\n __makeTemplateObject,\n __importStar,\n __importDefault,\n __classPrivateFieldGet,\n __classPrivateFieldSet,\n __classPrivateFieldIn,\n __addDisposableResource,\n __disposeResources,\n};\n", "/**\n * Returns true if the object is a function.\n * @param value The value to check\n */\nexport function isFunction(value: any): value is (...args: any[]) => any {\n return typeof value === 'function';\n}\n", "/**\n * Used to create Error subclasses until the community moves away from ES5.\n *\n * This is because compiling from TypeScript down to ES5 has issues with subclassing Errors\n * as well as other built-in types: https://github.com/Microsoft/TypeScript/issues/12123\n *\n * @param createImpl A factory function to create the actual constructor implementation. 
The returned\n * function should be a named function that calls `_super` internally.\n */\nexport function createErrorClass(createImpl: (_super: any) => any): T {\n const _super = (instance: any) => {\n Error.call(instance);\n instance.stack = new Error().stack;\n };\n\n const ctorFunc = createImpl(_super);\n ctorFunc.prototype = Object.create(Error.prototype);\n ctorFunc.prototype.constructor = ctorFunc;\n return ctorFunc;\n}\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface UnsubscriptionError extends Error {\n readonly errors: any[];\n}\n\nexport interface UnsubscriptionErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (errors: any[]): UnsubscriptionError;\n}\n\n/**\n * An error thrown when one or more errors have occurred during the\n * `unsubscribe` of a {@link Subscription}.\n */\nexport const UnsubscriptionError: UnsubscriptionErrorCtor = createErrorClass(\n (_super) =>\n function UnsubscriptionErrorImpl(this: any, errors: (Error | string)[]) {\n _super(this);\n this.message = errors\n ? 
`${errors.length} errors occurred during unsubscription:\n${errors.map((err, i) => `${i + 1}) ${err.toString()}`).join('\\n ')}`\n : '';\n this.name = 'UnsubscriptionError';\n this.errors = errors;\n }\n);\n", "/**\n * Removes an item from an array, mutating it.\n * @param arr The array to remove the item from\n * @param item The item to remove\n */\nexport function arrRemove(arr: T[] | undefined | null, item: T) {\n if (arr) {\n const index = arr.indexOf(item);\n 0 <= index && arr.splice(index, 1);\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { UnsubscriptionError } from './util/UnsubscriptionError';\nimport { SubscriptionLike, TeardownLogic, Unsubscribable } from './types';\nimport { arrRemove } from './util/arrRemove';\n\n/**\n * Represents a disposable resource, such as the execution of an Observable. A\n * Subscription has one important method, `unsubscribe`, that takes no argument\n * and just disposes the resource held by the subscription.\n *\n * Additionally, subscriptions may be grouped together through the `add()`\n * method, which will attach a child Subscription to the current Subscription.\n * When a Subscription is unsubscribed, all its children (and its grandchildren)\n * will be unsubscribed as well.\n *\n * @class Subscription\n */\nexport class Subscription implements SubscriptionLike {\n /** @nocollapse */\n public static EMPTY = (() => {\n const empty = new Subscription();\n empty.closed = true;\n return empty;\n })();\n\n /**\n * A flag to indicate whether this Subscription has already been unsubscribed.\n */\n public closed = false;\n\n private _parentage: Subscription[] | Subscription | null = null;\n\n /**\n * The list of registered finalizers to execute upon unsubscription. 
Adding and removing from this\n * list occurs in the {@link #add} and {@link #remove} methods.\n */\n private _finalizers: Exclude[] | null = null;\n\n /**\n * @param initialTeardown A function executed first as part of the finalization\n * process that is kicked off when {@link #unsubscribe} is called.\n */\n constructor(private initialTeardown?: () => void) {}\n\n /**\n * Disposes the resources held by the subscription. May, for instance, cancel\n * an ongoing Observable execution or cancel any other type of work that\n * started when the Subscription was created.\n * @return {void}\n */\n unsubscribe(): void {\n let errors: any[] | undefined;\n\n if (!this.closed) {\n this.closed = true;\n\n // Remove this from it's parents.\n const { _parentage } = this;\n if (_parentage) {\n this._parentage = null;\n if (Array.isArray(_parentage)) {\n for (const parent of _parentage) {\n parent.remove(this);\n }\n } else {\n _parentage.remove(this);\n }\n }\n\n const { initialTeardown: initialFinalizer } = this;\n if (isFunction(initialFinalizer)) {\n try {\n initialFinalizer();\n } catch (e) {\n errors = e instanceof UnsubscriptionError ? e.errors : [e];\n }\n }\n\n const { _finalizers } = this;\n if (_finalizers) {\n this._finalizers = null;\n for (const finalizer of _finalizers) {\n try {\n execFinalizer(finalizer);\n } catch (err) {\n errors = errors ?? [];\n if (err instanceof UnsubscriptionError) {\n errors = [...errors, ...err.errors];\n } else {\n errors.push(err);\n }\n }\n }\n }\n\n if (errors) {\n throw new UnsubscriptionError(errors);\n }\n }\n }\n\n /**\n * Adds a finalizer to this subscription, so that finalization will be unsubscribed/called\n * when this subscription is unsubscribed. 
If this subscription is already {@link #closed},\n * because it has already been unsubscribed, then whatever finalizer is passed to it\n * will automatically be executed (unless the finalizer itself is also a closed subscription).\n *\n * Closed Subscriptions cannot be added as finalizers to any subscription. Adding a closed\n * subscription to a any subscription will result in no operation. (A noop).\n *\n * Adding a subscription to itself, or adding `null` or `undefined` will not perform any\n * operation at all. (A noop).\n *\n * `Subscription` instances that are added to this instance will automatically remove themselves\n * if they are unsubscribed. Functions and {@link Unsubscribable} objects that you wish to remove\n * will need to be removed manually with {@link #remove}\n *\n * @param teardown The finalization logic to add to this subscription.\n */\n add(teardown: TeardownLogic): void {\n // Only add the finalizer if it's not undefined\n // and don't add a subscription to itself.\n if (teardown && teardown !== this) {\n if (this.closed) {\n // If this subscription is already closed,\n // execute whatever finalizer is handed to it automatically.\n execFinalizer(teardown);\n } else {\n if (teardown instanceof Subscription) {\n // We don't add closed subscriptions, and we don't add the same subscription\n // twice. Subscription unsubscribe is idempotent.\n if (teardown.closed || teardown._hasParent(this)) {\n return;\n }\n teardown._addParent(this);\n }\n (this._finalizers = this._finalizers ?? 
[]).push(teardown);\n }\n }\n }\n\n /**\n * Checks to see if a this subscription already has a particular parent.\n * This will signal that this subscription has already been added to the parent in question.\n * @param parent the parent to check for\n */\n private _hasParent(parent: Subscription) {\n const { _parentage } = this;\n return _parentage === parent || (Array.isArray(_parentage) && _parentage.includes(parent));\n }\n\n /**\n * Adds a parent to this subscription so it can be removed from the parent if it\n * unsubscribes on it's own.\n *\n * NOTE: THIS ASSUMES THAT {@link _hasParent} HAS ALREADY BEEN CHECKED.\n * @param parent The parent subscription to add\n */\n private _addParent(parent: Subscription) {\n const { _parentage } = this;\n this._parentage = Array.isArray(_parentage) ? (_parentage.push(parent), _parentage) : _parentage ? [_parentage, parent] : parent;\n }\n\n /**\n * Called on a child when it is removed via {@link #remove}.\n * @param parent The parent to remove\n */\n private _removeParent(parent: Subscription) {\n const { _parentage } = this;\n if (_parentage === parent) {\n this._parentage = null;\n } else if (Array.isArray(_parentage)) {\n arrRemove(_parentage, parent);\n }\n }\n\n /**\n * Removes a finalizer from this subscription that was previously added with the {@link #add} method.\n *\n * Note that `Subscription` instances, when unsubscribed, will automatically remove themselves\n * from every other `Subscription` they have been added to. 
This means that using the `remove` method\n * is not a common thing and should be used thoughtfully.\n *\n * If you add the same finalizer instance of a function or an unsubscribable object to a `Subscription` instance\n * more than once, you will need to call `remove` the same number of times to remove all instances.\n *\n * All finalizer instances are removed to free up memory upon unsubscription.\n *\n * @param teardown The finalizer to remove from this subscription\n */\n remove(teardown: Exclude): void {\n const { _finalizers } = this;\n _finalizers && arrRemove(_finalizers, teardown);\n\n if (teardown instanceof Subscription) {\n teardown._removeParent(this);\n }\n }\n}\n\nexport const EMPTY_SUBSCRIPTION = Subscription.EMPTY;\n\nexport function isSubscription(value: any): value is Subscription {\n return (\n value instanceof Subscription ||\n (value && 'closed' in value && isFunction(value.remove) && isFunction(value.add) && isFunction(value.unsubscribe))\n );\n}\n\nfunction execFinalizer(finalizer: Unsubscribable | (() => void)) {\n if (isFunction(finalizer)) {\n finalizer();\n } else {\n finalizer.unsubscribe();\n }\n}\n", "import { Subscriber } from './Subscriber';\nimport { ObservableNotification } from './types';\n\n/**\n * The {@link GlobalConfig} object for RxJS. It is used to configure things\n * like how to react on unhandled errors.\n */\nexport const config: GlobalConfig = {\n onUnhandledError: null,\n onStoppedNotification: null,\n Promise: undefined,\n useDeprecatedSynchronousErrorHandling: false,\n useDeprecatedNextContext: false,\n};\n\n/**\n * The global configuration object for RxJS, used to configure things\n * like how to react on unhandled errors. Accessible via {@link config}\n * object.\n */\nexport interface GlobalConfig {\n /**\n * A registration point for unhandled errors from RxJS. These are errors that\n * cannot were not handled by consuming code in the usual subscription path. 
For\n * example, if you have this configured, and you subscribe to an observable without\n * providing an error handler, errors from that subscription will end up here. This\n * will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onUnhandledError: ((err: any) => void) | null;\n\n /**\n * A registration point for notifications that cannot be sent to subscribers because they\n * have completed, errored or have been explicitly unsubscribed. By default, next, complete\n * and error notifications sent to stopped subscribers are noops. However, sometimes callers\n * might want a different behavior. For example, with sources that attempt to report errors\n * to stopped subscribers, a caller can configure RxJS to throw an unhandled error instead.\n * This will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onStoppedNotification: ((notification: ObservableNotification, subscriber: Subscriber) => void) | null;\n\n /**\n * The promise constructor used by default for {@link Observable#toPromise toPromise} and {@link Observable#forEach forEach}\n * methods.\n *\n * @deprecated As of version 8, RxJS will no longer support this sort of injection of a\n * Promise constructor. If you need a Promise implementation other than native promises,\n * please polyfill/patch Promise as you see appropriate. Will be removed in v8.\n */\n Promise?: PromiseConstructorLike;\n\n /**\n * If true, turns on synchronous error rethrowing, which is a deprecated behavior\n * in v6 and higher. This behavior enables bad patterns like wrapping a subscribe\n * call in a try/catch block. 
It also enables producer interference, a nasty bug\n * where a multicast can be broken for all observers by a downstream consumer with\n * an unhandled error. DO NOT USE THIS FLAG UNLESS IT'S NEEDED TO BUY TIME\n * FOR MIGRATION REASONS.\n *\n * @deprecated As of version 8, RxJS will no longer support synchronous throwing\n * of unhandled errors. All errors will be thrown on a separate call stack to prevent bad\n * behaviors described above. Will be removed in v8.\n */\n useDeprecatedSynchronousErrorHandling: boolean;\n\n /**\n * If true, enables an as-of-yet undocumented feature from v5: The ability to access\n * `unsubscribe()` via `this` context in `next` functions created in observers passed\n * to `subscribe`.\n *\n * This is being removed because the performance was severely problematic, and it could also cause\n * issues when types other than POJOs are passed to subscribe as subscribers, as they will likely have\n * their `this` context overwritten.\n *\n * @deprecated As of version 8, RxJS will no longer support altering the\n * context of next functions provided as part of an observer to Subscribe. Instead,\n * you will have access to a subscription or a signal or token that will allow you to do things like\n * unsubscribe and test closed status. 
Will be removed in v8.\n */\n useDeprecatedNextContext: boolean;\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetTimeoutFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearTimeoutFunction = (handle: TimerHandle) => void;\n\ninterface TimeoutProvider {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n delegate:\n | {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n }\n | undefined;\n}\n\nexport const timeoutProvider: TimeoutProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setTimeout(handler: () => void, timeout?: number, ...args) {\n const { delegate } = timeoutProvider;\n if (delegate?.setTimeout) {\n return delegate.setTimeout(handler, timeout, ...args);\n }\n return setTimeout(handler, timeout, ...args);\n },\n clearTimeout(handle) {\n const { delegate } = timeoutProvider;\n return (delegate?.clearTimeout || clearTimeout)(handle as any);\n },\n delegate: undefined,\n};\n", "import { config } from '../config';\nimport { timeoutProvider } from '../scheduler/timeoutProvider';\n\n/**\n * Handles an error on another job either with the user-configured {@link onUnhandledError},\n * or by throwing it on that new job so it can be picked up by `window.onerror`, `process.on('error')`, etc.\n *\n * This should be called whenever there is an error that is out-of-band with the subscription\n * or when an error hits a terminal boundary of the subscription and no error handler was provided.\n *\n * @param err the error to report\n */\nexport function reportUnhandledError(err: any) {\n timeoutProvider.setTimeout(() => {\n const { onUnhandledError } = config;\n if (onUnhandledError) {\n // Execute the user-configured error handler.\n onUnhandledError(err);\n } else {\n // Throw so it is picked up by the runtime's uncaught error mechanism.\n throw err;\n }\n 
});\n}\n", "/* tslint:disable:no-empty */\nexport function noop() { }\n", "import { CompleteNotification, NextNotification, ErrorNotification } from './types';\n\n/**\n * A completion object optimized for memory use and created to be the\n * same \"shape\" as other notifications in v8.\n * @internal\n */\nexport const COMPLETE_NOTIFICATION = (() => createNotification('C', undefined, undefined) as CompleteNotification)();\n\n/**\n * Internal use only. Creates an optimized error notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function errorNotification(error: any): ErrorNotification {\n return createNotification('E', undefined, error) as any;\n}\n\n/**\n * Internal use only. Creates an optimized next notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function nextNotification(value: T) {\n return createNotification('N', value, undefined) as NextNotification;\n}\n\n/**\n * Ensures that all notifications created internally have the same \"shape\" in v8.\n *\n * TODO: This is only exported to support a crazy legacy test in `groupBy`.\n * @internal\n */\nexport function createNotification(kind: 'N' | 'E' | 'C', value: any, error: any) {\n return {\n kind,\n value,\n error,\n };\n}\n", "import { config } from '../config';\n\nlet context: { errorThrown: boolean; error: any } | null = null;\n\n/**\n * Handles dealing with errors for super-gross mode. Creates a context, in which\n * any synchronously thrown errors will be passed to {@link captureError}. 
Which\n * will record the error such that it will be rethrown after the call back is complete.\n * TODO: Remove in v8\n * @param cb An immediately executed function.\n */\nexport function errorContext(cb: () => void) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n const isRoot = !context;\n if (isRoot) {\n context = { errorThrown: false, error: null };\n }\n cb();\n if (isRoot) {\n const { errorThrown, error } = context!;\n context = null;\n if (errorThrown) {\n throw error;\n }\n }\n } else {\n // This is the general non-deprecated path for everyone that\n // isn't crazy enough to use super-gross mode (useDeprecatedSynchronousErrorHandling)\n cb();\n }\n}\n\n/**\n * Captures errors only in super-gross mode.\n * @param err the error to capture\n */\nexport function captureError(err: any) {\n if (config.useDeprecatedSynchronousErrorHandling && context) {\n context.errorThrown = true;\n context.error = err;\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { Observer, ObservableNotification } from './types';\nimport { isSubscription, Subscription } from './Subscription';\nimport { config } from './config';\nimport { reportUnhandledError } from './util/reportUnhandledError';\nimport { noop } from './util/noop';\nimport { nextNotification, errorNotification, COMPLETE_NOTIFICATION } from './NotificationFactories';\nimport { timeoutProvider } from './scheduler/timeoutProvider';\nimport { captureError } from './util/errorContext';\n\n/**\n * Implements the {@link Observer} interface and extends the\n * {@link Subscription} class. While the {@link Observer} is the public API for\n * consuming the values of an {@link Observable}, all Observers get converted to\n * a Subscriber, in order to provide Subscription-like capabilities such as\n * `unsubscribe`. 
Subscriber is a common type in RxJS, and crucial for\n * implementing operators, but it is rarely used as a public API.\n *\n * @class Subscriber\n */\nexport class Subscriber extends Subscription implements Observer {\n /**\n * A static factory for a Subscriber, given a (potentially partial) definition\n * of an Observer.\n * @param next The `next` callback of an Observer.\n * @param error The `error` callback of an\n * Observer.\n * @param complete The `complete` callback of an\n * Observer.\n * @return A Subscriber wrapping the (partially defined)\n * Observer represented by the given arguments.\n * @nocollapse\n * @deprecated Do not use. Will be removed in v8. There is no replacement for this\n * method, and there is no reason to be creating instances of `Subscriber` directly.\n * If you have a specific use case, please file an issue.\n */\n static create(next?: (x?: T) => void, error?: (e?: any) => void, complete?: () => void): Subscriber {\n return new SafeSubscriber(next, error, complete);\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected isStopped: boolean = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected destination: Subscriber | Observer; // this `any` is the escape hatch to erase extra type param (e.g. R)\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * There is no reason to directly create an instance of Subscriber. 
This type is exported for typings reasons.\n */\n constructor(destination?: Subscriber | Observer) {\n super();\n if (destination) {\n this.destination = destination;\n // Automatically chain subscriptions together here.\n // if destination is a Subscription, then it is a Subscriber.\n if (isSubscription(destination)) {\n destination.add(this);\n }\n } else {\n this.destination = EMPTY_OBSERVER;\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `next` from\n * the Observable, with a value. The Observable may call this method 0 or more\n * times.\n * @param {T} [value] The `next` value.\n * @return {void}\n */\n next(value?: T): void {\n if (this.isStopped) {\n handleStoppedNotification(nextNotification(value), this);\n } else {\n this._next(value!);\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `error` from\n * the Observable, with an attached `Error`. Notifies the Observer that\n * the Observable has experienced an error condition.\n * @param {any} [err] The `error` exception.\n * @return {void}\n */\n error(err?: any): void {\n if (this.isStopped) {\n handleStoppedNotification(errorNotification(err), this);\n } else {\n this.isStopped = true;\n this._error(err);\n }\n }\n\n /**\n * The {@link Observer} callback to receive a valueless notification of type\n * `complete` from the Observable. 
Notifies the Observer that the Observable\n * has finished sending push-based notifications.\n * @return {void}\n */\n complete(): void {\n if (this.isStopped) {\n handleStoppedNotification(COMPLETE_NOTIFICATION, this);\n } else {\n this.isStopped = true;\n this._complete();\n }\n }\n\n unsubscribe(): void {\n if (!this.closed) {\n this.isStopped = true;\n super.unsubscribe();\n this.destination = null!;\n }\n }\n\n protected _next(value: T): void {\n this.destination.next(value);\n }\n\n protected _error(err: any): void {\n try {\n this.destination.error(err);\n } finally {\n this.unsubscribe();\n }\n }\n\n protected _complete(): void {\n try {\n this.destination.complete();\n } finally {\n this.unsubscribe();\n }\n }\n}\n\n/**\n * This bind is captured here because we want to be able to have\n * compatibility with monoid libraries that tend to use a method named\n * `bind`. In particular, a library called Monio requires this.\n */\nconst _bind = Function.prototype.bind;\n\nfunction bind any>(fn: Fn, thisArg: any): Fn {\n return _bind.call(fn, thisArg);\n}\n\n/**\n * Internal optimization only, DO NOT EXPOSE.\n * @internal\n */\nclass ConsumerObserver implements Observer {\n constructor(private partialObserver: Partial>) {}\n\n next(value: T): void {\n const { partialObserver } = this;\n if (partialObserver.next) {\n try {\n partialObserver.next(value);\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n\n error(err: any): void {\n const { partialObserver } = this;\n if (partialObserver.error) {\n try {\n partialObserver.error(err);\n } catch (error) {\n handleUnhandledError(error);\n }\n } else {\n handleUnhandledError(err);\n }\n }\n\n complete(): void {\n const { partialObserver } = this;\n if (partialObserver.complete) {\n try {\n partialObserver.complete();\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n}\n\nexport class SafeSubscriber extends Subscriber {\n constructor(\n observerOrNext?: Partial> | ((value: T) => void) | 
null,\n error?: ((e?: any) => void) | null,\n complete?: (() => void) | null\n ) {\n super();\n\n let partialObserver: Partial>;\n if (isFunction(observerOrNext) || !observerOrNext) {\n // The first argument is a function, not an observer. The next\n // two arguments *could* be observers, or they could be empty.\n partialObserver = {\n next: (observerOrNext ?? undefined) as (((value: T) => void) | undefined),\n error: error ?? undefined,\n complete: complete ?? undefined,\n };\n } else {\n // The first argument is a partial observer.\n let context: any;\n if (this && config.useDeprecatedNextContext) {\n // This is a deprecated path that made `this.unsubscribe()` available in\n // next handler functions passed to subscribe. This only exists behind a flag\n // now, as it is *very* slow.\n context = Object.create(observerOrNext);\n context.unsubscribe = () => this.unsubscribe();\n partialObserver = {\n next: observerOrNext.next && bind(observerOrNext.next, context),\n error: observerOrNext.error && bind(observerOrNext.error, context),\n complete: observerOrNext.complete && bind(observerOrNext.complete, context),\n };\n } else {\n // The \"normal\" path. 
Just use the partial observer directly.\n partialObserver = observerOrNext;\n }\n }\n\n // Wrap the partial observer to ensure it's a full observer, and\n // make sure proper error handling is accounted for.\n this.destination = new ConsumerObserver(partialObserver);\n }\n}\n\nfunction handleUnhandledError(error: any) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n captureError(error);\n } else {\n // Ideal path, we report this as an unhandled error,\n // which is thrown on a new call stack.\n reportUnhandledError(error);\n }\n}\n\n/**\n * An error handler used when no error handler was supplied\n * to the SafeSubscriber -- meaning no error handler was supplied\n * do the `subscribe` call on our observable.\n * @param err The error to handle\n */\nfunction defaultErrorHandler(err: any) {\n throw err;\n}\n\n/**\n * A handler for notifications that cannot be sent to a stopped subscriber.\n * @param notification The notification being sent\n * @param subscriber The stopped subscriber\n */\nfunction handleStoppedNotification(notification: ObservableNotification, subscriber: Subscriber) {\n const { onStoppedNotification } = config;\n onStoppedNotification && timeoutProvider.setTimeout(() => onStoppedNotification(notification, subscriber));\n}\n\n/**\n * The observer used as a stub for subscriptions where the user did not\n * pass any arguments to `subscribe`. Comes with the default error handling\n * behavior.\n */\nexport const EMPTY_OBSERVER: Readonly> & { closed: true } = {\n closed: true,\n next: noop,\n error: defaultErrorHandler,\n complete: noop,\n};\n", "/**\n * Symbol.observable or a string \"@@observable\". 
Used for interop\n *\n * @deprecated We will no longer be exporting this symbol in upcoming versions of RxJS.\n * Instead polyfill and use Symbol.observable directly *or* use https://www.npmjs.com/package/symbol-observable\n */\nexport const observable: string | symbol = (() => (typeof Symbol === 'function' && Symbol.observable) || '@@observable')();\n", "/**\n * This function takes one parameter and just returns it. Simply put,\n * this is like `(x: T): T => x`.\n *\n * ## Examples\n *\n * This is useful in some cases when using things like `mergeMap`\n *\n * ```ts\n * import { interval, take, map, range, mergeMap, identity } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(5));\n *\n * const result$ = source$.pipe(\n * map(i => range(i)),\n * mergeMap(identity) // same as mergeMap(x => x)\n * );\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * Or when you want to selectively apply an operator\n *\n * ```ts\n * import { interval, take, identity } from 'rxjs';\n *\n * const shouldLimit = () => Math.random() < 0.5;\n *\n * const source$ = interval(1000);\n *\n * const result$ = source$.pipe(shouldLimit() ? 
take(5) : identity);\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * @param x Any value that is returned by this function\n * @returns The value passed as the first parameter to this function\n */\nexport function identity(x: T): T {\n return x;\n}\n", "import { identity } from './identity';\nimport { UnaryFunction } from '../types';\n\nexport function pipe(): typeof identity;\nexport function pipe(fn1: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction, fn3: UnaryFunction): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction,\n ...fns: UnaryFunction[]\n): 
UnaryFunction;\n\n/**\n * pipe() can be called on one or more functions, each of which can take one argument (\"UnaryFunction\")\n * and uses it to return a value.\n * It returns a function that takes one argument, passes it to the first UnaryFunction, and then\n * passes the result to the next one, passes that result to the next one, and so on. \n */\nexport function pipe(...fns: Array>): UnaryFunction {\n return pipeFromArray(fns);\n}\n\n/** @internal */\nexport function pipeFromArray(fns: Array>): UnaryFunction {\n if (fns.length === 0) {\n return identity as UnaryFunction;\n }\n\n if (fns.length === 1) {\n return fns[0];\n }\n\n return function piped(input: T): R {\n return fns.reduce((prev: any, fn: UnaryFunction) => fn(prev), input as any);\n };\n}\n", "import { Operator } from './Operator';\nimport { SafeSubscriber, Subscriber } from './Subscriber';\nimport { isSubscription, Subscription } from './Subscription';\nimport { TeardownLogic, OperatorFunction, Subscribable, Observer } from './types';\nimport { observable as Symbol_observable } from './symbol/observable';\nimport { pipeFromArray } from './util/pipe';\nimport { config } from './config';\nimport { isFunction } from './util/isFunction';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A representation of any set of values over any amount of time. This is the most basic building block\n * of RxJS.\n *\n * @class Observable\n */\nexport class Observable implements Subscribable {\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n source: Observable | undefined;\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n operator: Operator | undefined;\n\n /**\n * @constructor\n * @param {Function} subscribe the function that is called when the Observable is\n * initially subscribed to. 
This function is given a Subscriber, to which new values\n * can be `next`ed, or an `error` method can be called to raise an error, or\n * `complete` can be called to notify of a successful completion.\n */\n constructor(subscribe?: (this: Observable, subscriber: Subscriber) => TeardownLogic) {\n if (subscribe) {\n this._subscribe = subscribe;\n }\n }\n\n // HACK: Since TypeScript inherits static properties too, we have to\n // fight against TypeScript here so Subject can have a different static create signature\n /**\n * Creates a new Observable by calling the Observable constructor\n * @owner Observable\n * @method create\n * @param {Function} subscribe? the subscriber function to be passed to the Observable constructor\n * @return {Observable} a new observable\n * @nocollapse\n * @deprecated Use `new Observable()` instead. Will be removed in v8.\n */\n static create: (...args: any[]) => any = (subscribe?: (subscriber: Subscriber) => TeardownLogic) => {\n return new Observable(subscribe);\n };\n\n /**\n * Creates a new Observable, with this Observable instance as the source, and the passed\n * operator defined as the new observable's operator.\n * @method lift\n * @param operator the operator defining the operation to take on the observable\n * @return a new observable with the Operator applied\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * If you have implemented an operator using `lift`, it is recommended that you create an\n * operator by simply returning `new Observable()` directly. 
See \"Creating new operators from\n * scratch\" section here: https://rxjs.dev/guide/operators\n */\n lift(operator?: Operator): Observable {\n const observable = new Observable();\n observable.source = this;\n observable.operator = operator;\n return observable;\n }\n\n subscribe(observerOrNext?: Partial> | ((value: T) => void)): Subscription;\n /** @deprecated Instead of passing separate callback arguments, use an observer argument. Signatures taking separate callback arguments will be removed in v8. Details: https://rxjs.dev/deprecations/subscribe-arguments */\n subscribe(next?: ((value: T) => void) | null, error?: ((error: any) => void) | null, complete?: (() => void) | null): Subscription;\n /**\n * Invokes an execution of an Observable and registers Observer handlers for notifications it will emit.\n *\n * Use it when you have all these Observables, but still nothing is happening.\n *\n * `subscribe` is not a regular operator, but a method that calls Observable's internal `subscribe` function. It\n * might be for example a function that you passed to Observable's constructor, but most of the time it is\n * a library implementation, which defines what will be emitted by an Observable, and when it be will emitted. This means\n * that calling `subscribe` is actually the moment when Observable starts its work, not when it is created, as it is often\n * the thought.\n *\n * Apart from starting the execution of an Observable, this method allows you to listen for values\n * that an Observable emits, as well as for when it completes or errors. You can achieve this in two\n * of the following ways.\n *\n * The first way is creating an object that implements {@link Observer} interface. It should have methods\n * defined by that interface, but note that it should be just a regular JavaScript object, which you can create\n * yourself in any way you want (ES6 class, classic function constructor, object literal etc.). 
In particular, do\n * not attempt to use any RxJS implementation details to create Observers - you don't need them. Remember also\n * that your object does not have to implement all methods. If you find yourself creating a method that doesn't\n * do anything, you can simply omit it. Note however, if the `error` method is not provided and an error happens,\n * it will be thrown asynchronously. Errors thrown asynchronously cannot be caught using `try`/`catch`. Instead,\n * use the {@link onUnhandledError} configuration option or use a runtime handler (like `window.onerror` or\n * `process.on('error)`) to be notified of unhandled errors. Because of this, it's recommended that you provide\n * an `error` method to avoid missing thrown errors.\n *\n * The second way is to give up on Observer object altogether and simply provide callback functions in place of its methods.\n * This means you can provide three functions as arguments to `subscribe`, where the first function is equivalent\n * of a `next` method, the second of an `error` method and the third of a `complete` method. Just as in case of an Observer,\n * if you do not need to listen for something, you can omit a function by passing `undefined` or `null`,\n * since `subscribe` recognizes these functions by where they were placed in function call. When it comes\n * to the `error` function, as with an Observer, if not provided, errors emitted by an Observable will be thrown asynchronously.\n *\n * You can, however, subscribe with no parameters at all. This may be the case where you're not interested in terminal events\n * and you also handled emissions internally by using operators (e.g. using `tap`).\n *\n * Whichever style of calling `subscribe` you use, in both cases it returns a Subscription object.\n * This object allows you to call `unsubscribe` on it, which in turn will stop the work that an Observable does and will clean\n * up all resources that an Observable used. 
Note that cancelling a subscription will not call `complete` callback\n * provided to `subscribe` function, which is reserved for a regular completion signal that comes from an Observable.\n *\n * Remember that callbacks provided to `subscribe` are not guaranteed to be called asynchronously.\n * It is an Observable itself that decides when these functions will be called. For example {@link of}\n * by default emits all its values synchronously. Always check documentation for how given Observable\n * will behave when subscribed and if its default behavior can be modified with a `scheduler`.\n *\n * #### Examples\n *\n * Subscribe with an {@link guide/observer Observer}\n *\n * ```ts\n * import { of } from 'rxjs';\n *\n * const sumObserver = {\n * sum: 0,\n * next(value) {\n * console.log('Adding: ' + value);\n * this.sum = this.sum + value;\n * },\n * error() {\n * // We actually could just remove this method,\n * // since we do not really care about errors right now.\n * },\n * complete() {\n * console.log('Sum equals: ' + this.sum);\n * }\n * };\n *\n * of(1, 2, 3) // Synchronously emits 1, 2, 3 and then completes.\n * .subscribe(sumObserver);\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Subscribe with functions ({@link deprecations/subscribe-arguments deprecated})\n *\n * ```ts\n * import { of } from 'rxjs'\n *\n * let sum = 0;\n *\n * of(1, 2, 3).subscribe(\n * value => {\n * console.log('Adding: ' + value);\n * sum = sum + value;\n * },\n * undefined,\n * () => console.log('Sum equals: ' + sum)\n * );\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Cancel a subscription\n *\n * ```ts\n * import { interval } from 'rxjs';\n *\n * const subscription = interval(1000).subscribe({\n * next(num) {\n * console.log(num)\n * },\n * complete() {\n * // Will not be called, even when cancelling subscription.\n * console.log('completed!');\n * 
}\n * });\n *\n * setTimeout(() => {\n * subscription.unsubscribe();\n * console.log('unsubscribed!');\n * }, 2500);\n *\n * // Logs:\n * // 0 after 1s\n * // 1 after 2s\n * // 'unsubscribed!' after 2.5s\n * ```\n *\n * @param {Observer|Function} observerOrNext (optional) Either an observer with methods to be called,\n * or the first of three possible handlers, which is the handler for each value emitted from the subscribed\n * Observable.\n * @param {Function} error (optional) A handler for a terminal event resulting from an error. If no error handler is provided,\n * the error will be thrown asynchronously as unhandled.\n * @param {Function} complete (optional) A handler for a terminal event resulting from successful completion.\n * @return {Subscription} a subscription reference to the registered handlers\n * @method subscribe\n */\n subscribe(\n observerOrNext?: Partial> | ((value: T) => void) | null,\n error?: ((error: any) => void) | null,\n complete?: (() => void) | null\n ): Subscription {\n const subscriber = isSubscriber(observerOrNext) ? observerOrNext : new SafeSubscriber(observerOrNext, error, complete);\n\n errorContext(() => {\n const { operator, source } = this;\n subscriber.add(\n operator\n ? // We're dealing with a subscription in the\n // operator chain to one of our lifted operators.\n operator.call(subscriber, source)\n : source\n ? // If `source` has a value, but `operator` does not, something that\n // had intimate knowledge of our API, like our `Subject`, must have\n // set it. 
We're going to just call `_subscribe` directly.\n this._subscribe(subscriber)\n : // In all other cases, we're likely wrapping a user-provided initializer\n // function, so we need to catch errors and handle them appropriately.\n this._trySubscribe(subscriber)\n );\n });\n\n return subscriber;\n }\n\n /** @internal */\n protected _trySubscribe(sink: Subscriber): TeardownLogic {\n try {\n return this._subscribe(sink);\n } catch (err) {\n // We don't need to return anything in this case,\n // because it's just going to try to `add()` to a subscription\n // above.\n sink.error(err);\n }\n }\n\n /**\n * Used as a NON-CANCELLABLE means of subscribing to an observable, for use with\n * APIs that expect promises, like `async/await`. You cannot unsubscribe from this.\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. 
To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * #### Example\n *\n * ```ts\n * import { interval, take } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(4));\n *\n * async function getTotal() {\n * let total = 0;\n *\n * await source$.forEach(value => {\n * total += value;\n * console.log('observable -> ' + value);\n * });\n *\n * return total;\n * }\n *\n * getTotal().then(\n * total => console.log('Total: ' + total)\n * );\n *\n * // Expected:\n * // 'observable -> 0'\n * // 'observable -> 1'\n * // 'observable -> 2'\n * // 'observable -> 3'\n * // 'Total: 6'\n * ```\n *\n * @param next a handler for each value emitted by the observable\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n */\n forEach(next: (value: T) => void): Promise;\n\n /**\n * @param next a handler for each value emitted by the observable\n * @param promiseCtor a constructor function used to instantiate the Promise\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n * @deprecated Passing a Promise constructor will no longer be available\n * in upcoming versions of RxJS. This is because it adds weight to the library, for very\n * little benefit. If you need this functionality, it is recommended that you either\n * polyfill Promise, or you create an adapter to convert the returned native promise\n * to whatever promise implementation you wanted. 
Will be removed in v8.\n */\n forEach(next: (value: T) => void, promiseCtor: PromiseConstructorLike): Promise;\n\n forEach(next: (value: T) => void, promiseCtor?: PromiseConstructorLike): Promise {\n promiseCtor = getPromiseCtor(promiseCtor);\n\n return new promiseCtor((resolve, reject) => {\n const subscriber = new SafeSubscriber({\n next: (value) => {\n try {\n next(value);\n } catch (err) {\n reject(err);\n subscriber.unsubscribe();\n }\n },\n error: reject,\n complete: resolve,\n });\n this.subscribe(subscriber);\n }) as Promise;\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): TeardownLogic {\n return this.source?.subscribe(subscriber);\n }\n\n /**\n * An interop point defined by the es7-observable spec https://github.com/zenparsing/es-observable\n * @method Symbol.observable\n * @return {Observable} this instance of the observable\n */\n [Symbol_observable]() {\n return this;\n }\n\n /* tslint:disable:max-line-length */\n pipe(): Observable;\n pipe(op1: OperatorFunction): Observable;\n pipe(op1: OperatorFunction, op2: OperatorFunction): Observable;\n pipe(op1: OperatorFunction, op2: OperatorFunction, op3: OperatorFunction): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: 
OperatorFunction,\n op8: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction,\n op9: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction,\n op9: OperatorFunction,\n ...operations: OperatorFunction[]\n ): Observable;\n /* tslint:enable:max-line-length */\n\n /**\n * Used to stitch together functional operators into a chain.\n * @method pipe\n * @return {Observable} the Observable result of all of the operators having\n * been called in the order they were passed in.\n *\n * ## Example\n *\n * ```ts\n * import { interval, filter, map, scan } from 'rxjs';\n *\n * interval(1000)\n * .pipe(\n * filter(x => x % 2 === 0),\n * map(x => x + x),\n * scan((acc, x) => acc + x)\n * )\n * .subscribe(x => console.log(x));\n * ```\n */\n pipe(...operations: OperatorFunction[]): Observable {\n return pipeFromArray(operations)(this);\n }\n\n /* tslint:disable:max-line-length */\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(): Promise;\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(PromiseCtor: typeof Promise): Promise;\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. 
Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(PromiseCtor: PromiseConstructorLike): Promise;\n /* tslint:enable:max-line-length */\n\n /**\n * Subscribe to this Observable and get a Promise resolving on\n * `complete` with the last emission (if any).\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * @method toPromise\n * @param [promiseCtor] a constructor function used to instantiate\n * the Promise\n * @return A Promise that resolves with the last value emit, or\n * rejects on an error. If there were no emissions, Promise\n * resolves with undefined.\n * @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise\n */\n toPromise(promiseCtor?: PromiseConstructorLike): Promise {\n promiseCtor = getPromiseCtor(promiseCtor);\n\n return new promiseCtor((resolve, reject) => {\n let value: T | undefined;\n this.subscribe(\n (x: T) => (value = x),\n (err: any) => reject(err),\n () => resolve(value)\n );\n }) as Promise;\n }\n}\n\n/**\n * Decides between a passed promise constructor from consuming code,\n * A default configured promise constructor, and the native promise\n * constructor and returns it. If nothing can be found, it will throw\n * an error.\n * @param promiseCtor The optional promise constructor to passed by consuming code\n */\nfunction getPromiseCtor(promiseCtor: PromiseConstructorLike | undefined) {\n return promiseCtor ?? config.Promise ?? 
Promise;\n}\n\nfunction isObserver(value: any): value is Observer {\n return value && isFunction(value.next) && isFunction(value.error) && isFunction(value.complete);\n}\n\nfunction isSubscriber(value: any): value is Subscriber {\n return (value && value instanceof Subscriber) || (isObserver(value) && isSubscription(value));\n}\n", "import { Observable } from '../Observable';\nimport { Subscriber } from '../Subscriber';\nimport { OperatorFunction } from '../types';\nimport { isFunction } from './isFunction';\n\n/**\n * Used to determine if an object is an Observable with a lift function.\n */\nexport function hasLift(source: any): source is { lift: InstanceType['lift'] } {\n return isFunction(source?.lift);\n}\n\n/**\n * Creates an `OperatorFunction`. Used to define operators throughout the library in a concise way.\n * @param init The logic to connect the liftedSource to the subscriber at the moment of subscription.\n */\nexport function operate(\n init: (liftedSource: Observable, subscriber: Subscriber) => (() => void) | void\n): OperatorFunction {\n return (source: Observable) => {\n if (hasLift(source)) {\n return source.lift(function (this: Subscriber, liftedSource: Observable) {\n try {\n return init(liftedSource, this);\n } catch (err) {\n this.error(err);\n }\n });\n }\n throw new TypeError('Unable to lift unknown Observable type');\n };\n}\n", "import { Subscriber } from '../Subscriber';\n\n/**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. 
Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional teardown logic here. This will only be called on teardown if the\n * subscriber itself is not already closed. This is called after all other teardown logic is executed.\n */\nexport function createOperatorSubscriber(\n destination: Subscriber,\n onNext?: (value: T) => void,\n onComplete?: () => void,\n onError?: (err: any) => void,\n onFinalize?: () => void\n): Subscriber {\n return new OperatorSubscriber(destination, onNext, onComplete, onError, onFinalize);\n}\n\n/**\n * A generic helper for allowing operators to be created with a Subscriber and\n * use closures to capture necessary state from the operator function itself.\n */\nexport class OperatorSubscriber extends Subscriber {\n /**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional finalization logic here. This will only be called on finalization if the\n * subscriber itself is not already closed. 
This is called after all other finalization logic is executed.\n * @param shouldUnsubscribe An optional check to see if an unsubscribe call should truly unsubscribe.\n * NOTE: This currently **ONLY** exists to support the strange behavior of {@link groupBy}, where unsubscription\n * to the resulting observable does not actually disconnect from the source if there are active subscriptions\n * to any grouped observable. (DO NOT EXPOSE OR USE EXTERNALLY!!!)\n */\n constructor(\n destination: Subscriber,\n onNext?: (value: T) => void,\n onComplete?: () => void,\n onError?: (err: any) => void,\n private onFinalize?: () => void,\n private shouldUnsubscribe?: () => boolean\n ) {\n // It's important - for performance reasons - that all of this class's\n // members are initialized and that they are always initialized in the same\n // order. This will ensure that all OperatorSubscriber instances have the\n // same hidden class in V8. This, in turn, will help keep the number of\n // hidden classes involved in property accesses within the base class as\n // low as possible. If the number of hidden classes involved exceeds four,\n // the property accesses will become megamorphic and performance penalties\n // will be incurred - i.e. inline caches won't be used.\n //\n // The reasons for ensuring all instances have the same hidden class are\n // further discussed in this blog post from Benedikt Meurer:\n // https://benediktmeurer.de/2018/03/23/impact-of-polymorphism-on-component-based-frameworks-like-react/\n super(destination);\n this._next = onNext\n ? function (this: OperatorSubscriber, value: T) {\n try {\n onNext(value);\n } catch (err) {\n destination.error(err);\n }\n }\n : super._next;\n this._error = onError\n ? 
function (this: OperatorSubscriber, err: any) {\n try {\n onError(err);\n } catch (err) {\n // Send any errors that occur down stream.\n destination.error(err);\n } finally {\n // Ensure finalization.\n this.unsubscribe();\n }\n }\n : super._error;\n this._complete = onComplete\n ? function (this: OperatorSubscriber) {\n try {\n onComplete();\n } catch (err) {\n // Send any errors that occur down stream.\n destination.error(err);\n } finally {\n // Ensure finalization.\n this.unsubscribe();\n }\n }\n : super._complete;\n }\n\n unsubscribe() {\n if (!this.shouldUnsubscribe || this.shouldUnsubscribe()) {\n const { closed } = this;\n super.unsubscribe();\n // Execute additional teardown if we have any and we didn't already do so.\n !closed && this.onFinalize?.();\n }\n }\n}\n", "import { Subscription } from '../Subscription';\n\ninterface AnimationFrameProvider {\n schedule(callback: FrameRequestCallback): Subscription;\n requestAnimationFrame: typeof requestAnimationFrame;\n cancelAnimationFrame: typeof cancelAnimationFrame;\n delegate:\n | {\n requestAnimationFrame: typeof requestAnimationFrame;\n cancelAnimationFrame: typeof cancelAnimationFrame;\n }\n | undefined;\n}\n\nexport const animationFrameProvider: AnimationFrameProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n schedule(callback) {\n let request = requestAnimationFrame;\n let cancel: typeof cancelAnimationFrame | undefined = cancelAnimationFrame;\n const { delegate } = animationFrameProvider;\n if (delegate) {\n request = delegate.requestAnimationFrame;\n cancel = delegate.cancelAnimationFrame;\n }\n const handle = request((timestamp) => {\n // Clear the cancel function. 
The request has been fulfilled, so\n // attempting to cancel the request upon unsubscription would be\n // pointless.\n cancel = undefined;\n callback(timestamp);\n });\n return new Subscription(() => cancel?.(handle));\n },\n requestAnimationFrame(...args) {\n const { delegate } = animationFrameProvider;\n return (delegate?.requestAnimationFrame || requestAnimationFrame)(...args);\n },\n cancelAnimationFrame(...args) {\n const { delegate } = animationFrameProvider;\n return (delegate?.cancelAnimationFrame || cancelAnimationFrame)(...args);\n },\n delegate: undefined,\n};\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface ObjectUnsubscribedError extends Error {}\n\nexport interface ObjectUnsubscribedErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (): ObjectUnsubscribedError;\n}\n\n/**\n * An error thrown when an action is invalid because the object has been\n * unsubscribed.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n *\n * @class ObjectUnsubscribedError\n */\nexport const ObjectUnsubscribedError: ObjectUnsubscribedErrorCtor = createErrorClass(\n (_super) =>\n function ObjectUnsubscribedErrorImpl(this: any) {\n _super(this);\n this.name = 'ObjectUnsubscribedError';\n this.message = 'object unsubscribed';\n }\n);\n", "import { Operator } from './Operator';\nimport { Observable } from './Observable';\nimport { Subscriber } from './Subscriber';\nimport { Subscription, EMPTY_SUBSCRIPTION } from './Subscription';\nimport { Observer, SubscriptionLike, TeardownLogic } from './types';\nimport { ObjectUnsubscribedError } from './util/ObjectUnsubscribedError';\nimport { arrRemove } from './util/arrRemove';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A Subject is a special type of Observable that allows values to be\n * multicasted to many Observers. 
Subjects are like EventEmitters.\n *\n * Every Subject is an Observable and an Observer. You can subscribe to a\n * Subject, and you can call next to feed values as well as error and complete.\n */\nexport class Subject extends Observable implements SubscriptionLike {\n closed = false;\n\n private currentObservers: Observer[] | null = null;\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n observers: Observer[] = [];\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n isStopped = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n hasError = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n thrownError: any = null;\n\n /**\n * Creates a \"subject\" by basically gluing an observer to an observable.\n *\n * @nocollapse\n * @deprecated Recommended you do not use. Will be removed at some point in the future. Plans for replacement still under discussion.\n */\n static create: (...args: any[]) => any = (destination: Observer, source: Observable): AnonymousSubject => {\n return new AnonymousSubject(destination, source);\n };\n\n constructor() {\n // NOTE: This must be here to obscure Observable's constructor.\n super();\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. 
*/\n lift(operator: Operator): Observable {\n const subject = new AnonymousSubject(this, this);\n subject.operator = operator as any;\n return subject as any;\n }\n\n /** @internal */\n protected _throwIfClosed() {\n if (this.closed) {\n throw new ObjectUnsubscribedError();\n }\n }\n\n next(value: T) {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n if (!this.currentObservers) {\n this.currentObservers = Array.from(this.observers);\n }\n for (const observer of this.currentObservers) {\n observer.next(value);\n }\n }\n });\n }\n\n error(err: any) {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n this.hasError = this.isStopped = true;\n this.thrownError = err;\n const { observers } = this;\n while (observers.length) {\n observers.shift()!.error(err);\n }\n }\n });\n }\n\n complete() {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n this.isStopped = true;\n const { observers } = this;\n while (observers.length) {\n observers.shift()!.complete();\n }\n }\n });\n }\n\n unsubscribe() {\n this.isStopped = this.closed = true;\n this.observers = this.currentObservers = null!;\n }\n\n get observed() {\n return this.observers?.length > 0;\n }\n\n /** @internal */\n protected _trySubscribe(subscriber: Subscriber): TeardownLogic {\n this._throwIfClosed();\n return super._trySubscribe(subscriber);\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n this._throwIfClosed();\n this._checkFinalizedStatuses(subscriber);\n return this._innerSubscribe(subscriber);\n }\n\n /** @internal */\n protected _innerSubscribe(subscriber: Subscriber) {\n const { hasError, isStopped, observers } = this;\n if (hasError || isStopped) {\n return EMPTY_SUBSCRIPTION;\n }\n this.currentObservers = null;\n observers.push(subscriber);\n return new Subscription(() => {\n this.currentObservers = null;\n arrRemove(observers, subscriber);\n });\n }\n\n /** @internal */\n protected 
_checkFinalizedStatuses(subscriber: Subscriber) {\n const { hasError, thrownError, isStopped } = this;\n if (hasError) {\n subscriber.error(thrownError);\n } else if (isStopped) {\n subscriber.complete();\n }\n }\n\n /**\n * Creates a new Observable with this Subject as the source. You can do this\n * to create custom Observer-side logic of the Subject and conceal it from\n * code that uses the Observable.\n * @return {Observable} Observable that the Subject casts to\n */\n asObservable(): Observable {\n const observable: any = new Observable();\n observable.source = this;\n return observable;\n }\n}\n\n/**\n * @class AnonymousSubject\n */\nexport class AnonymousSubject extends Subject {\n constructor(\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n public destination?: Observer,\n source?: Observable\n ) {\n super();\n this.source = source;\n }\n\n next(value: T) {\n this.destination?.next?.(value);\n }\n\n error(err: any) {\n this.destination?.error?.(err);\n }\n\n complete() {\n this.destination?.complete?.();\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n return this.source?.subscribe(subscriber) ?? 
EMPTY_SUBSCRIPTION;\n }\n}\n", "import { Subject } from './Subject';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\n\n/**\n * A variant of Subject that requires an initial value and emits its current\n * value whenever it is subscribed to.\n *\n * @class BehaviorSubject\n */\nexport class BehaviorSubject extends Subject {\n constructor(private _value: T) {\n super();\n }\n\n get value(): T {\n return this.getValue();\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n const subscription = super._subscribe(subscriber);\n !subscription.closed && subscriber.next(this._value);\n return subscription;\n }\n\n getValue(): T {\n const { hasError, thrownError, _value } = this;\n if (hasError) {\n throw thrownError;\n }\n this._throwIfClosed();\n return _value;\n }\n\n next(value: T): void {\n super.next((this._value = value));\n }\n}\n", "import { TimestampProvider } from '../types';\n\ninterface DateTimestampProvider extends TimestampProvider {\n delegate: TimestampProvider | undefined;\n}\n\nexport const dateTimestampProvider: DateTimestampProvider = {\n now() {\n // Use the variable rather than `this` so that the function can be called\n // without being bound to the provider.\n return (dateTimestampProvider.delegate || Date).now();\n },\n delegate: undefined,\n};\n", "import { Subject } from './Subject';\nimport { TimestampProvider } from './types';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * A variant of {@link Subject} that \"replays\" old values to new subscribers by emitting them when they first subscribe.\n *\n * `ReplaySubject` has an internal buffer that will store a specified number of values that it has observed. Like `Subject`,\n * `ReplaySubject` \"observes\" values by having them passed to its `next` method. 
When it observes a value, it will store that\n * value for a time determined by the configuration of the `ReplaySubject`, as passed to its constructor.\n *\n * When a new subscriber subscribes to the `ReplaySubject` instance, it will synchronously emit all values in its buffer in\n * a First-In-First-Out (FIFO) manner. The `ReplaySubject` will also complete, if it has observed completion; and it will\n * error if it has observed an error.\n *\n * There are two main configuration items to be concerned with:\n *\n * 1. `bufferSize` - This will determine how many items are stored in the buffer, defaults to infinite.\n * 2. `windowTime` - The amount of time to hold a value in the buffer before removing it from the buffer.\n *\n * Both configurations may exist simultaneously. So if you would like to buffer a maximum of 3 values, as long as the values\n * are less than 2 seconds old, you could do so with a `new ReplaySubject(3, 2000)`.\n *\n * ### Differences with BehaviorSubject\n *\n * `BehaviorSubject` is similar to `new ReplaySubject(1)`, with a couple of exceptions:\n *\n * 1. `BehaviorSubject` comes \"primed\" with a single value upon construction.\n * 2. `ReplaySubject` will replay values, even after observing an error, where `BehaviorSubject` will not.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n * @see {@link shareReplay}\n */\nexport class ReplaySubject extends Subject {\n private _buffer: (T | number)[] = [];\n private _infiniteTimeWindow = true;\n\n /**\n * @param bufferSize The size of the buffer to replay on subscription\n * @param windowTime The amount of time the buffered items will stay buffered\n * @param timestampProvider An object with a `now()` method that provides the current timestamp. 
This is used to\n * calculate the amount of time something has been buffered.\n */\n constructor(\n private _bufferSize = Infinity,\n private _windowTime = Infinity,\n private _timestampProvider: TimestampProvider = dateTimestampProvider\n ) {\n super();\n this._infiniteTimeWindow = _windowTime === Infinity;\n this._bufferSize = Math.max(1, _bufferSize);\n this._windowTime = Math.max(1, _windowTime);\n }\n\n next(value: T): void {\n const { isStopped, _buffer, _infiniteTimeWindow, _timestampProvider, _windowTime } = this;\n if (!isStopped) {\n _buffer.push(value);\n !_infiniteTimeWindow && _buffer.push(_timestampProvider.now() + _windowTime);\n }\n this._trimBuffer();\n super.next(value);\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n this._throwIfClosed();\n this._trimBuffer();\n\n const subscription = this._innerSubscribe(subscriber);\n\n const { _infiniteTimeWindow, _buffer } = this;\n // We use a copy here, so reentrant code does not mutate our array while we're\n // emitting it to a new subscriber.\n const copy = _buffer.slice();\n for (let i = 0; i < copy.length && !subscriber.closed; i += _infiniteTimeWindow ? 1 : 2) {\n subscriber.next(copy[i] as T);\n }\n\n this._checkFinalizedStatuses(subscriber);\n\n return subscription;\n }\n\n private _trimBuffer() {\n const { _bufferSize, _timestampProvider, _buffer, _infiniteTimeWindow } = this;\n // If we don't have an infinite buffer size, and we're over the length,\n // use splice to truncate the old buffer values off. Note that we have to\n // double the size for instances where we're not using an infinite time window\n // because we're storing the values and the timestamps in the same array.\n const adjustedBufferSize = (_infiniteTimeWindow ? 
1 : 2) * _bufferSize;\n _bufferSize < Infinity && adjustedBufferSize < _buffer.length && _buffer.splice(0, _buffer.length - adjustedBufferSize);\n\n // Now, if we're not in an infinite time window, remove all values where the time is\n // older than what is allowed.\n if (!_infiniteTimeWindow) {\n const now = _timestampProvider.now();\n let last = 0;\n // Search the array for the first timestamp that isn't expired and\n // truncate the buffer up to that point.\n for (let i = 1; i < _buffer.length && (_buffer[i] as number) <= now; i += 2) {\n last = i;\n }\n last && _buffer.splice(0, last + 1);\n }\n }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Subscription } from '../Subscription';\nimport { SchedulerAction } from '../types';\n\n/**\n * A unit of work to be executed in a `scheduler`. An action is typically\n * created from within a {@link SchedulerLike} and an RxJS user does not need to concern\n * themselves about creating and manipulating an Action.\n *\n * ```ts\n * class Action extends Subscription {\n * new (scheduler: Scheduler, work: (state?: T) => void);\n * schedule(state?: T, delay: number = 0): Subscription;\n * }\n * ```\n *\n * @class Action\n */\nexport class Action extends Subscription {\n constructor(scheduler: Scheduler, work: (this: SchedulerAction, state?: T) => void) {\n super();\n }\n /**\n * Schedules this action on its parent {@link SchedulerLike} for execution. May be passed\n * some context object, `state`. 
May happen at some point in the future,\n * according to the `delay` parameter, if specified.\n * @param {T} [state] Some contextual data that the `work` function uses when\n * called by the Scheduler.\n * @param {number} [delay] Time to wait before executing the work, where the\n * time unit is implicit and defined by the Scheduler.\n * @return {void}\n */\n public schedule(state?: T, delay: number = 0): Subscription {\n return this;\n }\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetIntervalFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearIntervalFunction = (handle: TimerHandle) => void;\n\ninterface IntervalProvider {\n setInterval: SetIntervalFunction;\n clearInterval: ClearIntervalFunction;\n delegate:\n | {\n setInterval: SetIntervalFunction;\n clearInterval: ClearIntervalFunction;\n }\n | undefined;\n}\n\nexport const intervalProvider: IntervalProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setInterval(handler: () => void, timeout?: number, ...args) {\n const { delegate } = intervalProvider;\n if (delegate?.setInterval) {\n return delegate.setInterval(handler, timeout, ...args);\n }\n return setInterval(handler, timeout, ...args);\n },\n clearInterval(handle) {\n const { delegate } = intervalProvider;\n return (delegate?.clearInterval || clearInterval)(handle as any);\n },\n delegate: undefined,\n};\n", "import { Action } from './Action';\nimport { SchedulerAction } from '../types';\nimport { Subscription } from '../Subscription';\nimport { AsyncScheduler } from './AsyncScheduler';\nimport { intervalProvider } from './intervalProvider';\nimport { arrRemove } from '../util/arrRemove';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncAction extends Action {\n public id: TimerHandle | undefined;\n public state?: T;\n // @ts-ignore: Property has no initializer and is 
not definitely assigned\n public delay: number;\n protected pending: boolean = false;\n\n constructor(protected scheduler: AsyncScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n public schedule(state?: T, delay: number = 0): Subscription {\n if (this.closed) {\n return this;\n }\n\n // Always replace the current state with the new state.\n this.state = state;\n\n const id = this.id;\n const scheduler = this.scheduler;\n\n //\n // Important implementation note:\n //\n // Actions only execute once by default, unless rescheduled from within the\n // scheduled callback. This allows us to implement single and repeat\n // actions via the same code path, without adding API surface area, as well\n // as mimic traditional recursion but across asynchronous boundaries.\n //\n // However, JS runtimes and timers distinguish between intervals achieved by\n // serial `setTimeout` calls vs. a single `setInterval` call. An interval of\n // serial `setTimeout` calls can be individually delayed, which delays\n // scheduling the next `setTimeout`, and so on. `setInterval` attempts to\n // guarantee the interval callback will be invoked more precisely to the\n // interval period, regardless of load.\n //\n // Therefore, we use `setInterval` to schedule single and repeat actions.\n // If the action reschedules itself with the same delay, the interval is not\n // canceled. If the action doesn't reschedule, or reschedules with a\n // different delay, the interval will be canceled after scheduled callback\n // execution.\n //\n if (id != null) {\n this.id = this.recycleAsyncId(scheduler, id, delay);\n }\n\n // Set the pending flag indicating that this action has been scheduled, or\n // has recursively rescheduled itself.\n this.pending = true;\n\n this.delay = delay;\n // If this action has already an async Id, don't request a new one.\n this.id = this.id ?? 
this.requestAsyncId(scheduler, this.id, delay);\n\n return this;\n }\n\n protected requestAsyncId(scheduler: AsyncScheduler, _id?: TimerHandle, delay: number = 0): TimerHandle {\n return intervalProvider.setInterval(scheduler.flush.bind(scheduler, this), delay);\n }\n\n protected recycleAsyncId(_scheduler: AsyncScheduler, id?: TimerHandle, delay: number | null = 0): TimerHandle | undefined {\n // If this action is rescheduled with the same delay time, don't clear the interval id.\n if (delay != null && this.delay === delay && this.pending === false) {\n return id;\n }\n // Otherwise, if the action's delay time is different from the current delay,\n // or the action has been rescheduled before it's executed, clear the interval id\n if (id != null) {\n intervalProvider.clearInterval(id);\n }\n\n return undefined;\n }\n\n /**\n * Immediately executes this action and the `work` it contains.\n * @return {any}\n */\n public execute(state: T, delay: number): any {\n if (this.closed) {\n return new Error('executing a cancelled action');\n }\n\n this.pending = false;\n const error = this._execute(state, delay);\n if (error) {\n return error;\n } else if (this.pending === false && this.id != null) {\n // Dequeue if the action didn't reschedule itself. Don't call\n // unsubscribe(), because the action could reschedule later.\n // For example:\n // ```\n // scheduler.schedule(function doWork(counter) {\n // /* ... I'm a busy worker bee ... 
*/\n // var originalAction = this;\n // /* wait 100ms before rescheduling the action */\n // setTimeout(function () {\n // originalAction.schedule(counter + 1);\n // }, 100);\n // }, 1000);\n // ```\n this.id = this.recycleAsyncId(this.scheduler, this.id, null);\n }\n }\n\n protected _execute(state: T, _delay: number): any {\n let errored: boolean = false;\n let errorValue: any;\n try {\n this.work(state);\n } catch (e) {\n errored = true;\n // HACK: Since code elsewhere is relying on the \"truthiness\" of the\n // return here, we can't have it return \"\" or 0 or false.\n // TODO: Clean this up when we refactor schedulers mid-version-8 or so.\n errorValue = e ? e : new Error('Scheduled action threw falsy error');\n }\n if (errored) {\n this.unsubscribe();\n return errorValue;\n }\n }\n\n unsubscribe() {\n if (!this.closed) {\n const { id, scheduler } = this;\n const { actions } = scheduler;\n\n this.work = this.state = this.scheduler = null!;\n this.pending = false;\n\n arrRemove(actions, this);\n if (id != null) {\n this.id = this.recycleAsyncId(scheduler, id, null);\n }\n\n this.delay = null!;\n super.unsubscribe();\n }\n }\n}\n", "import { Action } from './scheduler/Action';\nimport { Subscription } from './Subscription';\nimport { SchedulerLike, SchedulerAction } from './types';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * An execution context and a data structure to order tasks and schedule their\n * execution. Provides a notion of (potentially virtual) time, through the\n * `now()` getter method.\n *\n * Each unit of work in a Scheduler is called an `Action`.\n *\n * ```ts\n * class Scheduler {\n * now(): number;\n * schedule(work, delay?, state?): Subscription;\n * }\n * ```\n *\n * @class Scheduler\n * @deprecated Scheduler is an internal implementation detail of RxJS, and\n * should not be used directly. Rather, create your own class and implement\n * {@link SchedulerLike}. 
Will be made internal in v8.\n */\nexport class Scheduler implements SchedulerLike {\n public static now: () => number = dateTimestampProvider.now;\n\n constructor(private schedulerActionCtor: typeof Action, now: () => number = Scheduler.now) {\n this.now = now;\n }\n\n /**\n * A getter method that returns a number representing the current time\n * (at the time this function was called) according to the scheduler's own\n * internal clock.\n * @return {number} A number that represents the current time. May or may not\n * have a relation to wall-clock time. May or may not refer to a time unit\n * (e.g. milliseconds).\n */\n public now: () => number;\n\n /**\n * Schedules a function, `work`, for execution. May happen at some point in\n * the future, according to the `delay` parameter, if specified. May be passed\n * some context object, `state`, which will be passed to the `work` function.\n *\n * The given arguments will be processed an stored as an Action object in a\n * queue of actions.\n *\n * @param {function(state: ?T): ?Subscription} work A function representing a\n * task, or some unit of work to be executed by the Scheduler.\n * @param {number} [delay] Time to wait before executing the work, where the\n * time unit is implicit and defined by the Scheduler itself.\n * @param {T} [state] Some contextual data that the `work` function uses when\n * called by the Scheduler.\n * @return {Subscription} A subscription in order to be able to unsubscribe\n * the scheduled work.\n */\n public schedule(work: (this: SchedulerAction, state?: T) => void, delay: number = 0, state?: T): Subscription {\n return new this.schedulerActionCtor(this, work).schedule(state, delay);\n }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Action } from './Action';\nimport { AsyncAction } from './AsyncAction';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncScheduler extends Scheduler {\n public actions: Array> = [];\n /**\n * A flag to indicate whether the 
Scheduler is currently executing a batch of\n * queued actions.\n * @type {boolean}\n * @internal\n */\n public _active: boolean = false;\n /**\n * An internal ID used to track the latest asynchronous task such as those\n * coming from `setTimeout`, `setInterval`, `requestAnimationFrame`, and\n * others.\n * @type {any}\n * @internal\n */\n public _scheduled: TimerHandle | undefined;\n\n constructor(SchedulerAction: typeof Action, now: () => number = Scheduler.now) {\n super(SchedulerAction, now);\n }\n\n public flush(action: AsyncAction): void {\n const { actions } = this;\n\n if (this._active) {\n actions.push(action);\n return;\n }\n\n let error: any;\n this._active = true;\n\n do {\n if ((error = action.execute(action.state, action.delay))) {\n break;\n }\n } while ((action = actions.shift()!)); // exhaust the scheduler queue\n\n this._active = false;\n\n if (error) {\n while ((action = actions.shift()!)) {\n action.unsubscribe();\n }\n throw error;\n }\n }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\n/**\n *\n * Async Scheduler\n *\n * Schedule task as if you used setTimeout(task, duration)\n *\n * `async` scheduler schedules tasks asynchronously, by putting them on the JavaScript\n * event loop queue. 
It is best used to delay tasks in time or to schedule tasks repeating\n * in intervals.\n *\n * If you just want to \"defer\" task, that is to perform it right after currently\n * executing synchronous code ends (commonly achieved by `setTimeout(deferredTask, 0)`),\n * better choice will be the {@link asapScheduler} scheduler.\n *\n * ## Examples\n * Use async scheduler to delay task\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * const task = () => console.log('it works!');\n *\n * asyncScheduler.schedule(task, 2000);\n *\n * // After 2 seconds logs:\n * // \"it works!\"\n * ```\n *\n * Use async scheduler to repeat task in intervals\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * function task(state) {\n * console.log(state);\n * this.schedule(state + 1, 1000); // `this` references currently executing Action,\n * // which we reschedule with new state and delay\n * }\n *\n * asyncScheduler.schedule(task, 3000, 0);\n *\n * // Logs:\n * // 0 after 3s\n * // 1 after 4s\n * // 2 after 5s\n * // 3 after 6s\n * ```\n */\n\nexport const asyncScheduler = new AsyncScheduler(AsyncAction);\n\n/**\n * @deprecated Renamed to {@link asyncScheduler}. Will be removed in v8.\n */\nexport const async = asyncScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { Subscription } from '../Subscription';\nimport { QueueScheduler } from './QueueScheduler';\nimport { SchedulerAction } from '../types';\nimport { TimerHandle } from './timerHandle';\n\nexport class QueueAction extends AsyncAction {\n constructor(protected scheduler: QueueScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n public schedule(state?: T, delay: number = 0): Subscription {\n if (delay > 0) {\n return super.schedule(state, delay);\n }\n this.delay = delay;\n this.state = state;\n this.scheduler.flush(this);\n return this;\n }\n\n public execute(state: T, delay: number): any {\n return delay > 0 || this.closed ? 
super.execute(state, delay) : this._execute(state, delay);\n }\n\n protected requestAsyncId(scheduler: QueueScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n // If delay exists and is greater than 0, or if the delay is null (the\n // action wasn't rescheduled) but was originally scheduled as an async\n // action, then recycle as an async action.\n\n if ((delay != null && delay > 0) || (delay == null && this.delay > 0)) {\n return super.requestAsyncId(scheduler, id, delay);\n }\n\n // Otherwise flush the scheduler starting with this action.\n scheduler.flush(this);\n\n // HACK: In the past, this was returning `void`. However, `void` isn't a valid\n // `TimerHandle`, and generally the return value here isn't really used. So the\n // compromise is to return `0` which is both \"falsy\" and a valid `TimerHandle`,\n // as opposed to refactoring every other instanceo of `requestAsyncId`.\n return 0;\n }\n}\n", "import { AsyncScheduler } from './AsyncScheduler';\n\nexport class QueueScheduler extends AsyncScheduler {\n}\n", "import { QueueAction } from './QueueAction';\nimport { QueueScheduler } from './QueueScheduler';\n\n/**\n *\n * Queue Scheduler\n *\n * Put every next task on a queue, instead of executing it immediately\n *\n * `queue` scheduler, when used with delay, behaves the same as {@link asyncScheduler} scheduler.\n *\n * When used without delay, it schedules given task synchronously - executes it right when\n * it is scheduled. 
However when called recursively, that is when inside the scheduled task,\n * another task is scheduled with queue scheduler, instead of executing immediately as well,\n * that task will be put on a queue and wait for current one to finish.\n *\n * This means that when you execute task with `queue` scheduler, you are sure it will end\n * before any other task scheduled with that scheduler will start.\n *\n * ## Examples\n * Schedule recursively first, then do something\n * ```ts\n * import { queueScheduler } from 'rxjs';\n *\n * queueScheduler.schedule(() => {\n * queueScheduler.schedule(() => console.log('second')); // will not happen now, but will be put on a queue\n *\n * console.log('first');\n * });\n *\n * // Logs:\n * // \"first\"\n * // \"second\"\n * ```\n *\n * Reschedule itself recursively\n * ```ts\n * import { queueScheduler } from 'rxjs';\n *\n * queueScheduler.schedule(function(state) {\n * if (state !== 0) {\n * console.log('before', state);\n * this.schedule(state - 1); // `this` references currently executing Action,\n * // which we reschedule with new state\n * console.log('after', state);\n * }\n * }, 0, 3);\n *\n * // In scheduler that runs recursively, you would expect:\n * // \"before\", 3\n * // \"before\", 2\n * // \"before\", 1\n * // \"after\", 1\n * // \"after\", 2\n * // \"after\", 3\n *\n * // But with queue it logs:\n * // \"before\", 3\n * // \"after\", 3\n * // \"before\", 2\n * // \"after\", 2\n * // \"before\", 1\n * // \"after\", 1\n * ```\n */\n\nexport const queueScheduler = new QueueScheduler(QueueAction);\n\n/**\n * @deprecated Renamed to {@link queueScheduler}. 
Will be removed in v8.\n */\nexport const queue = queueScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\nimport { SchedulerAction } from '../types';\nimport { animationFrameProvider } from './animationFrameProvider';\nimport { TimerHandle } from './timerHandle';\n\nexport class AnimationFrameAction extends AsyncAction {\n constructor(protected scheduler: AnimationFrameScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n protected requestAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n // If delay is greater than 0, request as an async action.\n if (delay !== null && delay > 0) {\n return super.requestAsyncId(scheduler, id, delay);\n }\n // Push the action to the end of the scheduler queue.\n scheduler.actions.push(this);\n // If an animation frame has already been requested, don't request another\n // one. If an animation frame hasn't been requested yet, request one. Return\n // the current animation frame request id.\n return scheduler._scheduled || (scheduler._scheduled = animationFrameProvider.requestAnimationFrame(() => scheduler.flush(undefined)));\n }\n\n protected recycleAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle | undefined {\n // If delay exists and is greater than 0, or if the delay is null (the\n // action wasn't rescheduled) but was originally scheduled as an async\n // action, then recycle as an async action.\n if (delay != null ? 
delay > 0 : this.delay > 0) {\n return super.recycleAsyncId(scheduler, id, delay);\n }\n // If the scheduler queue has no remaining actions with the same async id,\n // cancel the requested animation frame and set the scheduled flag to\n // undefined so the next AnimationFrameAction will request its own.\n const { actions } = scheduler;\n if (id != null && actions[actions.length - 1]?.id !== id) {\n animationFrameProvider.cancelAnimationFrame(id as number);\n scheduler._scheduled = undefined;\n }\n // Return undefined so the action knows to request a new async id if it's rescheduled.\n return undefined;\n }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\nexport class AnimationFrameScheduler extends AsyncScheduler {\n public flush(action?: AsyncAction): void {\n this._active = true;\n // The async id that effects a call to flush is stored in _scheduled.\n // Before executing an action, it's necessary to check the action's async\n // id to determine whether it's supposed to be executed in the current\n // flush.\n // Previous implementations of this method used a count to determine this,\n // but that was unsound, as actions that are unsubscribed - i.e. 
cancelled -\n // are removed from the actions array and that can shift actions that are\n // scheduled to be executed in a subsequent flush into positions at which\n // they are executed within the current flush.\n const flushId = this._scheduled;\n this._scheduled = undefined;\n\n const { actions } = this;\n let error: any;\n action = action || actions.shift()!;\n\n do {\n if ((error = action.execute(action.state, action.delay))) {\n break;\n }\n } while ((action = actions[0]) && action.id === flushId && actions.shift());\n\n this._active = false;\n\n if (error) {\n while ((action = actions[0]) && action.id === flushId && actions.shift()) {\n action.unsubscribe();\n }\n throw error;\n }\n }\n}\n", "import { AnimationFrameAction } from './AnimationFrameAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\n\n/**\n *\n * Animation Frame Scheduler\n *\n * Perform task when `window.requestAnimationFrame` would fire\n *\n * When `animationFrame` scheduler is used with delay, it will fall back to {@link asyncScheduler} scheduler\n * behaviour.\n *\n * Without delay, `animationFrame` scheduler can be used to create smooth browser animations.\n * It makes sure scheduled task will happen just before next browser content repaint,\n * thus performing animations as efficiently as possible.\n *\n * ## Example\n * Schedule div height animation\n * ```ts\n * // html:
\n * import { animationFrameScheduler } from 'rxjs';\n *\n * const div = document.querySelector('div');\n *\n * animationFrameScheduler.schedule(function(height) {\n * div.style.height = height + \"px\";\n *\n * this.schedule(height + 1); // `this` references currently executing Action,\n * // which we reschedule with new state\n * }, 0, 0);\n *\n * // You will see a div element growing in height\n * ```\n */\n\nexport const animationFrameScheduler = new AnimationFrameScheduler(AnimationFrameAction);\n\n/**\n * @deprecated Renamed to {@link animationFrameScheduler}. Will be removed in v8.\n */\nexport const animationFrame = animationFrameScheduler;\n", "import { Observable } from '../Observable';\nimport { SchedulerLike } from '../types';\n\n/**\n * A simple Observable that emits no items to the Observer and immediately\n * emits a complete notification.\n *\n * Just emits 'complete', and nothing else.\n *\n * ![](empty.png)\n *\n * A simple Observable that only emits the complete notification. It can be used\n * for composing with other Observables, such as in a {@link mergeMap}.\n *\n * ## Examples\n *\n * Log complete notification\n *\n * ```ts\n * import { EMPTY } from 'rxjs';\n *\n * EMPTY.subscribe({\n * next: () => console.log('Next'),\n * complete: () => console.log('Complete!')\n * });\n *\n * // Outputs\n * // Complete!\n * ```\n *\n * Emit the number 7, then complete\n *\n * ```ts\n * import { EMPTY, startWith } from 'rxjs';\n *\n * const result = EMPTY.pipe(startWith(7));\n * result.subscribe(x => console.log(x));\n *\n * // Outputs\n * // 7\n * ```\n *\n * Map and flatten only odd numbers to the sequence `'a'`, `'b'`, `'c'`\n *\n * ```ts\n * import { interval, mergeMap, of, EMPTY } from 'rxjs';\n *\n * const interval$ = interval(1000);\n * const result = interval$.pipe(\n * mergeMap(x => x % 2 === 1 ? 
of('a', 'b', 'c') : EMPTY),\n * );\n * result.subscribe(x => console.log(x));\n *\n * // Results in the following to the console:\n * // x is equal to the count on the interval, e.g. (0, 1, 2, 3, ...)\n * // x will occur every 1000ms\n * // if x % 2 is equal to 1, print a, b, c (each on its own)\n * // if x % 2 is not equal to 1, nothing will be output\n * ```\n *\n * @see {@link Observable}\n * @see {@link NEVER}\n * @see {@link of}\n * @see {@link throwError}\n */\nexport const EMPTY = new Observable((subscriber) => subscriber.complete());\n\n/**\n * @param scheduler A {@link SchedulerLike} to use for scheduling\n * the emission of the complete notification.\n * @deprecated Replaced with the {@link EMPTY} constant or {@link scheduled} (e.g. `scheduled([], scheduler)`). Will be removed in v8.\n */\nexport function empty(scheduler?: SchedulerLike) {\n return scheduler ? emptyScheduled(scheduler) : EMPTY;\n}\n\nfunction emptyScheduled(scheduler: SchedulerLike) {\n return new Observable((subscriber) => scheduler.schedule(() => subscriber.complete()));\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport function isScheduler(value: any): value is SchedulerLike {\n return value && isFunction(value.schedule);\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\nimport { isScheduler } from './isScheduler';\n\nfunction last(arr: T[]): T | undefined {\n return arr[arr.length - 1];\n}\n\nexport function popResultSelector(args: any[]): ((...args: unknown[]) => unknown) | undefined {\n return isFunction(last(args)) ? args.pop() : undefined;\n}\n\nexport function popScheduler(args: any[]): SchedulerLike | undefined {\n return isScheduler(last(args)) ? args.pop() : undefined;\n}\n\nexport function popNumber(args: any[], defaultValue: number): number {\n return typeof last(args) === 'number' ? args.pop()! 
: defaultValue;\n}\n", "export const isArrayLike = ((x: any): x is ArrayLike => x && typeof x.length === 'number' && typeof x !== 'function');", "import { isFunction } from \"./isFunction\";\n\n/**\n * Tests to see if the object is \"thennable\".\n * @param value the object to test\n */\nexport function isPromise(value: any): value is PromiseLike {\n return isFunction(value?.then);\n}\n", "import { InteropObservable } from '../types';\nimport { observable as Symbol_observable } from '../symbol/observable';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being Observable (but not necessary an Rx Observable) */\nexport function isInteropObservable(input: any): input is InteropObservable {\n return isFunction(input[Symbol_observable]);\n}\n", "import { isFunction } from './isFunction';\n\nexport function isAsyncIterable(obj: any): obj is AsyncIterable {\n return Symbol.asyncIterator && isFunction(obj?.[Symbol.asyncIterator]);\n}\n", "/**\n * Creates the TypeError to throw if an invalid object is passed to `from` or `scheduled`.\n * @param input The object that was passed.\n */\nexport function createInvalidObservableTypeError(input: any) {\n // TODO: We should create error codes that can be looked up, so this can be less verbose.\n return new TypeError(\n `You provided ${\n input !== null && typeof input === 'object' ? 'an invalid object' : `'${input}'`\n } where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.`\n );\n}\n", "export function getSymbolIterator(): symbol {\n if (typeof Symbol !== 'function' || !Symbol.iterator) {\n return '@@iterator' as any;\n }\n\n return Symbol.iterator;\n}\n\nexport const iterator = getSymbolIterator();\n", "import { iterator as Symbol_iterator } from '../symbol/iterator';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being an Iterable */\nexport function isIterable(input: any): input is Iterable {\n return isFunction(input?.[Symbol_iterator]);\n}\n", "import { ReadableStreamLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport async function* readableStreamLikeToAsyncGenerator(readableStream: ReadableStreamLike): AsyncGenerator {\n const reader = readableStream.getReader();\n try {\n while (true) {\n const { value, done } = await reader.read();\n if (done) {\n return;\n }\n yield value!;\n }\n } finally {\n reader.releaseLock();\n }\n}\n\nexport function isReadableStreamLike(obj: any): obj is ReadableStreamLike {\n // We don't want to use instanceof checks because they would return\n // false for instances from another Realm, like an