Update CFGGuide to use outlines.fsm.parsing. Enable generate.cfg #1067

Merged · 3 commits · Aug 31, 2024
69 changes: 69 additions & 0 deletions benchmarks/bench_cfg_guide.py
@@ -0,0 +1,69 @@
import random

from transformers import AutoTokenizer

import outlines.grammars
from outlines.caching import cache_disabled
from outlines.fsm.guide import CFGGuide
from outlines.models.transformers import TransformerTokenizer

from .common import ensure_numba_compiled

random.seed(42)


def get_tiny_tokenizer():
"""1000 tokens in vocabulary"""
return TransformerTokenizer(
AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
)


benched_grammars = {
"json": outlines.grammars.json,
"arithmetic": outlines.grammars.arithmetic,
}


class CFGGuideBenchmark:
params = benched_grammars.keys()

def setup(self, grammar_name):
self.tokenizer = get_tiny_tokenizer()
ensure_numba_compiled(
self.tokenizer
) # numba not currently used, but will be in the future
self.prebuilt_cfg_guide = CFGGuide(
benched_grammars[grammar_name], self.tokenizer
)

@staticmethod
def _run_random_cfg(guide, rejection_sampling=True):
state = guide.initial_state
token_ids = list(guide.tokenizer.vocabulary.values())
for i in range(40):
# simulate ordering of logits top prob to lowest prob
random.shuffle(token_ids)
# simulate sampling and state update
if rejection_sampling:
next_token_id = next(guide.iter_valid_token_ids(state, token_ids))
state = guide.get_next_state(state, next_token_id)
else:
next_token_id = random.choice(guide.get_next_instruction(state).tokens)
state = guide.get_next_state(state, next_token_id)

@cache_disabled()
def time_cfg_guide_setup(self, grammar_name):
CFGGuide(benched_grammars[grammar_name], self.tokenizer)

@cache_disabled()
def time_cfg_guide_run_rejection_sampling(self, grammar):
self._run_random_cfg(self.prebuilt_cfg_guide, rejection_sampling=True)

@cache_disabled()
def time_cfg_guide_run(self, grammar):
self._run_random_cfg(self.prebuilt_cfg_guide, rejection_sampling=False)

@cache_disabled()
def peakmem_cfg_guide_run(self, grammar):
self._run_random_cfg(self.prebuilt_cfg_guide)
9 changes: 7 additions & 2 deletions docs/reference/generation/cfg.md
@@ -30,10 +30,15 @@ print(sequence)
# (8-2)
```

!!! Note "Performance"
###### Disclaimer

The implementation of grammar-structured generation in Outlines is very naive. This does not reflect the performance of [.txt](https://dottxt.co)'s product, where we made grammar-structured generation as fast as regex-structured generation.
!!! Note "Experimental"

Outlines' current **community-contributed** implementation of CFG-structured generation is experimental. This does not reflect the performance of [.txt](https://dottxt.co)'s product, where we have optimized grammar-structured generation to be as fast as regex-structured generation. Additionally, it does not fully align with the approach described in our [technical report](https://arxiv.org/pdf/2307.09702), aside from its use of incremental/partial parsing. This feature is still a work in progress, requiring performance enhancements and bug fixes for an ideal implementation. For more details, please see our [grammar-related open issues on GitHub](https://github.com/outlines-dev/outlines/issues?q=is%3Aissue+is%3Aopen+label%3Agrammar).

!!! Note "Greedy"

To mitigate performance issues, CFG-structured generation uses rejection sampling: it iterates over the candidate tokens from highest logit to lowest and stops once a single valid token ID is selected. This is effectively greedy generation.
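A minimal sketch of that loop (illustrative only — `ToyGuide` below is a stand-in, not Outlines code; only the `iter_valid_token_ids`/`get_next_state` method names mirror the `CFGGuide` interface used in this PR's benchmark):

```python
class ToyGuide:
    """Stand-in for a CFGGuide; pretends only even token ids are valid."""

    initial_state = 0

    def iter_valid_token_ids(self, state, candidates):
        # yield grammatically valid token ids in the order candidates arrive
        return (t for t in candidates if t % 2 == 0)

    def get_next_state(self, state, token_id):
        return state + 1


def greedy_rejection_step(guide, state, logits):
    # order candidate token ids from highest to lowest logit,
    # then accept the first one the parser considers valid
    candidates = sorted(range(len(logits)), key=lambda t: -logits[t])
    token_id = next(guide.iter_valid_token_ids(state, candidates))
    return token_id, guide.get_next_state(state, token_id)


guide = ToyGuide()
token_id, state = greedy_rejection_step(guide, guide.initial_state, [0.1, 0.9, 0.5, 0.2])
print(token_id, state)  # highest-logit *valid* token is id 2 -> "2 1"
```

Because only the single highest-logit valid token is ever taken, no probability mass is redistributed over the remaining valid tokens, which is why this amounts to greedy decoding.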

## Ready-to-use grammars

99 changes: 99 additions & 0 deletions docs/reference/generation/creating_grammars.md
@@ -0,0 +1,99 @@
# Overview

Outlines allows the use of [Lark](https://github.com/lark-parser/lark) grammars to guide generation. These grammars are used to construct parsers that filter out incompatible tokens during the generation process. The result is a generation that adheres to the grammar's production rules.

# Primer on Creating Grammars

To create grammars for Outlines, a solid understanding of Lark grammars is necessary. Here's how you can get started:

- Read Lark's grammar documentation [here](https://lark-parser.readthedocs.io/en/latest/grammar.html).
- Review Outlines' existing grammars [here](/outlines/grammars).


# Compatibility With Outlines

It's important to note that not all Lark grammars work with Outlines. Changes may be necessary to ensure compatibility.

### LALR(1) Parser

Outlines utilizes Lark's LALR(1) parser, meaning the grammar must be unambiguous at least up to the next token (one token lookahead). Read Lark's official LALR(1) parser documentation [here](https://lark-parser.readthedocs.io/en/stable/parsers.html#lalr-1).

If your grammar is ambiguous, you will receive the following error at runtime:

```
GrammarError: Reduce/Reduce collision in Terminal('B') between the following rules:
```

### Regex Terminal Restrictions

Outlines converts terminals to finite state machines using the [Interegular](https://github.com/MegaIng/interegular/) library. Not all regular expressions work with Interegular; mitigations are described in the subsections that follow.


#### Avoid Lookarounds

The examples below show how to remove lookarounds while maintaining the same functionality.

##### Example: Escaped String

From Outlines' modified `ESCAPED_STRING` in [common.lark](/outlines/grammars/common.lark).

Before:
```
_STRING_INNER: /.*?/
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/

ESCAPED_STRING : "\"" _STRING_ESC_INNER "\""
```

After:
```
_NON_CONTROL_CHAR: /([^"\\\x00-\x1F\x7F-\x9F])/
_ESCAPED_CHAR: /\\/ (_NON_CONTROL_CHAR | /\\/ | /"/)
ESCAPED_STRING_INNER: _NON_CONTROL_CHAR | _ESCAPED_CHAR
ESCAPED_STRING: /"/ ESCAPED_STRING_INNER* /"/
```

#### Avoid Backreferences

Backreferences, for example `([ab]*)\1`, cannot be simulated by a finite state machine and will result in an error if used.

# Creating a Valid Grammar

You can use Outlines' test suite to verify your grammar.

### 1) Create Your Grammar

Create your grammar file named `your_new_grammar.lark`, adhering to the guidelines provided above. Add it to `outlines/grammars/` (ensure attribution is included and license is compatible).

Update `outlines/grammars.py` with a line including your grammar.

### 2) Test Your Grammar

Test your grammar for false negatives, ensuring valid samples can be generated:
- Add valid example outputs which are compliant with the grammar to `tests/benchmark/cfg_samples/your_new_grammar/`
- Run the tests for your grammar via `pytest -s tests/fsm/test_cfg_guide.py::test_cfg_grammar_sample -k "your_new_grammar"`

Test your grammar for false positives, ensuring invalid outputs aren't generated.

Currently there isn't a built-in false-positive testing utility. It is recommended that you smoke test via
```
from outlines import models, generate, grammars
model = models.transformers("mistralai/Mistral-7B-v0.1")
generator = generate.cfg(model, grammars.your_new_grammar)
result = generator("<your prompt to generate output for your grammar>")
print(result)
```

# Converting
A few tools are available for converting other grammar formats to Lark. These tools serve as a starting point, but you will typically need to make additional adjustments to ensure full compatibility and proper functioning within Outlines.

Tools:
- Lark's built-in "Nearley-to-Lark" converter: https://lark-parser.readthedocs.io/en/latest/tools.html
- Convert ANTLR4 to Lark (note: most ANTLR4 grammars are not LALR(1)-compatible and will require additional tweaking): https://github.com/kaby76/Domemtech.Trash/blob/main/src/trconvert/readme.md
- Extract EBNF from Yacc files: https://www.bottlecaps.de/rr/ui

Reference Grammars:
- GitHub Lark grammars: https://github.com/search?q=path%3A*.lark&type=code
- GitHub Nearley grammars: https://github.com/search?q=path%3A*.ne+%22-%3E%22&type=code
- ANTLR4 grammars: https://github.com/antlr/grammars-v4/
- Grammar Zoo: https://slebok.github.io/zoo/index.html#html
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -131,6 +131,7 @@ nav:
- Type constraints: reference/generation/format.md
- JSON (function calling): reference/generation/json.md
- Grammar: reference/generation/cfg.md
- Creating Grammars: reference/generation/creating_grammars.md
- Custom FSM operations: reference/generation/custom_fsm_ops.md
- Utilities:
- Serve with vLLM: reference/serve/vllm.md
24 changes: 1 addition & 23 deletions outlines/fsm/fsm.py
@@ -1,7 +1,7 @@
import warnings
from typing import TYPE_CHECKING, Iterable, NewType, Optional

from outlines.fsm.guide import CFGGuide, RegexGuide, StopAtEOSGuide
from outlines.fsm.guide import RegexGuide, StopAtEOSGuide

if TYPE_CHECKING:
from outlines.models.tokenizer import Tokenizer
@@ -45,25 +45,3 @@ def allowed_token_ids(self, state: FSMState) -> Optional[Iterable[int]]:

def next_state(self, state: FSMState, token_id: int) -> FSMState:
return FSMState(self.get_next_state(state, token_id))


class CFGFSM(CFGGuide):
"""FSM to generate text that is in the language of a context-free grammar."""

def __init__(self, cfg_string: str, tokenizer):
warnings.warn(
UserWarning(
"The `CFGFSM` interface is deprecated and will be removed on 2024-06-01. Please use `CFGGuide` instead."
)
)
super().__init__(cfg_string, tokenizer)

def allowed_token_ids(self, state: FSMState) -> Optional[Iterable[int]]:
return self.get_next_instruction(state).tokens

def next_state(self, state: FSMState, token_id: int) -> FSMState:
return FSMState(self.get_next_state(state, token_id))

def copy(self) -> "CFGFSM":
"""Create a copy of the FSM."""
return CFGFSM(self.cfg_string, self.tokenizer)