Commit d4bed4b

First pass at updating binaries. This works, but has safety issues with SafeLLamaSamplerChainHandle/SafeLLamaSamplerHandle

1 parent c9a9b75 commit d4bed4b

File tree

75 files changed: +958 −2644 lines changed

LLama.Examples/ExampleRunner.cs

Lines changed: 0 additions & 1 deletion

@@ -31,7 +31,6 @@ public class ExampleRunner
         { "Batched Executor: Save/Load", BatchedExecutorSaveAndLoad.Run },
         { "Batched Executor: Fork", BatchedExecutorFork.Run },
         { "Batched Executor: Rewind", BatchedExecutorRewind.Run },
-        { "Batched Executor: Guidance", BatchedExecutorGuidance.Run },
         { "Batched Executor: LLava", BatchedExecutorLLava.Run },
         { "Batched Executor: BoolQ Benchmark", BatchedExecutorBoolQ.Run },
         { "Batched Executor: Beam Search", BatchedExecutorBeamSearch.Run },

LLama.Examples/Examples/BatchedExecutorBeamSearch.cs

Lines changed: 7 additions & 5 deletions

@@ -55,7 +55,9 @@ from beam in oldBeam.Sample(beamsCount)
         while (beams.Count > beamsCount)
         {
             var beam = beams[0];
-            AnsiConsole.MarkupLineInterpolated($"[red]Culling Beam {beam.Conversation.ConversationId} (prob:{beam.CumulativeProbability:P10})[/]: {beam}");
+
+            var text = beam.ToString().EscapeMarkup();
+            AnsiConsole.MarkupLine($"[red]Culling Beam {beam.Conversation.ConversationId} (prob:{beam.CumulativeProbability:P5})[/]: {text}");

             beam.Dispose();
             beams.RemoveAt(0);
@@ -121,7 +123,7 @@ public List<Beam> Sample(int nbeams)
         {
             // Apply softmax, this calculates probabilities and sorts tokens into descending order
             var logitsArr = LLamaTokenDataArray.Create(Conversation.Sample());
-            logitsArr.Softmax(Conversation.Executor.Context.NativeHandle);
+            logitsArr.Softmax();

             // Create new forked conversations, one for each beam
             var results = new List<Beam>();
@@ -135,14 +137,14 @@ public List<Beam> Sample(int nbeams)
                 var c = Conversation.Fork();

                 // Extend the conversation with the selected token.
-                c.Prompt(item.id);
+                c.Prompt(item.ID);

                 // Keep track of the cumulative probability of this entire sequence.
-                var p = CumulativeProbability * item.p;
+                var p = CumulativeProbability * item.Probability;

                 // Keep track of all tokens in this sequence, for decoding later
                 var t = Tokens.ToList();
-                t.Add(item.id);
+                t.Add(item.ID);

                 // Keep track of which beam this beam was derived from.
                 var s = Sequence.ToList();

LLama.Examples/Examples/BatchedExecutorBoolQ.cs

Lines changed: 30 additions & 14 deletions

@@ -1,7 +1,6 @@
 using System.Text;
 using LLama.Batched;
 using LLama.Common;
-using LLama.Grammars;
 using LLama.Native;
 using Spectre.Console;
 using LLama.Sampling;
@@ -10,6 +9,9 @@ namespace LLama.Examples.Examples;

 public class BatchedExecutorBoolQ
 {
+    // Answers may start with a space, and then must produce one of the listed strings followed by a newline character and nothing else.
+    private static readonly Grammar AnswerGrammar = new("root ::= (\" \")? (\"true\" | \"false\" | \"yes\" | \"no\") \"\\n\"", "root");
+
     public static async Task Run()
     {
         // Load model weights
@@ -21,9 +23,6 @@ public static async Task Run()
         var sys = AnsiConsole.Ask("System prompt", "Answer the question with a single word answer.");
         var hint = AnsiConsole.Ask("Provide hints to model (test reading comprehension instead of knowledge)", true);

-        // Answers may start with a space, and then must produce one of the listed strings followed by a newline character and nothing else.
-        var grammar = Grammar.Parse("root ::= (\" \")? (\"true\" | \"false\" | \"yes\" | \"no\") \"\\n\"", "root");
-
         // Create an executor that can evaluate a batch of conversations together
         using var executor = new BatchedExecutor(model, parameters);

@@ -53,7 +52,7 @@ await AnsiConsole.Progress()

         foreach (var chunk in chunks)
         {
-            var result = await RunBatch(executor, tokensGenerate, grammar, sys, hint, chunk);
+            var result = await RunBatch(executor, tokensGenerate, sys, hint, chunk);
             results.Add(result);

             reporter.Increment(1);
@@ -87,10 +86,10 @@ await AnsiConsole.Progress()
         }
     }

-    private static async Task<BatchResult> RunBatch(BatchedExecutor executor, int maxTokens, Grammar grammar, string sys, bool hint, IEnumerable<(string, bool, string)> batch)
+    private static async Task<BatchResult> RunBatch(BatchedExecutor executor, int maxTokens, string sys, bool hint, IEnumerable<(string, bool, string)> batch)
     {
         var conversations = (from item in batch
-                             select new ConversationRunner(executor, grammar, sys, item.Item1, item.Item2, hint ? item.Item3 : null)).ToArray();
+                             select new ConversationRunner(executor, sys, item.Item1, item.Item2, hint ? item.Item3 : null)).ToArray();

         for (var i = 0; i < maxTokens; i++)
         {
@@ -135,6 +134,9 @@ private record BatchResult(int TruePositive, int TrueNegative, int FalsePositive
         public float Accuracy => (float)Correct / Total;
     }

+    /// <summary>
+    /// All of the mechanics necessary to run a conversation to answer a single question
+    /// </summary>
     private class ConversationRunner
         : IDisposable
     {
@@ -149,14 +151,11 @@ private class ConversationRunner
         public string Question { get; }
         public bool Answer { get; }

-        public ConversationRunner(BatchedExecutor executor, Grammar grammar, string sys, string question, bool answer, string? hint)
+        public ConversationRunner(BatchedExecutor executor, string sys, string question, bool answer, string? hint)
         {
             _executor = executor;
             _decoder = new StreamingTokenDecoder(executor.Context);
-            _sampler = new GreedySamplingPipeline
-            {
-                Grammar = grammar.CreateInstance(),
-            };
+            _sampler = new GreedySamplingWithGrammarPipeline { Grammar = AnswerGrammar };

             // Make sure question ends with question mark
             if (!question.EndsWith('?'))
@@ -192,7 +191,7 @@ public void Sample()
             if (!_conversation.RequiresSampling)
                 return;

-            var token = _sampler.Sample(_executor.Context.NativeHandle, _conversation.Sample(), []);
+            var token = _sampler.Sample(_executor.Context, _conversation.GetSampleIndex());

             var tokens = _executor.Context.NativeHandle.ModelHandle.Tokens;
             if (tokens.IsEndOfGeneration(token) || tokens.Newline == token)
@@ -216,7 +215,7 @@ public void Prompt()
             var token = _sampledToken.Value;
             _sampledToken = default;

-            _sampler.Accept(_executor.Context.NativeHandle, token);
+            _sampler.Accept(token);
             _decoder.Add(token);
             _conversation.Prompt(token);
         }
@@ -245,4 +244,21 @@ public void Dispose()
             _sampler.Dispose();
         }
     }
+
+    /// <summary>
+    /// A sampling pipeline which always selects the most likely token (after applying a grammar)
+    /// </summary>
+    public class GreedySamplingWithGrammarPipeline
+        : BaseSamplingPipeline
+    {
+        public required Grammar Grammar { get; init; }
+
+        protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle context)
+        {
+            var chain = SafeLLamaSamplerHandle.CreateChain(LLamaSamplerChainParams.Default());
+            chain.Add(SafeLLamaSamplerHandle.CreateGrammar(context.ModelHandle, Grammar.Gbnf, Grammar.Root));
+            chain.Add(SafeLLamaSamplerHandle.CreateGreedySampler());
+            return chain;
+        }
+    }
 }
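GreedySamplingWithGrammarPipeline is this commit's replacement for the old GreedySamplingPipeline plus per-runner grammar instance: a BaseSamplingPipeline subclass that assembles a native sampler chain, grammar constraint first, then greedy selection. The commit message flags safety issues with SafeLLamaSamplerChainHandle/SafeLLamaSamplerHandle, plausibly around handle ownership once a sampler is Added into a chain, though the diff does not say. A usage sketch assembled only from calls that appear in this file's hunks; `executor` and `conversation` are assumed to be a live BatchedExecutor and Conversation:

```csharp
// Sketch: drive one constrained-generation step with the pipeline defined above.
using var sampler = new GreedySamplingWithGrammarPipeline
{
    // Grammar restricts output to an optional space, then one of the listed words, then a newline
    Grammar = new Grammar("root ::= (\" \")? (\"true\" | \"false\" | \"yes\" | \"no\") \"\\n\"", "root"),
};

await executor.Infer();

// Sample at this conversation's logit index, then feed the token back in
var token = sampler.Sample(executor.Context, conversation.GetSampleIndex());
sampler.Accept(token);
conversation.Prompt(token);
```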

LLama.Examples/Examples/BatchedExecutorFork.cs

Lines changed: 1 addition & 5 deletions

@@ -1,6 +1,5 @@
 using LLama.Batched;
 using LLama.Common;
-using LLama.Native;
 using LLama.Sampling;
 using Spectre.Console;

@@ -77,9 +76,7 @@ await AnsiConsole
     // Print some stats
     var timings = executor.Context.NativeHandle.GetTimings();
     AnsiConsole.MarkupLine($"Total Tokens Evaluated: {timings.TokensEvaluated}");
-    AnsiConsole.MarkupLine($"Total Tokens Sampled: {timings.TokensSampled}");
     AnsiConsole.MarkupLine($"Eval Time: {(timings.Eval + timings.PromptEval).TotalMilliseconds}ms");
-    AnsiConsole.MarkupLine($"Sample Time: {timings.Sampling.TotalMilliseconds}ms");
 }

 private class Node
@@ -114,8 +111,7 @@ public void Sample()

     // Sample one token
     var ctx = _conversation.Executor.Context.NativeHandle;
-    var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty<LLamaToken>());
-    _sampler.Accept(ctx, token);
+    var token = _sampler.Sample(ctx, _conversation.GetSampleIndex());
     _decoder.Add(token);

     // Prompt the conversation with this token, to continue generating from there
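Note the shape change in the sampling call: the old two-step Sample(ctx, logits, lastTokens) followed by Accept(ctx, token) collapses into a single index-based call. Elsewhere in this commit (BatchedExecutorBoolQ) a one-argument Accept(token) is still invoked after sampling, so whether an explicit Accept is needed appears to vary by pipeline; this sketch only contrasts the signatures:

```csharp
// Before this commit (old API, shown in the removed lines):
//   var token = _sampler.Sample(ctx, conversation.Sample(), Array.Empty<LLamaToken>());
//   _sampler.Accept(ctx, token);

// After: sample directly at the conversation's logit index.
var ctx = conversation.Executor.Context.NativeHandle;
var token = _sampler.Sample(ctx, conversation.GetSampleIndex());
```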

LLama.Examples/Examples/BatchedExecutorGuidance.cs

Lines changed: 0 additions & 123 deletions
This file was deleted.

LLama.Examples/Examples/BatchedExecutorLLava.cs

Lines changed: 1 addition & 1 deletion

@@ -75,7 +75,7 @@ await AnsiConsole

     await executor.Infer();

-    var token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
+    var token = sampler.Sample(executor.Context.NativeHandle, conversation.GetSampleIndex());
     if (executor.Context.NativeHandle.ModelHandle.Tokens.IsEndOfGeneration(token))
         break;
LLama.Examples/Examples/BatchedExecutorRewind.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ private class Node
9191

9292
public LLamaToken Sample(Conversation conversation)
9393
{
94-
var token = _sampler.Sample(conversation.Executor.Context.NativeHandle, conversation.Sample(), []);
94+
var token = _sampler.Sample(conversation.Executor.Context.NativeHandle, conversation.GetSampleIndex());
9595
_tokens.Add(token);
9696
return token;
9797
}

LLama.Examples/Examples/BatchedExecutorSaveAndLoad.cs

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-using LLama.Batched;
+using LLama.Batched;
 using LLama.Common;
 using LLama.Native;
 using LLama.Sampling;
@@ -94,7 +94,7 @@ private static async Task<LLamaToken> GenerateTokens(BatchedExecutor executor, C
     await executor.Infer();

     // Use sampling pipeline to pick a token
-    token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), ReadOnlySpan<LLamaToken>.Empty);
+    token = sampler.Sample(executor.Context.NativeHandle, conversation.GetSampleIndex());

     // Add it to the decoder, so it can be converted into text later
     decoder.Add(token);

LLama.Examples/Examples/ChatChineseGB2312.cs

Lines changed: 1 addition & 2 deletions

@@ -1,4 +1,4 @@
-using System.Text;
+using System.Text;
 using LLama.Common;

 namespace LLama.Examples.Examples;
@@ -27,7 +27,6 @@ public static async Task Run()

     var parameters = new ModelParams(modelPath)
     {
-        Seed = 1337,
         GpuLayerCount = 5,
         Encoding = Encoding.UTF8
     };

LLama.Examples/Examples/ChatSessionStripRoleName.cs

Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@ public static async Task Run()

     var parameters = new ModelParams(modelPath)
     {
-        Seed = 1337,
         GpuLayerCount = 5
     };
     using var model = await LLamaWeights.LoadFromFileAsync(parameters);

LLama.Examples/Examples/ChatSessionWithHistory.cs

Lines changed: 0 additions & 1 deletion

@@ -11,7 +11,6 @@ public static async Task Run()

     var parameters = new ModelParams(modelPath)
     {
-        Seed = 1337,
         GpuLayerCount = 5
     };
     using var model = await LLamaWeights.LoadFromFileAsync(parameters);

LLama.Examples/Examples/ChatSessionWithRestart.cs

Lines changed: 0 additions & 1 deletion

@@ -11,7 +11,6 @@ public static async Task Run()

     var parameters = new ModelParams(modelPath)
     {
-        Seed = 1337,
         GpuLayerCount = 5
     };
     using var model = await LLamaWeights.LoadFromFileAsync(parameters);

LLama.Examples/Examples/ChatSessionWithRoleName.cs

Lines changed: 0 additions & 1 deletion

@@ -11,7 +11,6 @@ public static async Task Run()

     var parameters = new ModelParams(modelPath)
    {
-        Seed = 1337,
         GpuLayerCount = 5
     };
     using var model = await LLamaWeights.LoadFromFileAsync(parameters);
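Every chat-session example drops Seed = 1337 from ModelParams. The diff does not show where seeding now lives; a plausible replacement, assuming the Seed property that later LLamaSharp releases expose on DefaultSamplingPipeline:

```csharp
// Assumption, not shown in this diff: sampling determinism is configured on
// the pipeline rather than on ModelParams after this change.
var pipeline = new DefaultSamplingPipeline
{
    Seed = 1337, // previously ModelParams.Seed
};
```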
