diff --git a/.jules/bolt.md b/.jules/bolt.md index d81dff90..fb3099a8 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,3 +1,5 @@ -## 2025-05-15 - [Sequential session destruction in SDKs] -**Learning:** All Copilot SDKs (Node.js, Python, Go, .NET) were initially implementing session destruction sequentially during client shutdown. This leads to a linear increase in shutdown time as the number of active sessions grows, especially when individual destructions involve retries and backoff. -**Action:** Parallelize session cleanup using language-specific concurrency primitives (e.g., `Promise.all` in Node.js, `asyncio.gather` in Python, `Task.WhenAll` in .NET, or WaitGroups/Channels in Go) to ensure shutdown time remains constant and minimal. +# Bolt's Performance Journal + +## 2025-05-15 - Parallel Session Destruction +**Learning:** Sequential session destruction during client shutdown is a significant bottleneck when multiple sessions are active, as each destruction involves a JSON-RPC request over IPC/Network. Parallelizing this across all SDKs (Go, .NET, Python) brings them to parity with the Node.js implementation and significantly reduces shutdown latency. +**Action:** Always prefer parallel execution for independent I/O-bound cleanup tasks during client shutdown. diff --git a/dotnet/src/Client.cs b/dotnet/src/Client.cs index 74f1c66f..6f2538ef 100644 --- a/dotnet/src/Client.cs +++ b/dotnet/src/Client.cs @@ -3,6 +3,7 @@ *--------------------------------------------------------------------------------------------*/ using Microsoft.Extensions.AI; +using System.Linq; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging.Abstractions; using StreamJsonRpc; @@ -210,25 +211,42 @@ async Task StartCoreAsync(CancellationToken ct) /// public async Task StopAsync() { - var errors = new List(); + // Destroy all active sessions in parallel to speed up shutdown. + // This reduces StopAsync() time from O(N*T) to O(T) where N is the number of sessions. + var errors = new ConcurrentBag(); - foreach (var session in _sessions.Values.ToArray()) + var sessionTasks = _sessions.Values.ToArray().Select(async session => { - try - { - await session.DisposeAsync(); - } - catch (Exception ex) + for (int attempt = 1; attempt <= 3; attempt++) { - errors.Add(new Exception($"Failed to destroy session {session.SessionId}: {ex.Message}", ex)); + try + { + await session.DisposeAsync(); + return; // Success + } + catch (Exception ex) + { + if (attempt == 3) + { + errors.Add(new Exception($"Failed to destroy session {session.SessionId} after 3 attempts: {ex.Message}", ex)); + } + else + { + // Exponential backoff: 100ms, 200ms + await Task.Delay(100 * attempt); + } + } } - } + }); + + await Task.WhenAll(sessionTasks); _sessions.Clear(); - await CleanupConnectionAsync(errors); + var errorList = errors.ToList(); + await CleanupConnectionAsync(errorList); _connectionTask = null; - ThrowErrors(errors); + ThrowErrors(errorList); } /// diff --git a/go/client.go b/go/client.go index 319c6588..d28b3969 100644 --- a/go/client.go +++ b/go/client.go @@ -294,10 +294,38 @@ func (c *Client) Stop() error { } c.sessionsMux.Unlock() + // Destroy all active sessions in parallel to speed up shutdown. + // This reduces Stop() time from O(N*T) to O(T) where N is the number of sessions. + var wg sync.WaitGroup + errsChan := make(chan error, len(sessions)) + for _, session := range sessions { - if err := session.Destroy(); err != nil { - errs = append(errs, fmt.Errorf("failed to destroy session %s: %w", session.SessionID, err)) - } + wg.Add(1) + go func(s *Session) { + defer wg.Done() + var lastErr error + for attempt := 1; attempt <= 3; attempt++ { + if err := s.Destroy(); err != nil { + lastErr = err + if attempt < 3 { + // Exponential backoff: 100ms, 200ms + time.Sleep(time.Duration(100*attempt) * time.Millisecond) + } + } else { + lastErr = nil + break + } + } + if lastErr != nil { + errsChan <- fmt.Errorf("failed to destroy session %s after 3 attempts: %w", s.SessionID, lastErr) + } + }(session) + } + + wg.Wait() + close(errsChan) + for err := range errsChan { + errs = append(errs, err) } c.sessionsMux.Lock() diff --git a/python/copilot/client.py b/python/copilot/client.py index 85b72897..30947b9b 100644 --- a/python/copilot/client.py +++ b/python/copilot/client.py @@ -313,13 +313,34 @@ async def stop(self) -> list["StopError"]: sessions_to_destroy = list(self._sessions.values()) self._sessions.clear() - for session in sessions_to_destroy: - try: - await session.destroy() - except Exception as e: - errors.append( - StopError(message=f"Failed to destroy session {session.session_id}: {e}") + # Destroy all active sessions in parallel to speed up shutdown. + # This reduces stop() time from O(N*T) to O(T) where N is the number of sessions. + async def destroy_with_retry(session: CopilotSession) -> Optional[StopError]: + last_error = None + for attempt in range(1, 4): + try: + await session.destroy() + return None + except Exception as e: + last_error = e + if attempt < 3: + # Exponential backoff: 100ms, 200ms + await asyncio.sleep(0.1 * attempt) + return StopError( + message=( + f"Failed to destroy session {session.session_id} after 3 attempts: {last_error}" ) + ) + + if sessions_to_destroy: + results = await asyncio.gather( + *[destroy_with_retry(s) for s in sessions_to_destroy], return_exceptions=True + ) + for res in results: + if isinstance(res, StopError): + errors.append(res) + elif isinstance(res, Exception): + errors.append(StopError(message=str(res))) # Close client if self._client: