Skip to content

Commit 5434d8d

Browse files
committed
cocalc-api: tweak test config
1 parent 0d70752 commit 5434d8d

File tree

2 files changed

+203
-69
lines changed

2 files changed

+203
-69
lines changed

src/python/cocalc-api/tests/conftest.py

Lines changed: 168 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,54 @@
1010
from cocalc_api import Hub, Project
1111

1212
from psycopg2 import pool as pg_pool
13+
from typing import Callable, TypeVar, Any
1314

1415
# Database configuration examples (DRY principle)
1516
PGHOST_SOCKET_EXAMPLE = "/path/to/cocalc-data/socket"
1617
PGHOST_NETWORK_EXAMPLE = "localhost"
1718

19+
T = TypeVar('T')
20+
21+
22+
def retry_with_backoff(
    func: Callable[[], T],
    max_retries: int = 3,
    retry_delay: int = 5,
    error_condition: Callable[[RuntimeError], bool] = lambda e: any(
        keyword in str(e).lower() for keyword in ["timeout", "closed", "connection", "reset", "broken"]
    ),
) -> T:
    """
    Retry a function call, sleeping a fixed delay between attempts.

    Despite the name, the delay does not grow: every retry waits exactly
    ``retry_delay`` seconds. This helper is useful for operations that may
    timeout or fail on first attempt due to cold starts (e.g., kernel launches)
    or transient connection issues.

    Args:
        func: Zero-argument callable that performs the operation.
        max_retries: Maximum number of attempts, including the first one
            (default: 3). Must be at least 1.
        retry_delay: Delay in seconds between attempts (default: 5).
        error_condition: Predicate deciding whether a RuntimeError should
            trigger a retry. Defaults to checking the lower-cased message for
            timeout/connection-related keywords.

    Returns:
        The result of the first successful call to ``func``.

    Raises:
        ValueError: If ``max_retries`` is less than 1 (``func`` would never be
            called and there would be no result to return).
        RuntimeError: Re-raised from ``func`` when the error is not retryable
            or when the final attempt fails. Other exception types raised by
            ``func`` are never retried and propagate immediately.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return func()
        except RuntimeError as e:
            # Give up on non-retryable errors and once the attempts are used up;
            # a bare raise keeps the original traceback intact.
            if not error_condition(e) or attempt >= final_attempt:
                raise
            error_msg = str(e).lower()
            print(f"Attempt {attempt + 1} failed ({error_msg[:50]}...), retrying in {retry_delay}s...")
            time.sleep(retry_delay)

    # Not reachable: every iteration above either returns or raises.
    raise RuntimeError("retry_with_backoff exhausted its attempts without a result")
60+
1861

1962
def assert_valid_uuid(value, description="value"):
2063
"""
@@ -125,39 +168,110 @@ def project_client(temporary_project, api_key, cocalc_host):
125168
return Project(project_id=temporary_project['project_id'], api_key=api_key, host=cocalc_host)
126169

127170

171+
@pytest.fixture(autouse=True)
def cleanup_kernels_after_test(request, project_client):
    """
    Stop excess Jupyter kernels after each test in the Jupyter test classes.

    Kernel accumulation happens because the kernel pool reuses kernels, but under
    heavy test load, old kernels aren't always properly cleaned up by the pool.

    This is a function-scoped autouse fixture, so the teardown below runs after
    every individual test. It returns immediately unless the test belongs to one
    of the Jupyter classes (TestJupyterExecuteViaHub,
    TestJupyterExecuteViaProject, TestJupyterKernelManagement), and even then it
    only stops kernels once more than 3 are listed, which leaves the pool's
    reuse strategy alone in the common case.

    Cleanup is best-effort: failures are reported as warnings and never fail
    the test.
    """
    yield  # Allow test to run

    jupyter_test_classes = {
        'TestJupyterExecuteViaHub',
        'TestJupyterExecuteViaProject',
        'TestJupyterKernelManagement',
    }

    # Only cleanup for Jupyter-related tests (module-level tests have no class)
    test_class = request.cls
    if test_class is None or test_class.__name__ not in jupyter_test_classes:
        return

    import time

    # The pool size is taken to be 2 (per the original comments here); only act
    # once we are clearly above it so the pool keeps some buffer for reuse.
    cleanup_threshold = 3
    kernels_to_keep = 2

    try:
        kernels = project_client.system.list_jupyter_kernels()
        if len(kernels) <= cleanup_threshold:
            return

        # Higher PIDs are treated as the more recent kernels (a heuristic);
        # keep the newest ones and stop the rest.
        kernels_sorted = sorted(kernels, key=lambda k: k.get("pid", 0))
        for kernel in kernels_sorted[:-kernels_to_keep]:
            pid = kernel.get("pid")
            if pid is None:
                # Nothing to address the stop request to; skip this entry.
                continue
            try:
                project_client.system.stop_jupyter_kernel(pid=pid)
                time.sleep(0.1)  # Small delay between kills
            except Exception as err:
                # One kernel failing to stop must not block the others
                print(f"Warning: failed to stop jupyter kernel PID {pid}: {err}")
    except Exception as err:
        # If listing kernels fails, report it and continue with the next test
        print(f"Warning: failed to clean up excess jupyter kernels: {err}")
225+
226+
128227
def ensure_python3_kernel(project_client: Project):
129228
"""
130229
Ensure the default python3 Jupyter kernel is installed in the project.
131230
132231
If not available, install ipykernel and register the kernelspec.
133232
"""
134233

135-
def has_python_kernel() -> bool:
234+
class _ExecResult(tuple):
    """
    ``(ok, stdout)`` pair whose truthiness is the ``ok`` flag.

    A plain 2-tuple is always truthy, which made ``if not try_exec(...)`` a
    dead check. Subclassing tuple keeps ``ok, stdout = try_exec(...)`` working
    while letting callers test the result directly.
    """

    def __bool__(self) -> bool:
        return bool(self[0])


def try_exec(command: list[str], timeout: int = 60, capture_stdout: bool = False) -> _ExecResult:
    """
    Run a command in the project without raising on failure.

    Args:
        command: Program followed by its arguments, e.g. ``["python3", "-m", "pip"]``.
        timeout: Timeout passed through to ``project_client.system.exec``.
        capture_stdout: When True, the second element of the result is the
            ``"stdout"`` entry of the exec result; otherwise it is None.

    Returns:
        An ``(ok, stdout)`` pair. ``ok`` is False when the call raised (the
        error is printed as a warning), and the pair itself is falsy in that
        case so it can be used directly in ``if not try_exec(...)``.
    """
    try:
        result = project_client.system.exec(
            command=command[0],
            args=command[1:],
            timeout=timeout,
        )
        return _ExecResult((True, result["stdout"] if capture_stdout else None))
    except Exception as err:
        print(f"Warning: command {command} failed: {err}")
        return _ExecResult((False, None))
245+
246+
def has_python_kernel() -> bool:
    """
    Report whether a ``python3`` kernelspec is listed in the project.

    Runs ``python3 -m jupyter kernelspec list --json`` through ``try_exec`` and
    looks for a ``"python3"`` entry under ``"kernelspecs"``. Returns False when
    the command fails, yields no stdout, or the output cannot be parsed.
    """
    listing_command = ["python3", "-m", "jupyter", "kernelspec", "list", "--json"]
    succeeded, raw_listing = try_exec(listing_command, capture_stdout=True)
    if not succeeded or raw_listing is None:
        return False

    try:
        # Lookup stays inside the try so malformed (non-dict) JSON is reported
        # as a parse failure rather than escaping as an exception.
        parsed = json.loads(raw_listing)
        return "python3" in parsed.get("kernelspecs", {})
    except Exception as err:
        print(f"Warning: Failed to parse kernelspec list: {err}")
        return False
148259

149260
if has_python_kernel():
150261
return
151262

152263
print("Installing python3 kernelspec in project...")
153-
project_client.system.exec(
154-
command="python3",
155-
args=["-m", "pip", "install", "--user", "ipykernel"],
156-
timeout=300,
157-
)
158-
project_client.system.exec(
159-
command="python3",
160-
args=[
264+
# Install pip if needed
265+
try_exec(["python3", "-m", "ensurepip", "--user"], timeout=120)
266+
# Upgrade pip but ignore errors (not fatal)
267+
try_exec(["python3", "-m", "pip", "install", "--user", "--upgrade", "pip"], timeout=120)
268+
269+
if not try_exec(["python3", "-m", "pip", "install", "--user", "ipykernel"], timeout=300):
270+
raise RuntimeError("Failed to install ipykernel via pip")
271+
272+
if not try_exec(
273+
[
274+
"python3",
161275
"-m",
162276
"ipykernel",
163277
"install",
@@ -166,7 +280,8 @@ def has_python_kernel() -> bool:
166280
"--display-name=Python 3",
167281
],
168282
timeout=120,
169-
)
283+
):
284+
raise RuntimeError("Failed to install python3 kernelspec")
170285

171286
if not has_python_kernel():
172287
raise RuntimeError("Failed to ensure python3 kernelspec is installed in project")
@@ -500,3 +615,39 @@ def cleanup():
500615
request.addfinalizer(cleanup)
501616

502617
yield
618+
619+
620+
@pytest.fixture(scope="session", autouse=True)
def cleanup_jupyter_kernels_session(project_client):
    """
    Stop every Jupyter kernel still listed once the test session is over.

    Everything after the ``yield`` is teardown, so it runs only when all tests
    have finished and cannot interfere with test execution. Each kernel
    returned by ``list_jupyter_kernels()`` is asked to stop and the outcome is
    printed per kernel. The whole cleanup is best-effort: errors are printed
    and never raised.
    """
    yield  # Allow all tests to run first

    banner = "=" * 70
    try:
        remaining = project_client.system.list_jupyter_kernels()
        if not remaining:
            return

        print(f"\n{banner}")
        print(f"CLEANING UP {len(remaining)} JUPYTER KERNELS FROM TEST SESSION")
        print(banner)
        for entry in remaining:
            try:
                pid = entry.get("pid")
                outcome = project_client.system.stop_jupyter_kernel(pid=pid)
                if outcome.get("success"):
                    report = f"✓ Stopped kernel PID {pid}"
                else:
                    report = f"✗ Failed to stop kernel PID {pid}"
                print(report)
            except Exception as e:
                # Keep going so one stubborn kernel does not block the rest
                print(f"✗ Error stopping kernel: {e}")
        print(f"{banner}\n")
    except Exception as e:
        print(f"Warning: Failed to clean up jupyter kernels: {e}")

src/python/cocalc-api/tests/test_jupyter.py

Lines changed: 35 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
Tests for Jupyter kernel functionality.
33
"""
44

5-
import time
6-
from typing import Optional
5+
import pytest
6+
7+
# Import helper from conftest
8+
from tests.conftest import retry_with_backoff
79

810

911
class TestJupyterKernelSetup:
@@ -68,10 +70,15 @@ class TestJupyterExecuteViaHub:
6870
"""Tests for executing code via hub.jupyter.execute()."""
6971

7072
def test_execute_simple_sum(self, hub, temporary_project):
71-
"""Test executing a simple sum using the python3 kernel."""
73+
"""Test executing a simple sum using the python3 kernel.
74+
75+
Note: First execution may take longer as kernel needs to start up (30+ seconds).
76+
"""
7277
project_id = temporary_project["project_id"]
7378

74-
result = hub.jupyter.execute(input="sum(range(100))", kernel="python3", project_id=project_id)
79+
result = retry_with_backoff(
80+
lambda: hub.jupyter.execute(input="sum(range(100))", kernel="python3", project_id=project_id)
81+
)
7582

7683
# Check the result structure
7784
assert isinstance(result, dict)
@@ -92,7 +99,9 @@ def test_execute_with_history(self, hub, temporary_project):
9299
"""Test executing code with history context."""
93100
project_id = temporary_project["project_id"]
94101

95-
result = hub.jupyter.execute(history=["a = 100"], input="sum(range(a + 1))", kernel="python3", project_id=project_id)
102+
result = retry_with_backoff(
103+
lambda: hub.jupyter.execute(history=["a = 100"], input="sum(range(a + 1))", kernel="python3", project_id=project_id)
104+
)
96105

97106
# Check the result (sum of 0..100 = 5050)
98107
assert isinstance(result, dict)
@@ -107,10 +116,15 @@ def test_execute_with_history(self, hub, temporary_project):
107116
assert first_output["data"]["text/plain"] == "5050"
108117

109118
def test_execute_print_statement(self, hub, temporary_project):
110-
"""Test executing code that prints output."""
119+
"""Test executing code that prints output.
120+
121+
Note: First execution may take longer as kernel needs to start up (30+ seconds).
122+
"""
111123
project_id = temporary_project["project_id"]
112124

113-
result = hub.jupyter.execute(input='print("Hello from Jupyter")', kernel="python3", project_id=project_id)
125+
result = retry_with_backoff(
126+
lambda: hub.jupyter.execute(input='print("Hello from Jupyter")', kernel="python3", project_id=project_id)
127+
)
114128

115129
# Check that we got output
116130
assert isinstance(result, dict)
@@ -138,21 +152,9 @@ def test_jupyter_execute_simple_sum(self, project_client):
138152
139153
Note: First execution may take longer as kernel needs to start up (30+ seconds).
140154
"""
141-
# Retry logic for first kernel startup
142-
max_retries = 3
143-
retry_delay = 15
144-
result: Optional[list] = None
145-
146-
for attempt in range(max_retries):
147-
try:
148-
result = project_client.system.jupyter_execute(input="sum(range(100))", kernel="python3")
149-
break
150-
except RuntimeError as e:
151-
if "timeout" in str(e).lower() and attempt < max_retries - 1:
152-
print(f"Attempt {attempt + 1} timed out, retrying in {retry_delay}s...")
153-
time.sleep(retry_delay)
154-
else:
155-
raise
155+
result = retry_with_backoff(
156+
lambda: project_client.system.jupyter_execute(input="sum(range(100))", kernel="python3")
157+
)
156158

157159
# Result is a list, not a dict with 'output' key
158160
assert isinstance(result, list)
@@ -169,8 +171,12 @@ def test_jupyter_execute_with_history(self, project_client):
169171
Test executing code with history via project API.
170172
171173
The result is a list of output items directly.
174+
175+
Note: First execution may take longer as kernel needs to start up (30+ seconds).
172176
"""
173-
result = project_client.system.jupyter_execute(history=["b = 50"], input="b * 2", kernel="python3")
177+
result = retry_with_backoff(
178+
lambda: project_client.system.jupyter_execute(history=["b = 50"], input="b * 2", kernel="python3")
179+
)
174180

175181
# Result is a list
176182
assert isinstance(result, list)
@@ -188,21 +194,9 @@ def test_jupyter_execute_list_operation(self, project_client):
188194
189195
The result is a list of output items directly.
190196
"""
191-
# Retry logic for kernel startup
192-
max_retries = 3
193-
retry_delay = 15
194-
result: Optional[list] = None
195-
196-
for attempt in range(max_retries):
197-
try:
198-
result = project_client.system.jupyter_execute(input="[x**2 for x in range(5)]", kernel="python3")
199-
break
200-
except RuntimeError as e:
201-
if "timeout" in str(e).lower() and attempt < max_retries - 1:
202-
print(f"Attempt {attempt + 1} timed out, retrying in {retry_delay}s...")
203-
time.sleep(retry_delay)
204-
else:
205-
raise
197+
result = retry_with_backoff(
198+
lambda: project_client.system.jupyter_execute(input="[x**2 for x in range(5)]", kernel="python3")
199+
)
206200

207201
# Result is a list
208202
assert isinstance(result, list)
@@ -221,20 +215,9 @@ class TestJupyterKernelManagement:
221215
def test_list_jupyter_kernels(self, project_client):
222216
"""Test listing running Jupyter kernels."""
223217
# First execute some code to ensure a kernel is running
224-
# Retry logic for first kernel startup (may take longer in CI)
225-
max_retries = 3
226-
retry_delay = 15
227-
228-
for attempt in range(max_retries):
229-
try:
230-
project_client.system.jupyter_execute(input="1+1", kernel="python3")
231-
break
232-
except RuntimeError as e:
233-
if "timeout" in str(e).lower() and attempt < max_retries - 1:
234-
print(f"Attempt {attempt + 1} timed out, retrying in {retry_delay}s...")
235-
time.sleep(retry_delay)
236-
else:
237-
raise
218+
retry_with_backoff(
219+
lambda: project_client.system.jupyter_execute(input="1+1", kernel="python3")
220+
)
238221

239222
# List kernels
240223
kernels = project_client.system.list_jupyter_kernels()

0 commit comments

Comments
 (0)