1010from cocalc_api import Hub , Project
1111
1212from psycopg2 import pool as pg_pool
13+ from typing import Callable , TypeVar , Any
1314
1415# Example PGHOST values, defined once so they are not repeated (DRY principle):
# a Unix-socket directory path and a network hostname.
1516PGHOST_SOCKET_EXAMPLE = "/path/to/cocalc-data/socket"
1617PGHOST_NETWORK_EXAMPLE = "localhost"
1718
T = TypeVar("T")


def retry_with_backoff(
    func: Callable[[], T],
    max_retries: int = 3,
    retry_delay: int = 5,
    error_condition: Callable[[RuntimeError], bool] = lambda e: any(
        keyword in str(e).lower() for keyword in ["timeout", "closed", "connection", "reset", "broken"]
    ),
) -> T:
    """
    Call ``func`` and retry it on transient ``RuntimeError`` failures.

    This helper is useful for operations that may timeout or fail on first attempt due to
    cold starts (e.g., kernel launches) or transient connection issues.

    Despite the name, the wait between attempts is a fixed ``retry_delay``; it does
    not grow from one attempt to the next.

    Args:
        func: Zero-argument callable that performs the operation.
        max_retries: Maximum number of attempts, including the first one (default: 3).
            Must be at least 1.
        retry_delay: Delay in seconds between attempts (default: 5).
        error_condition: Predicate deciding whether a ``RuntimeError`` should trigger
            a retry. Defaults to checking the lower-cased message for
            timeout/connection-related keywords.

    Returns:
        The result of the first successful ``func()`` call.

    Raises:
        ValueError: If ``max_retries`` is less than 1 (``func`` is never called).
        RuntimeError: Re-raised from ``func`` when the error is not retryable or the
            last attempt has failed. Exceptions of other types are never caught.
    """
    # Imported locally so the helper does not rely on a module-level ``import time``.
    import time

    # Without this guard the loop below would not run at all and the function
    # would silently return None without ever calling ``func``.
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            return func()
        except RuntimeError as e:
            is_retryable = error_condition(e)
            if not is_retryable or attempt == max_retries:
                # Bare ``raise`` keeps the original traceback for the caller.
                raise
            error_msg = str(e).lower()
            print(f"Attempt {attempt} failed ({error_msg[:50]}...), retrying in {retry_delay}s...")
            time.sleep(retry_delay)

    # Every loop iteration either returns or raises; this only satisfies type checkers.
    raise AssertionError("unreachable: retry loop exited without returning or raising")
60+
1861
1962def assert_valid_uuid (value , description = "value" ):
2063 """
@@ -125,39 +168,110 @@ def project_client(temporary_project, api_key, cocalc_host):
125168 return Project (project_id = temporary_project ['project_id' ], api_key = api_key , host = cocalc_host )
126169
127170
@pytest.fixture(autouse=True)
def cleanup_kernels_after_test(request, project_client):
    """
    Stop excess Jupyter kernels after each test in the Jupyter test classes.

    The original author's note: kernel accumulation happens because the kernel
    pool reuses kernels, but under heavy test load old kernels aren't always
    cleaned up by the pool.

    This is a function-scoped autouse fixture, so the teardown below runs after
    EVERY test whose class is one of TestJupyterExecuteViaHub,
    TestJupyterExecuteViaProject or TestJupyterKernelManagement. Tests outside
    those classes (or outside any class) are left untouched. To avoid fighting
    the pool's reuse strategy, nothing is stopped unless more than 3 kernels
    are listed, and the 2 kernels with the highest PIDs are always kept.

    Cleanup is best-effort: failures are reported with a printed warning and
    never fail the test.
    """
    yield  # Allow test to run

    # Only cleanup for Jupyter-related tests
    test_class = request.cls
    if test_class is None:
        return

    jupyter_test_classes = {
        'TestJupyterExecuteViaHub',
        'TestJupyterExecuteViaProject',
        'TestJupyterKernelManagement',
    }
    if test_class.__name__ not in jupyter_test_classes:
        return

    import time

    try:
        kernels = project_client.system.list_jupyter_kernels()
    except Exception as err:
        # If listing kernels fails, just continue with the test run
        print(f"Warning: Failed to list jupyter kernels for cleanup: {err}")
        return

    # Only cleanup if significantly over pool size (pool size is 2, per the
    # original comment); the threshold of 3 leaves the pool some buffer.
    if len(kernels) <= 3:
        return

    # Entries without a PID cannot be stopped, so leave them out up front
    # instead of letting the stop call fail on a missing key.
    stoppable = [kernel for kernel in kernels if kernel.get("pid") is not None]

    # Heuristic: a higher PID is treated as a more recently started kernel.
    stoppable.sort(key=lambda kernel: kernel["pid"])
    kernels_to_stop = stoppable[:-2]  # All but the 2 newest

    for kernel in kernels_to_stop:
        try:
            project_client.system.stop_jupyter_kernel(pid=kernel["pid"])
            time.sleep(0.1)  # Small delay between kills
        except Exception as err:
            # A single kernel failing to stop must not abort the remaining cleanup
            print(f"Warning: Failed to stop jupyter kernel PID {kernel['pid']}: {err}")
225+
226+
128227def ensure_python3_kernel (project_client : Project ):
129228 """
130229 Ensure the default python3 Jupyter kernel is installed in the project.
131230
132231 If not available, install ipykernel and register the kernelspec.
133232 """
134233
135- def has_python_kernel () -> bool :
234+ def try_exec ( command : list [ str ], timeout : int = 60 , capture_stdout : bool = False ) :
136235 try :
137236 result = project_client .system .exec (
138- command = "python3" ,
139- args = [ "-m" , "jupyter" , "kernelspec" , "list" , "--json" ],
140- timeout = 60 ,
237+ command = command [ 0 ] ,
238+ args = command [ 1 : ],
239+ timeout = timeout ,
141240 )
142- data = json .loads (result ["stdout" ])
143- kernelspecs = data .get ("kernelspecs" , {})
144- return "python3" in kernelspecs
241+ return (True , result ["stdout" ] if capture_stdout else None )
242+ except Exception as err :
243+ print (f"Warning: command { command } failed: { err } " )
244+ return (False , None )
245+
246+ def has_python_kernel () -> bool :
247+ ok , stdout = try_exec (
248+ ["python3" , "-m" , "jupyter" , "kernelspec" , "list" , "--json" ],
249+ capture_stdout = True ,
250+ )
251+ if not ok or stdout is None :
252+ return False
253+ try :
254+ data = json .loads (stdout )
255+ return "python3" in data .get ("kernelspecs" , {})
145256 except Exception as err :
146- print (f"Warning: Failed to list kernelspecs : { err } " )
257+ print (f"Warning: Failed to parse kernelspec list : { err } " )
147258 return False
148259
149260 if has_python_kernel ():
150261 return
151262
152263 print ("Installing python3 kernelspec in project..." )
153- project_client .system .exec (
154- command = "python3" ,
155- args = ["-m" , "pip" , "install" , "--user" , "ipykernel" ],
156- timeout = 300 ,
157- )
158- project_client .system .exec (
159- command = "python3" ,
160- args = [
264+ # Install pip if needed
265+ try_exec (["python3" , "-m" , "ensurepip" , "--user" ], timeout = 120 )
266+ # Upgrade pip but ignore errors (not fatal)
267+ try_exec (["python3" , "-m" , "pip" , "install" , "--user" , "--upgrade" , "pip" ], timeout = 120 )
268+
269+ if not try_exec (["python3" , "-m" , "pip" , "install" , "--user" , "ipykernel" ], timeout = 300 ):
270+ raise RuntimeError ("Failed to install ipykernel via pip" )
271+
272+ if not try_exec (
273+ [
274+ "python3" ,
161275 "-m" ,
162276 "ipykernel" ,
163277 "install" ,
@@ -166,7 +280,8 @@ def has_python_kernel() -> bool:
166280 "--display-name=Python 3" ,
167281 ],
168282 timeout = 120 ,
169- )
283+ ):
284+ raise RuntimeError ("Failed to install python3 kernelspec" )
170285
171286 if not has_python_kernel ():
172287 raise RuntimeError ("Failed to ensure python3 kernelspec is installed in project" )
@@ -500,3 +615,39 @@ def cleanup():
500615 request .addfinalizer (cleanup )
501616
502617 yield
618+
619+
@pytest.fixture(scope="session", autouse=True)
def cleanup_jupyter_kernels_session(project_client):
    """
    Stop every Jupyter kernel still listed once the test session has finished.

    The fixture yields immediately, so all tests run first; the teardown then
    lists the project's kernels and asks for each one to be stopped, printing
    one status line per kernel. This keeps orphaned kernel processes from
    accumulating across test runs.

    Cleanup is best-effort: any failure is printed and never raised.

    NOTE(review): being session-scoped, this fixture needs ``project_client`` to
    be session-scoped as well - confirm against its definition.
    """
    yield  # Allow all tests to run first

    # After all tests complete, clean up all remaining kernels
    try:
        kernels = project_client.system.list_jupyter_kernels()
    except Exception as e:
        print(f"Warning: Failed to clean up jupyter kernels: {e}")
        return

    if not kernels:
        return

    separator = "=" * 70
    print(f"\n{separator}")
    print(f"CLEANING UP {len(kernels)} JUPYTER KERNELS FROM TEST SESSION")
    print(separator)
    for kernel in kernels:
        pid = None  # Keeps the error message below well-defined if the lookup itself fails
        try:
            pid = kernel.get("pid")
            if pid is None:
                # Without a PID there is nothing meaningful to pass to the stop call
                print("✗ Skipping kernel entry without a PID")
                continue
            result = project_client.system.stop_jupyter_kernel(pid=pid)
            if result.get("success"):
                print(f"✓ Stopped kernel PID {pid}")
            else:
                print(f"✗ Failed to stop kernel PID {pid}")
        except Exception as e:
            # One kernel failing must not prevent the others from being stopped
            print(f"✗ Error stopping kernel PID {pid}: {e}")
    print(f"{separator}\n")
0 commit comments