⚡️ Speed up function module_exists_in_site_packages by 38%
#598
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
📄 38% (0.38x) speedup for `module_exists_in_site_packages` in `marimo/_utils/site_packages.py`
⏱️ Runtime: 6.33 milliseconds → 4.57 milliseconds (best of 42 runs)
📝 Explanation and details
The optimization achieves a 38% speedup by addressing the most expensive operations in the original code's hot loop. The key improvements are:
Primary optimization - Early filtering before string splits:
The original code performed `entry.split("-", 1)[0]` on every file in site-packages directories (16,503 hits taking 23.5% of total time). The optimized version first checks `if not entry.endswith(suffixes)` to skip entries that can't possibly be package metadata files, reducing splits from 16,503 to just 4,340 operations - a 74% reduction in the most expensive operation.
Function call locality:
Moving `os.path.join`, `os.path.isdir`, `os.path.isfile`, `os.path.exists`, and `os.listdir` to local variables eliminates repeated global lookups in the inner loops, providing consistent micro-performance gains across all directory operations.
Optimized suffix checking:
Using a single `endswith(suffixes)` call with a tuple instead of three separate `or` conditions reduces function calls and improves readability of the filtering logic.
Better error handling:
Adding a try/except around `os.listdir` specifically handles permission errors per directory without affecting the overall exception handling, making the function more robust.
Performance characteristics:
- `test_large_number_of_egg_info_dirs`: 60.6% faster
- `test_module_name_with_dash`: 36.5% faster
The optimization is particularly effective for real-world site-packages directories that contain many files and directories, where the early filtering prevents unnecessary string processing on irrelevant entries.
✅ Correctness verification report:
⚙️ Existing Unit Tests and Runtime
`_utils/test_site_packages.py::test_module_exists_in_site_packages`
🌀 Generated Regression Tests and Runtime
import os
import shutil
import site
import sys
import tempfile
# imports
import pytest
from marimo._utils.site_packages import module_exists_in_site_packages
# ------------------ UNIT TESTS ------------------
# Helper to temporarily add/remove a fake site-packages directory
class TempSitePackages:
    """Context manager that points ``site`` at a throwaway site-packages directory.

    On entry, ``site.getsitepackages`` is replaced by a callable returning a
    one-element list holding the scratch directory, and
    ``site.getusersitepackages`` by a callable returning that directory.
    The ``with`` target is the scratch directory path. On exit the original
    callables are restored and the directory is removed.

    NOTE(review): the scraped page only showed the constructor (rendered as
    ``init``); the enter/exit behaviour is reconstructed from how the tests
    use ``with TempSitePackages() as site_dir`` -- confirm against the
    generated test file.
    """

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp(prefix="fake_site_packages_")
        self.orig_getsitepackages = site.getsitepackages
        # May be None on interpreters whose ``site`` lacks the user-site helper.
        self.orig_getusersitepackages = getattr(site, "getusersitepackages", None)

    def __enter__(self):
        site.getsitepackages = lambda: [self.temp_dir]
        site.getusersitepackages = lambda: self.temp_dir
        return self.temp_dir

    def __exit__(self, exc_type, exc_value, traceback):
        site.getsitepackages = self.orig_getsitepackages
        if self.orig_getusersitepackages is not None:
            site.getusersitepackages = self.orig_getusersitepackages
        elif hasattr(site, "getusersitepackages"):
            # The attribute did not exist before entry: drop the fake one.
            del site.getusersitepackages
        # ignore_errors: a test may have changed permissions inside the tree.
        shutil.rmtree(self.temp_dir, ignore_errors=True)
        return False  # never swallow exceptions raised inside the with-body
# ------------------ BASIC TEST CASES ------------------
def test_existing_standard_module_by_py_file():
    """Probe two standard-library names; they are expected to be absent from site-packages."""
    # Each lookup is negated before being recorded, matching the "should return False" expectation.
    codeflash_output = not module_exists_in_site_packages("os")  # 138μs -> 141μs (1.99% slower)
    codeflash_output = not module_exists_in_site_packages("sys")  # 101μs -> 104μs (3.02% slower)
def test_existing_installed_module():
    """Look up 'pytest', which is expected to be pip-installed wherever these tests run."""
    codeflash_output = module_exists_in_site_packages("pytest")  # 17.4μs -> 17.2μs (0.754% faster)
    result = codeflash_output
def test_nonexistent_module():
    """Look up a name that should not exist anywhere; the negated result is recorded."""
    codeflash_output = not module_exists_in_site_packages("this_module_should_not_exist_12345")  # 123μs -> 127μs (3.18% slower)
def test_existing_module_by_package_dir():
    """Create a bare package directory in the fake site-packages and look it up by name."""
    with TempSitePackages() as fake_site:
        package_path = os.path.join(fake_site, "mypackage")
        os.mkdir(package_path)
        codeflash_output = module_exists_in_site_packages("mypackage")  # 7.47μs -> 8.45μs (11.6% slower)
def test_existing_module_by_py_file():
    """Write a single-file module into the fake site-packages and look it up by stem."""
    with TempSitePackages() as fake_site:
        module_path = os.path.join(fake_site, "myfile.py")
        with open(module_path, "w") as handle:
            handle.write("# dummy python file")
        codeflash_output = module_exists_in_site_packages("myfile")  # 14.7μs -> 14.8μs (0.828% slower)
def test_existing_module_by_dist_info():
    """Create only a ``.dist-info`` directory and look up the distribution name."""
    with TempSitePackages() as fake_site:
        metadata_path = os.path.join(fake_site, "somepkg-1.0.0.dist-info")
        os.mkdir(metadata_path)
        codeflash_output = module_exists_in_site_packages("somepkg")  # 23.7μs -> 23.8μs (0.265% slower)
def test_existing_module_by_egg_info():
    """Create only a ``.egg-info`` directory and look up the distribution name."""
    with TempSitePackages() as fake_site:
        metadata_path = os.path.join(fake_site, "anotherpkg-2.3.4.egg-info")
        os.mkdir(metadata_path)
        codeflash_output = module_exists_in_site_packages("anotherpkg")  # 22.4μs -> 22.6μs (0.651% slower)
def test_existing_module_by_egg():
    """Create only a ``.egg`` directory and look up the distribution name."""
    with TempSitePackages() as fake_site:
        egg_path = os.path.join(fake_site, "yetanother-0.1.egg")
        os.mkdir(egg_path)
        codeflash_output = module_exists_in_site_packages("yetanother")  # 22.1μs -> 22.4μs (1.43% slower)
# ------------------ EDGE TEST CASES ------------------
def test_empty_string_module_name():
    """Look up the empty string; the original note expects a False result."""
    codeflash_output = module_exists_in_site_packages("")  # 19.6μs -> 19.5μs (0.543% faster)
def test_special_characters_in_module_name():
    """Look up a name made only of punctuation; the original note expects a False result."""
    codeflash_output = module_exists_in_site_packages("!@#$%^&*()")  # 132μs -> 136μs (3.31% slower)
def test_module_name_with_dot():
    """Create a directory literally named ``foo.bar`` and look up that dotted name.

    NOTE(review): the original comments disagree on the expected outcome
    ("should not match" vs. "exact match, so this should work"); no assertion
    is visible here to settle it.
    """
    with TempSitePackages() as fake_site:
        dotted_path = os.path.join(fake_site, "foo.bar")
        os.mkdir(dotted_path)
        codeflash_output = module_exists_in_site_packages("foo.bar")  # 8.00μs -> 8.20μs (2.40% slower)
def test_module_name_with_dash():
    """Create an ``.egg-info`` directory whose project name contains a dash and look it up."""
    with TempSitePackages() as fake_site:
        metadata_path = os.path.join(fake_site, "foo-bar-1.0.egg-info")
        os.mkdir(metadata_path)
        codeflash_output = module_exists_in_site_packages("foo-bar")  # 31.4μs -> 23.0μs (36.5% faster)
def test_module_name_with_underscore():
    """Create an ``.egg-info`` directory whose project name contains an underscore and look it up."""
    with TempSitePackages() as fake_site:
        metadata_path = os.path.join(fake_site, "foo_bar-1.0.egg-info")
        os.mkdir(metadata_path)
        codeflash_output = module_exists_in_site_packages("foo_bar")  # 21.6μs -> 21.9μs (1.27% slower)
def test_site_packages_dir_does_not_exist():
    """Point ``site.getsitepackages`` at a path that does not exist and run a lookup."""
    saved_getter = site.getsitepackages
    site.getsitepackages = lambda: ["/path/does/not/exist"]
    try:
        codeflash_output = module_exists_in_site_packages("anymodule")
    finally:
        # Always put the real getter back, even if the lookup raised.
        site.getsitepackages = saved_getter
def test_permission_denied_on_site_packages(monkeypatch):
    """Make ``os.listdir`` raise ``PermissionError`` and look up an existing package directory."""
    with TempSitePackages() as fake_site:
        package_path = os.path.join(fake_site, "mypkg")
        os.mkdir(package_path)

        def deny_listing(path):
            raise PermissionError("No access")

        # pytest's monkeypatch fixture undoes this replacement after the test.
        monkeypatch.setattr(os, "listdir", deny_listing)
        codeflash_output = module_exists_in_site_packages("mypkg")  # 7.76μs -> 7.91μs (1.82% slower)
def test_module_exists_in_multiple_site_packages():
    """Expose two site-packages roots and place the package in the second one only."""
    with tempfile.TemporaryDirectory() as first_root, tempfile.TemporaryDirectory() as second_root:
        saved_getter = site.getsitepackages
        site.getsitepackages = lambda: [first_root, second_root]
        try:
            package_path = os.path.join(second_root, "mysharedpkg")
            os.mkdir(package_path)
            codeflash_output = module_exists_in_site_packages("mysharedpkg")
        finally:
            site.getsitepackages = saved_getter
def test_module_exists_with_confusing_egg_info_name():
    """Look up several prefixes of a multi-dash ``.egg-info`` directory name.

    Per the original note, only the part before the first dash ("foo") is
    expected to match.
    """
    with TempSitePackages() as fake_site:
        metadata_path = os.path.join(fake_site, "foo-bar-baz-1.0.egg-info")
        os.mkdir(metadata_path)
        codeflash_output = module_exists_in_site_packages("foo")  # 23.0μs -> 23.5μs (2.38% slower)
        codeflash_output = module_exists_in_site_packages("foo-bar")
        codeflash_output = module_exists_in_site_packages("foo-bar-baz")  # 17.7μs -> 10.4μs (69.7% faster)
# ------------------ LARGE SCALE TEST CASES ------------------
def test_large_number_of_entries_performance():
    """Fill the fake site-packages with 999 unrelated directories plus one target package."""
    with TempSitePackages() as fake_site:
        for index in range(999):
            os.mkdir(os.path.join(fake_site, f"unrelated_{index}"))
        os.mkdir(os.path.join(fake_site, "targetpkg"))
        codeflash_output = module_exists_in_site_packages("targetpkg")  # 11.4μs -> 11.4μs (0.369% faster)
        # Second lookup targets a name that was never created.
        codeflash_output = module_exists_in_site_packages("notpresent")
def test_large_number_of_egg_info_entries():
    """Create 998 numbered ``.egg-info`` directories plus one distinct one, then look up two names."""
    with TempSitePackages() as fake_site:
        for index in range(998):
            os.mkdir(os.path.join(fake_site, f"pkg{index}-1.0.egg-info"))
        os.mkdir(os.path.join(fake_site, "specialpkg-9.9.egg-info"))
        codeflash_output = module_exists_in_site_packages("specialpkg")  # 267μs -> 276μs (3.23% slower)
        # pkg999 is outside the 0..997 range created above.
        codeflash_output = module_exists_in_site_packages("pkg999")
def test_large_number_of_py_files():
    """Create 997 numbered ``.py`` files plus one distinct one, then look up two names."""
    with TempSitePackages() as fake_site:
        for index in range(997):
            with open(os.path.join(fake_site, f"file{index}.py"), "w") as handle:
                handle.write("# dummy")
        with open(os.path.join(fake_site, "uniquefile.py"), "w") as handle:
            handle.write("# unique dummy")
        codeflash_output = module_exists_in_site_packages("uniquefile")  # 20.1μs -> 19.6μs (2.28% faster)
        # file998 is outside the 0..996 range created above.
        codeflash_output = module_exists_in_site_packages("file998")
def test_large_number_of_site_packages_dirs():
    """Expose ten site-packages roots, with the package present only in the seventh."""
    temp_dirs = [tempfile.mkdtemp(prefix="sp_") for _ in range(10)]
    try:
        saved_getter = site.getsitepackages
        site.getsitepackages = lambda: temp_dirs
        os.mkdir(os.path.join(temp_dirs[6], "bigpkg"))
        codeflash_output = module_exists_in_site_packages("bigpkg")
        # This name exists in none of the ten roots.
        codeflash_output = module_exists_in_site_packages("notthere")
    finally:
        site.getsitepackages = saved_getter
        for root in temp_dirs:
            shutil.rmtree(root)
def test_performance_with_mixed_entries():
    """Mix 250 each of directories, ``.py`` files, ``.egg-info`` and ``.dist-info`` entries, then run three lookups."""
    with TempSitePackages() as fake_site:
        for index in range(250):
            os.mkdir(os.path.join(fake_site, f"dir{index}"))
            with open(os.path.join(fake_site, f"file{index}.py"), "w") as handle:
                handle.write("# dummy")
            os.mkdir(os.path.join(fake_site, f"pkg{index}-0.1.egg-info"))
            os.mkdir(os.path.join(fake_site, f"pkg{index}-0.1.dist-info"))
        # A package present only as a single .py file.
        with open(os.path.join(fake_site, "superpkg.py"), "w") as handle:
            handle.write("# superpkg")
        codeflash_output = module_exists_in_site_packages("superpkg")  # 19.6μs -> 19.9μs (1.30% slower)
        # A package present only as a .dist-info directory.
        os.mkdir(os.path.join(fake_site, "distonly-1.0.dist-info"))
        codeflash_output = module_exists_in_site_packages("distonly")
        # A name that was never created.
        codeflash_output = module_exists_in_site_packages("notfoundhere")  # 336μs -> 353μs (4.71% slower)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
import os
import shutil
import sys
import tempfile
import types
# imports
import pytest
from marimo._utils.site_packages import module_exists_in_site_packages
# =======================
# Unit tests start here
# =======================
# Helper: context manager to add a temp directory to site.getsitepackages()
class TempSitePackages:
    """Temporarily redirect site.getsitepackages() and getusersitepackages() to a scratch directory.

    Used as ``with TempSitePackages() as site_dir``: the ``with`` target is
    the scratch directory path. On exit the previous ``site`` callables are
    restored and the directory is removed.

    NOTE(review): the scraped page only showed the constructor (rendered as
    ``init``); the enter/exit behaviour is reconstructed from how the tests
    use the helper -- confirm against the generated test file.
    """

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
        # Filled in by __enter__; None means "attribute was absent".
        self.orig_getsitepackages = None
        self.orig_getusersitepackages = None

    def __enter__(self):
        # Local import: this test module does not import ``site`` at top level.
        import site

        self.orig_getsitepackages = getattr(site, "getsitepackages", None)
        self.orig_getusersitepackages = getattr(site, "getusersitepackages", None)
        site.getsitepackages = lambda: [self.temp_dir]
        site.getusersitepackages = lambda: self.temp_dir
        return self.temp_dir

    def __exit__(self, exc_type, exc_value, traceback):
        import site

        for attr, saved in (
            ("getsitepackages", self.orig_getsitepackages),
            ("getusersitepackages", self.orig_getusersitepackages),
        ):
            if saved is not None:
                setattr(site, attr, saved)
            elif hasattr(site, attr):
                # The attribute did not exist before entry: drop the fake one.
                delattr(site, attr)
        # ignore_errors: a test may have changed permissions inside the tree.
        shutil.rmtree(self.temp_dir, ignore_errors=True)
        return False  # never swallow exceptions raised inside the with-body
# -----------------------
# 1. Basic Test Cases
# -----------------------
def test_existing_package_dir():
    """Look up a package that exists as a plain directory; expected result is True."""
    with TempSitePackages() as fake_site:
        package_path = os.path.join(fake_site, "mytestpkg")
        os.mkdir(package_path)
        codeflash_output = module_exists_in_site_packages("mytestpkg")  # 7.02μs -> 7.28μs (3.61% slower)
def test_existing_module_py_file():
    """Look up a module that exists as a single ``.py`` file; expected result is True."""
    with TempSitePackages() as fake_site:
        module_path = os.path.join(fake_site, "mymodule.py")
        with open(module_path, "w") as handle:
            handle.write("# dummy module")
        codeflash_output = module_exists_in_site_packages("mymodule")  # 14.7μs -> 14.5μs (0.971% faster)
def test_existing_egg_info():
    """Look up a name backed only by an empty, version-less ``mylib.egg-info`` file.

    The original docstring expects True for this entry.
    """
    with TempSitePackages() as fake_site:
        metadata_path = os.path.join(fake_site, "mylib.egg-info")
        with open(metadata_path, "w") as handle:
            handle.write("")
        codeflash_output = module_exists_in_site_packages("mylib")  # 29.1μs -> 20.9μs (38.9% faster)
def test_existing_dist_info():
    """Look up a name backed only by a ``.dist-info`` directory; expected result is True."""
    with TempSitePackages() as fake_site:
        metadata_path = os.path.join(fake_site, "otherlib-1.0.0.dist-info")
        os.mkdir(metadata_path)
        codeflash_output = module_exists_in_site_packages("otherlib")  # 21.5μs -> 22.1μs (2.92% slower)
def test_existing_egg():
    """Look up a name backed only by an ``.egg`` directory; expected result is True."""
    with TempSitePackages() as fake_site:
        egg_path = os.path.join(fake_site, "eggmodule-0.1.egg")
        os.mkdir(egg_path)
        codeflash_output = module_exists_in_site_packages("eggmodule")  # 22.0μs -> 22.6μs (2.62% slower)
def test_nonexistent_module():
    """Look up a name in an empty fake site-packages; expected result is False."""
    with TempSitePackages() as fake_site:
        codeflash_output = module_exists_in_site_packages("idontexist")  # 28.6μs -> 20.7μs (38.1% faster)
# -----------------------
# 2. Edge Test Cases
# -----------------------
def test_empty_module_name():
    """Look up the empty string in an empty fake site-packages; expected result is False."""
    with TempSitePackages() as fake_site:
        codeflash_output = module_exists_in_site_packages("")  # 8.00μs -> 8.15μs (1.87% slower)
def test_module_name_with_dot():
    """Look up the dotted name ``foo.bar`` when only a top-level ``foo`` package exists.

    Per the original docstring, submodule paths are not expected to match.
    """
    with TempSitePackages() as fake_site:
        package_path = os.path.join(fake_site, "foo")
        os.mkdir(package_path)
        codeflash_output = module_exists_in_site_packages("foo.bar")  # 29.4μs -> 22.3μs (32.1% faster)
def test_module_name_with_special_chars():
    """Look up both the underscore and the hyphen spelling of a ``.dist-info`` project name."""
    with TempSitePackages() as fake_site:
        # Importable module names cannot contain hyphens, but metadata directory names may.
        metadata_path = os.path.join(fake_site, "my_mod-1.0.0.dist-info")
        os.mkdir(metadata_path)
        codeflash_output = module_exists_in_site_packages("my_mod")  # 21.5μs -> 21.6μs (0.448% slower)
        codeflash_output = module_exists_in_site_packages("my-mod")
def test_case_sensitivity():
    """Look up a mixed-case package directory by a lower-cased and by its exact name."""
    with TempSitePackages() as fake_site:
        package_path = os.path.join(fake_site, "CaseTest")
        os.mkdir(package_path)
        # The lower-cased lookup may succeed on case-insensitive filesystems
        # (Windows/macOS); on Linux it is not expected to.
        codeflash_output = module_exists_in_site_packages("casetest")  # 28.5μs -> 20.2μs (41.0% faster)
        result = codeflash_output
        # The exact-case name is expected to be found everywhere.
        codeflash_output = module_exists_in_site_packages("CaseTest")
def test_site_packages_dir_missing():
    """Should handle non-existent site-packages directory gracefully.

    Both ``site.getsitepackages`` and ``site.getusersitepackages`` are
    replaced by fakes that return paths which do not exist, and the originals
    are restored afterwards.
    """
    import site

    orig_getsitepackages = site.getsitepackages
    orig_getusersitepackages = getattr(site, "getusersitepackages", None)

    def fake_getsitepackages():
        return ["/nonexistent/path/to/site-packages"]

    def fake_getusersitepackages():
        return "/nonexistent/path/to/user-site-packages"

    site.getsitepackages = fake_getsitepackages
    site.getusersitepackages = fake_getusersitepackages
    try:
        codeflash_output = module_exists_in_site_packages("anymodule")
    finally:
        site.getsitepackages = orig_getsitepackages
        if orig_getusersitepackages is not None:
            site.getusersitepackages = orig_getusersitepackages
        else:
            # The attribute did not exist before this test: remove the fake
            # so it does not leak into later tests.
            del site.getusersitepackages
def test_site_packages_not_callable():
    """Run a lookup while ``site`` has neither ``getsitepackages`` nor ``getusersitepackages``.

    Per the original docstring this mimics some virtualenvs; the lookup is
    expected not to raise and to return False.
    """
    import site

    saved_site_getter = getattr(site, "getsitepackages", None)
    saved_user_getter = getattr(site, "getusersitepackages", None)
    # Strip whichever helpers are currently present.
    if hasattr(site, "getsitepackages"):
        delattr(site, "getsitepackages")
    if hasattr(site, "getusersitepackages"):
        delattr(site, "getusersitepackages")
    try:
        codeflash_output = module_exists_in_site_packages("foo")
    finally:
        # Put back only the helpers that existed beforehand.
        if saved_site_getter is not None:
            site.getsitepackages = saved_site_getter
        if saved_user_getter is not None:
            site.getusersitepackages = saved_user_getter
def test_permission_error_on_site_packages():
    """Strip all permissions from the fake site-packages and run a lookup; expected result is False."""
    with TempSitePackages() as fake_site:
        os.chmod(fake_site, 0o000)  # no read/write/execute for anyone
        try:
            codeflash_output = module_exists_in_site_packages("foo")
        finally:
            # Give the owner access back so the directory can be cleaned up.
            os.chmod(fake_site, 0o700)
def test_symlinked_package_dir():
    """Should detect package that is a symlink in site-packages.

    The symlink target lives outside the fake site-packages and is removed in
    a ``finally`` block so it does not leak if the lookup raises.
    """
    with TempSitePackages() as site_dir:
        real_dir = tempfile.mkdtemp()
        try:
            symlink_dir = os.path.join(site_dir, "symlinkpkg")
            os.symlink(real_dir, symlink_dir)
            codeflash_output = module_exists_in_site_packages("symlinkpkg")  # 8.35μs -> 8.43μs (0.937% slower)
        finally:
            shutil.rmtree(real_dir)
def test_symlinked_py_file():
    """Should detect .py file that is a symlink in site-packages.

    ``tempfile.mkstemp`` returns an open OS-level file descriptor along with
    the path; the descriptor is closed immediately because only the path is
    needed, and the file is removed in a ``finally`` block.
    """
    with TempSitePackages() as site_dir:
        fd, real_py = tempfile.mkstemp(suffix=".py")
        os.close(fd)  # only the path is used; avoid leaking the descriptor
        try:
            symlink_py = os.path.join(site_dir, "symlinkmod.py")
            os.symlink(real_py, symlink_py)
            codeflash_output = module_exists_in_site_packages("symlinkmod")  # 15.3μs -> 15.0μs (1.85% faster)
        finally:
            os.remove(real_py)
def test_file_named_like_egginfo():
    """Look up a name backed only by a regular file (not a directory) named like ``.egg-info`` metadata.

    Per the original docstring, such a file is not expected to match.
    """
    with TempSitePackages() as fake_site:
        lookalike_path = os.path.join(fake_site, "notapkg-1.0.0.egg-info")
        with open(lookalike_path, "w") as handle:
            handle.write("")
        codeflash_output = module_exists_in_site_packages("notapkg")  # 21.3μs -> 21.5μs (1.11% slower)
def test_dotfile_and_hidden_entries():
    """Create a dot-prefixed directory and look it up with and without the leading dot."""
    with TempSitePackages() as fake_site:
        hidden_path = os.path.join(fake_site, ".hiddenpkg")
        os.mkdir(hidden_path)
        codeflash_output = module_exists_in_site_packages(".hiddenpkg")  # 7.80μs -> 8.19μs (4.80% slower)
        # Same name without the leading dot.
        codeflash_output = module_exists_in_site_packages("hiddenpkg")
# -----------------------
# 3. Large Scale Test Cases
# -----------------------
def test_large_number_of_packages():
    """Create 500 numbered package directories plus one distinct one, then run three lookups."""
    with TempSitePackages() as fake_site:
        for index in range(500):
            os.mkdir(os.path.join(fake_site, f"pkg{index}"))
        # One distinct package added after the numbered ones.
        os.mkdir(os.path.join(fake_site, "specialpkg"))
        codeflash_output = module_exists_in_site_packages("specialpkg")  # 11.4μs -> 11.2μs (2.10% faster)
        codeflash_output = module_exists_in_site_packages("pkg250")
        codeflash_output = module_exists_in_site_packages("notpresent")  # 4.04μs -> 3.86μs (4.64% faster)
def test_large_number_of_files():
    """Create 500 numbered ``.py`` files plus one distinct one, then run three lookups."""
    with TempSitePackages() as fake_site:
        for index in range(500):
            with open(os.path.join(fake_site, f"mod{index}.py"), "w") as handle:
                handle.write("# dummy\n")
        with open(os.path.join(fake_site, "targetmod.py"), "w") as handle:
            handle.write("# target module\n")
        codeflash_output = module_exists_in_site_packages("targetmod")  # 19.7μs -> 20.0μs (1.24% slower)
        codeflash_output = module_exists_in_site_packages("mod499")
        codeflash_output = module_exists_in_site_packages("notamodule")  # 9.03μs -> 8.64μs (4.41% faster)
def test_large_number_of_egg_info_dirs():
    """Create 500 numbered ``.egg-info`` directories plus one with a dashed project name, then run three lookups."""
    with TempSitePackages() as fake_site:
        for index in range(500):
            os.mkdir(os.path.join(fake_site, f"lib{index}-0.0.1.egg-info"))
        os.mkdir(os.path.join(fake_site, "final-egg-1.2.3.egg-info"))
        codeflash_output = module_exists_in_site_packages("final-egg")  # 335μs -> 209μs (60.6% faster)
        codeflash_output = module_exists_in_site_packages("lib123")
        codeflash_output = module_exists_in_site_packages("notegg")  # 121μs -> 131μs (7.82% slower)
def test_performance_with_mixed_entries():
    """Mix 200 each of directories, ``.py`` files and ``.egg-info`` directories, then run four lookups."""
    with TempSitePackages() as fake_site:
        for index in range(200):
            os.mkdir(os.path.join(fake_site, f"dirpkg{index}"))
            with open(os.path.join(fake_site, f"filemod{index}.py"), "w") as handle:
                handle.write("# dummy\n")
            os.mkdir(os.path.join(fake_site, f"eggmod{index}-0.0.1.egg-info"))
        # One distinct single-file module.
        with open(os.path.join(fake_site, "uniquemod.py"), "w") as handle:
            handle.write("# unique\n")
        codeflash_output = module_exists_in_site_packages("uniquemod")  # 20.3μs -> 20.7μs (1.91% slower)
        codeflash_output = module_exists_in_site_packages("dirpkg199")
        codeflash_output = module_exists_in_site_packages("eggmod199")  # 4.35μs -> 4.20μs (3.79% faster)
        codeflash_output = module_exists_in_site_packages("notfound")
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
from marimo._utils.site_packages import module_exists_in_site_packages
def test_module_exists_in_site_packages():
    """Call the function once with an empty module name; the return value is discarded."""
    module_exists_in_site_packages('')
🔎 Concolic Coverage Tests and Runtime
`codeflash_concolic_bps3n5s8/tmpbxnbh6u7/test_concolic_coverage.py::test_module_exists_in_site_packages`
To edit these changes, run `git checkout codeflash/optimize-module_exists_in_site_packages-mhv6qv43` and push.