Skip to content

Commit

Permalink
Use an LRU to hold documents loaded from the filesystem. WIP #720
Browse files Browse the repository at this point in the history
  • Loading branch information
fabioz committed Aug 10, 2022
1 parent 0954a07 commit ae83702
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 30 deletions.
115 changes: 111 additions & 4 deletions robocorp-python-ls-core/src/robocorp_ls_core/cache.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from robocorp_ls_core.basic import implements
from robocorp_ls_core.constants import NULL
from robocorp_ls_core.protocols import IDirCache, check_implements, T
from robocorp_ls_core.protocols import IDirCache, check_implements, T, Sentinel
from robocorp_ls_core.robotframework_log import get_logger

from collections import namedtuple
from pathlib import Path
from typing import Any, Generic, Callable, Optional
from typing import Any, Generic, Callable, Optional, TypeVar
import functools
import os
import collections


log = get_logger(__name__)
Expand Down Expand Up @@ -39,8 +40,6 @@ def cache_this(self):

@functools.wraps(func)
def new_func(self, *args, **kwargs):
from robocorp_ls_core.protocols import Sentinel

try:
cache = getattr(self, "__instance_cache__")
except:
Expand Down Expand Up @@ -207,3 +206,111 @@ def _get_mtime_cache_info(self, file_path: Path) -> Optional[CachedFileMTimeInfo

def is_cache_valid(self) -> bool:
return self._mtime_info == self._get_mtime_cache_info(self.file_path)


KT = TypeVar("KT")  # Key type.
VT = TypeVar("VT")  # Value type.

# Private sentinel so that `LRUCache.pop` can distinguish "no default given"
# from any real value (including None) passed by the caller.
_MISSING = object()


class LRUCache(Generic[KT, VT]):
    """
    A cache with a maximum (weighted) size which evicts least-recently-used
    entries when that size would be exceeded.

    ``max_size`` is the weight limit and ``resize_to`` is the target weight
    the cache shrinks to when the limit is hit.  ``get_size`` computes the
    weight of each value (by default every entry weighs 1).

    Note that the capacity is not exact: eviction happens *before* a new
    entry's weight is added, so right after an insertion the total weight may
    be as high as ``resize_to`` plus the new entry's weight (and that's ok
    for this use case).

    Internally this is an ``OrderedDict`` mapping key -> ``[value, weight]``
    where the most recently used keys sit at the end.

    This cache is NOT thread safe (users must lock if needed).
    """

    def __init__(
        self,
        max_size: int = 100,
        resize_to: Optional[int] = None,
        get_size: Callable[[Any], int] = lambda obj: 1,
    ):
        assert max_size >= 1
        self.max_size = max_size

        if resize_to is None:
            # Default: shrink to 70% of the maximum.
            resize_to = int(max_size * 0.7)
        assert resize_to < max_size

        self.resize_to = resize_to
        self._get_size = get_size

        self._dict: Any = collections.OrderedDict()
        self._current_size_usage = 0

    def clear(self) -> None:
        """Removes all entries (and resets the accounted weight)."""
        self._current_size_usage = 0
        self._dict.clear()

    @property
    def current_size_usage(self) -> int:
        """The summed weight of all entries currently in the cache."""
        return self._current_size_usage

    def __getitem__(self, key: KT) -> VT:
        """Returns the value for ``key`` and marks it most recently used."""
        item = self._dict[key]
        self._dict.move_to_end(key)
        return item[0]

    def __contains__(self, key: KT) -> bool:
        # Note: a successful membership check also marks the key as
        # recently used (it goes through __getitem__).
        try:
            self[key]
            return True
        except KeyError:
            return False

    def __len__(self) -> int:
        return len(self._dict)

    def __delitem__(self, key: KT) -> None:
        """Removes ``key`` (raises KeyError if absent), updating the weight."""
        entry = self._dict.pop(key)
        self._current_size_usage -= entry[-1]

    def pop(self, key: KT, default=_MISSING) -> Optional[Any]:
        """
        Removes ``key`` and returns its value.  If the key is absent,
        ``default`` is returned when given, otherwise KeyError is raised.
        """
        try:
            entry = self._dict.pop(key)
        except KeyError:
            if default is not _MISSING:
                return default
            raise
        self._current_size_usage -= entry[-1]
        return entry[0]

    def __setitem__(self, key: KT, value: VT) -> None:
        """
        Adds/replaces ``key`` with ``value``, evicting LRU entries first if
        the weight limit would be exceeded.
        """
        # Treat an update as remove + insert so the accounted weight always
        # reflects the *new* value.  Bugfix: previously, replacing an
        # existing key kept the old entry's weight, so `current_size_usage`
        # drifted out of sync and updates could never trigger a resize.
        old_entry = self._dict.pop(key, None)
        if old_entry is not None:
            self._current_size_usage -= old_entry[-1]

        new_size = self._get_size(value)
        if self._current_size_usage + new_size > self.max_size:
            # We may exceed the max size because we resize to the target
            # value without accounting for the size of the new entry
            # (and that's ok for this use case).
            self._resize_to()

        self._current_size_usage += new_size
        # Insertion at the end of the OrderedDict == most recently used.
        self._dict[key] = [value, new_size]

    def get(self, key: KT, default: Optional[Any] = None) -> Optional[Any]:
        """As ``__getitem__`` but returns ``default`` when ``key`` is absent."""
        try:
            return self[key]
        except KeyError:
            return default

    def _resize_to(self) -> None:
        # Evict least-recently-used entries (front of the OrderedDict) until
        # the accounted weight is within the resize target.
        while self._current_size_usage > self.resize_to:
            _key, entry = self._dict.popitem(last=False)
            self._current_size_usage -= entry[-1]
6 changes: 3 additions & 3 deletions robocorp-python-ls-core/src/robocorp_ls_core/uris.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def _normalize_win_path(path):
return path, netloc


@lru_cache(200)
@lru_cache(500)
def from_fs_path(path: str) -> str:
"""Returns a URI for the given filesystem path."""
scheme = "file"
Expand All @@ -90,14 +90,14 @@ def from_fs_path(path: str) -> str:
return urlunparse((scheme, netloc, path, params, query, fragment))


@lru_cache(200)
@lru_cache(500)
def normalize_uri(uri: str) -> str:
if uri_scheme(uri) == "file":
return from_fs_path(to_fs_path(uri))
return uri


@lru_cache(200)
@lru_cache(500)
def to_fs_path(uri: str) -> str:
"""Returns the filesystem path of the given URI.
Expand Down
78 changes: 55 additions & 23 deletions robocorp-python-ls-core/src/robocorp_ls_core/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ def __init__(
) -> None:
from robocorp_ls_core.lsp import WorkspaceFolder
from robocorp_ls_core.callbacks import Callback
from robocorp_ls_core.cache import LRUCache

self._main_thread = threading.current_thread()

Expand All @@ -337,7 +338,36 @@ def __init__(
self._docs: Dict[str, IDocument] = {}

# Contains the docs pointing to the filesystem.
self._filesystem_docs: Dict[str, IDocument] = {}
def _get_size(doc: IDocument):
# In a simplistic way we say that each char in a document occupies
# 8 bytes and then multiply this by 7.5 to account for the supposed
# amount of memory used by the AST which is cached along in the document
return 8 * len(doc.source) * 7.5

one_gb_in_bytes = int(1e9)
target_memory_in_bytes: int = one_gb_in_bytes # Default value

target_memory_in_bytes_str = os.environ.get(
"RFLS_FILES_TARGET_MEMORY_IN_BYTES", None
)
if target_memory_in_bytes_str:
try:
target_memory_in_bytes = int(target_memory_in_bytes_str)
except:
log.critical(
"Expected RFLS_FILES_TARGET_MEMORY_IN_BYTES to evaluate to an int. Found: %s",
target_memory_in_bytes,
)

fifty_mb_in_bytes = int(5e7)
if target_memory_in_bytes <= fifty_mb_in_bytes:
target_memory_in_bytes = fifty_mb_in_bytes

# Whenever we reach 1GB of used memory we clear up to use 700 MB.
self._filesystem_docs: LRUCache[str, IDocument] = LRUCache(
target_memory_in_bytes, get_size=_get_size
)
self._filesystem_docs_lock = threading.Lock()

self.on_file_changed = Callback()

Expand Down Expand Up @@ -422,31 +452,31 @@ def get_folder_paths(self) -> List[str]:

@implements(IWorkspace.get_document)
def get_document(self, doc_uri: str, accept_from_file: bool) -> Optional[IDocument]:
# Ok, thread-safe (does not mutate the _docs dict -- contents in the _filesystem_docs
# may end up stale or we may have multiple loads when we wouldn't need,
# but that should be ok).
# Ok, thread-safe (does not mutate the _docs dict so the GIL keeps us
# safe -- contents in the _filesystem_docs need a lock though).
doc = self._docs.get(normalize_uri(doc_uri))
if doc is not None:
return doc

if accept_from_file:
doc = self._filesystem_docs.get(doc_uri)

if doc is not None:
if not doc.is_source_in_sync():
self._filesystem_docs.pop(doc_uri, None)
doc = None

if doc is None:
try:
doc = self._create_document(doc_uri, force_load_source=True)
except:
log.debug("Unable to load contents from: %s", doc_uri)
# Unable to load contents: file does not exist.
doc = None
else:
doc.immutable = True
self._filesystem_docs[doc_uri] = doc
with self._filesystem_docs_lock:
doc = self._filesystem_docs.get(doc_uri)

if doc is not None:
if not doc.is_source_in_sync():
self._filesystem_docs.pop(doc_uri, None)
doc = None

if doc is None:
try:
doc = self._create_document(doc_uri, force_load_source=True)
except:
log.debug("Unable to load contents from: %s", doc_uri)
# Unable to load contents: file does not exist.
doc = None
else:
doc.immutable = True
self._filesystem_docs[doc_uri] = doc

return doc

Expand All @@ -473,7 +503,8 @@ def put_document(self, text_document: TextDocumentItem) -> IDocument:
_source = doc.source
except:
doc.source = ""
self._filesystem_docs.pop(normalized_doc_uri, None)
with self._filesystem_docs_lock:
self._filesystem_docs.pop(normalized_doc_uri, None)
return doc

@implements(IWorkspace.remove_document)
Expand Down Expand Up @@ -528,7 +559,8 @@ def dispose(self):
self.remove_folder(folder_uri)

self._docs = {}
self._filesystem_docs = {}
with self._filesystem_docs_lock:
self._filesystem_docs.clear()

def __typecheckself__(self) -> None:
from robocorp_ls_core.protocols import check_implements
Expand Down
80 changes: 80 additions & 0 deletions robocorp-python-ls-core/tests/robocorp_ls_core_tests/test_lru.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import pytest


class Item:
    """Test fixture: a value whose LRU weight is its ``size`` attribute."""

    def __init__(self, size):
        # Weight reported to the cache's get_size callback.
        self.size = size


def test_lru():
    from robocorp_ls_core.cache import LRUCache

    lru: LRUCache[int, Item] = LRUCache(100, get_size=lambda item: item.size)
    # The default resize target is 70% of max_size.
    assert round(lru.resize_to) == 70
    lru[1] = Item(45)

    # 45 + 40 = 85 is still within max_size, so both entries are kept.
    lru[2] = Item(40)

    assert len(lru) == 2
    assert lru.current_size_usage == 45 + 40

    # Now we'd exceed max_size: only the least-recently-used entry (key 1)
    # is cycled out before adding key 3.
    lru[3] = Item(39)

    assert len(lru) == 2
    assert 1 not in lru
    assert lru.current_size_usage == 40 + 39

    # Touch key 2 so it becomes the most recently used entry (key 3 is now
    # the eviction candidate).
    assert lru[2].size == 40

    lru[4] = Item(38)
    assert len(lru) == 2
    assert 3 not in lru
    assert 2 in lru
    assert 4 in lru
    assert lru.current_size_usage == 40 + 38

    lru.clear()
    assert lru.current_size_usage == 0
    assert len(lru) == 0


def test_lru_unitary_size():
    from robocorp_ls_core.cache import LRUCache

    # Every entry weighs 1 (default get_size); resize down to a single entry.
    lru = LRUCache(3, 1)
    for key in (1, 2, 3):
        lru[key] = Item(key)
        assert len(lru) == key

    lru[4] = Item(4)
    # Going over max_size=3 shrank the cache to 1 entry, then key 4 was added.
    assert len(lru) == 2

    assert lru.get(5) is None
    assert lru.current_size_usage == 2

    del lru[4]
    assert lru.current_size_usage == 1
    assert len(lru) == 1

    # Deleting an absent key raises.
    with pytest.raises(KeyError):
        del lru[4]

    assert 3 in lru

    item4 = lru[4] = Item(4)
    assert lru.pop(4) is item4

    # pop without a default raises; with a default it returns it.
    with pytest.raises(KeyError):
        lru.pop(4)
    assert lru.pop(4, "foo") == "foo"

0 comments on commit ae83702

Please sign in to comment.