3 changes: 2 additions & 1 deletion .github/workflows/python-notebook-tests.yml
@@ -37,7 +37,8 @@ jobs:
fail-fast: false # Continue running all jobs even if one fails
env:
DEBUG: 1
GRAPHRAG_API_KEY: ${{ secrets.OPENAI_NOTEBOOK_KEY }}
GRAPHRAG_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GRAPHRAG_API_BASE: ${{ secrets.GRAPHRAG_API_BASE }}

runs-on: ${{ matrix.os }}
steps:
2 changes: 2 additions & 0 deletions .gitignore
@@ -22,6 +22,8 @@ output/lancedb
venv/
.conda
.tmp
packages/graphrag-llm/notebooks/metrics
packages/graphrag-llm/notebooks/cache

.env
build.zip
4 changes: 2 additions & 2 deletions .vscode/launch.json
@@ -21,10 +21,10 @@
"module": "graphrag",
"args": [
"query",
"${input:query}",
"--root",
"${input:root_folder}",
"--method", "${input:query_method}",
"--query", "${input:query}"
"--method", "${input:query_method}"
]
},
{
1 change: 1 addition & 0 deletions dictionary.txt
@@ -26,6 +26,7 @@ noqa
dtypes
ints
genid
isinstance

# Azure
abfs
15 changes: 13 additions & 2 deletions packages/graphrag-cache/README.md
@@ -5,12 +5,15 @@
```python
import asyncio
from graphrag_storage import StorageConfig, create_storage, StorageType
from graphrag_cache import CacheConfig, create_cache, CacheType
from graphrag_cache import CacheConfig, create_cache, CacheType, create_cache_key

async def run():
cache = create_cache()

# The above is equivalent to the following:
cache = create_cache(
CacheConfig(
type=CacheType.Json
type=CacheType.Json,
storage=StorageConfig(
type=StorageType.File,
base_dir="cache"
@@ -21,6 +24,14 @@ async def run():
await cache.set("my_key", {"some": "object to cache"})
print(await cache.get("my_key"))

# create cache key from data dict.
cache_key = create_cache_key({
"some_arg": "some_value",
"something_else": 5
})
await cache.set(cache_key, {"some": "object to cache"})
print(await cache.get(cache_key))

if __name__ == "__main__":
asyncio.run(run())
```
3 changes: 3 additions & 0 deletions packages/graphrag-cache/graphrag_cache/__init__.py
@@ -6,12 +6,15 @@
from graphrag_cache.cache import Cache
from graphrag_cache.cache_config import CacheConfig
from graphrag_cache.cache_factory import create_cache, register_cache
from graphrag_cache.cache_key import CacheKeyCreator, create_cache_key
from graphrag_cache.cache_type import CacheType

__all__ = [
"Cache",
"CacheConfig",
"CacheKeyCreator",
"CacheType",
"create_cache",
"create_cache_key",
"register_cache",
]
7 changes: 5 additions & 2 deletions packages/graphrag-cache/graphrag_cache/cache.py
@@ -6,14 +6,17 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from graphrag_storage import Storage


class Cache(ABC):
"""Provide a cache interface for the pipeline."""

@abstractmethod
def __init__(self, **kwargs: Any) -> None:
def __init__(self, *, storage: Storage | None, **kwargs: Any) -> None:
"""Create a cache instance."""

@abstractmethod
4 changes: 2 additions & 2 deletions packages/graphrag-cache/graphrag_cache/cache_config.py
@@ -3,7 +3,7 @@

"""Cache configuration model."""

from graphrag_storage import StorageConfig
from graphrag_storage import StorageConfig, StorageType
from pydantic import BaseModel, ConfigDict, Field

from graphrag_cache.cache_type import CacheType
@@ -22,5 +22,5 @@ class CacheConfig(BaseModel):

storage: StorageConfig | None = Field(
description="The storage configuration to use for file-based caches such as 'Json'.",
default=None,
default_factory=lambda: StorageConfig(type=StorageType.File, base_dir="cache"),
)
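
Note on the new `default_factory`: a bare `CacheConfig()` now defaults to file storage under a local `cache` directory instead of `None`. A minimal sketch of the resulting behavior, assuming the default cache type is `Json` as the README example above implies:

```python
from graphrag_cache import CacheConfig, CacheType
from graphrag_storage import StorageConfig, StorageType

# Bare construction: JSON cache backed by file storage under "./cache".
config = CacheConfig()
assert config.type == CacheType.Json  # assumed default, per the README example
assert config.storage is not None
assert config.storage.type == StorageType.File
assert config.storage.base_dir == "cache"

# An explicitly passed storage config still overrides the default_factory.
custom = CacheConfig(storage=StorageConfig(type=StorageType.File, base_dir=".tmp/cache"))
assert custom.storage.base_dir == ".tmp/cache"
```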
26 changes: 19 additions & 7 deletions packages/graphrag-cache/graphrag_cache/cache_factory.py
@@ -5,16 +5,22 @@
"""Cache factory implementation."""

from collections.abc import Callable
from typing import TYPE_CHECKING

from graphrag_common.factory import Factory, ServiceScope
from graphrag_storage import Storage
from graphrag_common.factory import Factory
from graphrag_storage import create_storage

from graphrag_cache.cache import Cache
from graphrag_cache.cache_config import CacheConfig
from graphrag_cache.cache_type import CacheType

if TYPE_CHECKING:
from graphrag_common.factory import ServiceScope
from graphrag_storage import Storage

class CacheFactory(Factory[Cache]):
from graphrag_cache.cache import Cache


class CacheFactory(Factory["Cache"]):
"""A factory class for cache implementations."""


@@ -23,8 +29,8 @@ class CacheFactory(Factory[Cache]):

def register_cache(
cache_type: str,
cache_initializer: Callable[..., Cache],
scope: ServiceScope = "transient",
cache_initializer: Callable[..., "Cache"],
scope: "ServiceScope" = "transient",
) -> None:
"""Register a custom cache implementation.

@@ -38,7 +44,9 @@ def register_cache(
cache_factory.register(cache_type, cache_initializer, scope)


def create_cache(config: CacheConfig, storage: Storage | None = None) -> Cache:
def create_cache(
config: CacheConfig | None = None, storage: "Storage | None" = None
) -> "Cache":
"""Create a cache implementation based on the given configuration.

Args
@@ -53,9 +61,13 @@ def create_cache(config: CacheConfig, storage: Storage | None = None) -> Cache:
Cache
The created cache implementation.
"""
config = config or CacheConfig()
config_model = config.model_dump()
cache_strategy = config.type

if not storage and config.storage:
storage = create_storage(config.storage)

if cache_strategy not in cache_factory:
match cache_strategy:
case CacheType.Json:
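
With `config` now optional and storage auto-created from `config.storage`, the simplest call site needs no arguments, while callers that already hold a `Storage` instance can still inject one. A rough sketch under the assumption that the factory wires the resolved storage into the cache it builds:

```python
from graphrag_cache import CacheConfig, CacheType, create_cache
from graphrag_storage import StorageConfig, StorageType, create_storage

# No-argument form: falls back to CacheConfig(), which now defaults to
# file storage under "cache/", and builds the storage from config.storage.
cache = create_cache()

# Explicit storage takes precedence: create_storage() is only called
# when no Storage instance is passed in directly.
storage = create_storage(StorageConfig(type=StorageType.File, base_dir=".tmp/cache"))
cache_with_storage = create_cache(CacheConfig(type=CacheType.Json), storage=storage)
```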
36 changes: 36 additions & 0 deletions packages/graphrag-cache/graphrag_cache/cache_key.py
@@ -0,0 +1,36 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Create cache key."""

from typing import Any, Protocol, runtime_checkable

from graphrag_common.hasher import hash_data


@runtime_checkable
class CacheKeyCreator(Protocol):
"""Create cache key function protocol.

Args
----
input_args: dict[str, Any]
The input arguments for creating the cache key.

Returns
-------
str
The generated cache key.
"""

def __call__(
self,
input_args: dict[str, Any],
) -> str:
"""Create cache key."""
...


def create_cache_key(input_args: dict[str, Any]) -> str:
"""Create a cache key based on the input arguments."""
return hash_data(input_args)
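
`create_cache_key` delegates to `hash_data`, which serializes the argument dict with `yaml.dump(..., sort_keys=True)` before hashing, so keys are deterministic and independent of dict insertion order. A short sketch; `prefixed_key` is a hypothetical example of a callable satisfying the `CacheKeyCreator` protocol:

```python
from typing import Any

from graphrag_cache import CacheKeyCreator, create_cache_key

# Same contents, different insertion order -> same key.
key_a = create_cache_key({"some_arg": "some_value", "something_else": 5})
key_b = create_cache_key({"something_else": 5, "some_arg": "some_value"})
assert key_a == key_b

# Any callable with this shape satisfies the runtime-checkable protocol.
def prefixed_key(input_args: dict[str, Any]) -> str:
    # Hypothetical helper: namespace keys to avoid collisions across callers.
    return "chat:" + create_cache_key(input_args)

assert isinstance(prefixed_key, CacheKeyCreator)
```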
31 changes: 23 additions & 8 deletions packages/graphrag-common/graphrag_common/factory/factory.py
@@ -8,6 +8,8 @@
from dataclasses import dataclass
from typing import Any, ClassVar, Generic, Literal, TypeVar

from graphrag_common.hasher import hash_data

T = TypeVar("T", covariant=True)

ServiceScope = Literal["singleton", "transient"]
@@ -57,9 +59,14 @@ def register(

Args
----
strategy: The name of the strategy.
initializer: A callable that creates an instance of T.
scope: The service scope, either 'singleton' or 'transient'.
strategy: str
The name of the strategy.
initializer: Callable[..., T]
A callable that creates an instance of T.
scope: ServiceScope (default: "transient")
The scope of the service ("singleton" or "transient").
Singleton services are cached based on their init args
so that the same instance is returned for the same init args.
"""
self._service_initializers[strategy] = _ServiceDescriptor(scope, initializer)

@@ -69,8 +76,10 @@ def create(self, strategy: str, init_args: dict[str, Any] | None = None) -> T:

Args
----
strategy: The name of the strategy.
init_args: Dict of keyword arguments to pass to the service initializer.
strategy: str
The name of the strategy.
init_args: dict[str, Any] | None
A dictionary of keyword arguments to pass to the service initializer.

Returns
-------
@@ -85,14 +94,20 @@ def create(self, strategy: str, init_args: dict[str, Any] | None = None) -> T:
raise ValueError(msg)

# Delete entries with value None
# That way services can have default values
init_args = {k: v for k, v in (init_args or {}).items() if v is not None}

service_descriptor = self._service_initializers[strategy]
if service_descriptor.scope == "singleton":
if strategy not in self._initialized_services:
self._initialized_services[strategy] = service_descriptor.initializer(
cache_key = hash_data({
"strategy": strategy,
"init_args": init_args,
})

if cache_key not in self._initialized_services:
self._initialized_services[cache_key] = service_descriptor.initializer(
**init_args
)
return self._initialized_services[strategy]
return self._initialized_services[cache_key]

return service_descriptor.initializer(**(init_args or {}))
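
The net effect in `create()` is that a singleton registration now yields one cached instance per distinct (strategy, init args) combination rather than one per strategy name. A minimal sketch with a hypothetical `Greeter` service, assuming a bare `Factory` subclass can be instantiated directly the way `CacheFactory` is:

```python
from graphrag_common.factory import Factory

class Greeter:
    def __init__(self, name: str = "world") -> None:
        self.name = name

class GreeterFactory(Factory[Greeter]):
    """Factory for the hypothetical Greeter service."""

factory = GreeterFactory()
factory.register("default", Greeter, "singleton")

a = factory.create("default", {"name": "alice"})
b = factory.create("default", {"name": "alice"})
c = factory.create("default", {"name": "bob"})

assert a is b      # same strategy + same init args -> cached instance reused
assert a is not c  # different init args -> a separate singleton entry
```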
18 changes: 18 additions & 0 deletions packages/graphrag-common/graphrag_common/hasher/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""The GraphRAG hasher module."""

from graphrag_common.hasher.hasher import (
Hasher,
hash_data,
make_yaml_serializable,
sha256_hasher,
)

__all__ = [
"Hasher",
"hash_data",
"make_yaml_serializable",
"sha256_hasher",
]
59 changes: 59 additions & 0 deletions packages/graphrag-common/graphrag_common/hasher/hasher.py
@@ -0,0 +1,59 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""The GraphRAG hasher module."""

import hashlib
from collections.abc import Callable
from typing import Any

import yaml

Hasher = Callable[[str], str]
"""Type alias for a hasher function (data: str) -> str."""


def sha256_hasher(data: str) -> str:
"""Generate a SHA-256 hash for the input data."""
return hashlib.sha256(data.encode("utf-8")).hexdigest()


def make_yaml_serializable(data: Any) -> Any:
"""Convert data to a YAML-serializable format."""
if isinstance(data, (list, tuple)):
return tuple(make_yaml_serializable(item) for item in data)

if isinstance(data, set):
return tuple(sorted(make_yaml_serializable(item) for item in data))

if isinstance(data, dict):
return tuple(
sorted((key, make_yaml_serializable(value)) for key, value in data.items())
)

return str(data)


def hash_data(data: Any, *, hasher: Hasher | None = None) -> str:
"""Hash the input data dictionary using the specified hasher function.

Args
----
data: dict[str, Any]
The input data to be hashed.
The input data is serialized using yaml
to support complex data structures such as classes and functions.
hasher: Hasher | None (default: sha256_hasher)
The hasher function to use. (data: str) -> str

Returns
-------
str
The resulting hash of the input data.

"""
hasher = hasher or sha256_hasher
try:
return hasher(yaml.dump(data, sort_keys=True))
except TypeError:
return hasher(yaml.dump(make_yaml_serializable(data), sort_keys=True))
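
`hash_data` serializes with `yaml.dump(..., sort_keys=True)` and falls back to `make_yaml_serializable` if that raises `TypeError`, so equal data hashes to the same digest regardless of key order. A short usage sketch showing determinism, the normalization helper, and a custom hasher (`md5_hasher` is only an illustration):

```python
import hashlib

from graphrag_common.hasher import hash_data, make_yaml_serializable

# Equal data hashes to the same digest regardless of dict key order,
# because yaml.dump is called with sort_keys=True.
first = hash_data({"model": "gpt-4", "params": {"temperature": 0.0, "n": 1}})
second = hash_data({"params": {"n": 1, "temperature": 0.0}, "model": "gpt-4"})
assert first == second

# Normalization helper: sets become sorted tuples and other
# non-YAML-friendly values (such as functions) are stringified.
normalized = make_yaml_serializable({"tags": {"b", "a"}, "fn": hash_data})
print(normalized)

# A custom hasher only needs the (data: str) -> str shape.
def md5_hasher(data: str) -> str:
    return hashlib.md5(data.encode("utf-8")).hexdigest()

print(hash_data({"key": "value"}, hasher=md5_hasher))
```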