Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/major-20250909002702300683.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "major",
"description": "Re-implement graspologic methods to remove dependency. Remove visualization steps."
}
2 changes: 1 addition & 1 deletion graphrag/index/operations/cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import logging

import networkx as nx
from graspologic.partition import hierarchical_leiden

from graphrag.index.utils.graphs import hierarchical_leiden
from graphrag.index.utils.stable_lcc import stable_largest_connected_component

Communities = list[tuple[int, int, int, list[str]]]
Expand Down
4 changes: 2 additions & 2 deletions graphrag/index/operations/prune_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@

from typing import TYPE_CHECKING, cast

import graspologic as glc
import networkx as nx
import numpy as np

import graphrag.data_model.schemas as schemas
from graphrag.index.utils.graphs import largest_connected_component

if TYPE_CHECKING:
from networkx.classes.reportviews import DegreeView
Expand Down Expand Up @@ -78,7 +78,7 @@ def prune_graph(
])

if lcc_only:
return glc.utils.largest_connected_component(graph) # type: ignore
return largest_connected_component(graph)

return graph

Expand Down
125 changes: 118 additions & 7 deletions graphrag/index/utils/graphs.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,136 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Collection of graph utility functions."""
"""
Collection of graph utility functions.

These are largely copies/re-implementations of graspologic methods to avoid dependency issues.
"""

import logging
from typing import cast
import math
from collections import defaultdict
from typing import Any, cast

import graspologic_native as gn
import networkx as nx
import numpy as np
import pandas as pd
from graspologic.partition import hierarchical_leiden, modularity
from graspologic.utils import largest_connected_component

from graphrag.config.enums import ModularityMetric

logger = logging.getLogger(__name__)


def largest_connected_component(graph: nx.Graph) -> nx.Graph:
    """Return the largest connected component of the graph as a new graph.

    The input graph is never modified; the result is an independent copy that
    carries over the node, edge and graph attributes of the selected component.

    Args:
        graph: The graph to reduce. Directed graphs are measured by weak
            connectivity (edge direction is ignored when grouping nodes, but
            the returned graph keeps its directed edges).

    Returns
    -------
        A copy of the component with the most nodes. When several components
        tie, the first one yielded by networkx wins. An empty graph yields an
        empty copy.
    """
    # max() has nothing to choose from on an empty graph; return an empty copy
    # instead of surfacing an opaque "max() arg is an empty sequence" error.
    if graph.number_of_nodes() == 0:
        return cast("nx.Graph", graph.copy())

    # nx.connected_components is not implemented for directed graphs.
    if graph.is_directed():
        components = nx.weakly_connected_components(cast("nx.DiGraph", graph))
    else:
        components = nx.connected_components(graph)

    lcc_nodes = max(components, key=len)
    # subgraph() is a read-only view limited to lcc_nodes, so a single copy()
    # is enough to detach it; no up-front copy or node filtering is needed.
    return cast("nx.Graph", graph.subgraph(lcc_nodes).copy())


def _nx_to_edge_list(
    graph: nx.Graph,
    weight_attribute: str = "weight",
    weight_default: float = 1.0,
) -> list[tuple[str, str, float]]:
    """
    Flatten an undirected, non-multigraph networkx graph into a list of edges.

    Every entry is a ``(source_str, target_str, weight)`` tuple: both endpoints
    are passed through ``str`` and the weight through ``float``. Edges that do
    not carry ``weight_attribute`` are given ``weight_default``.
    """
    # Ask networkx for (u, v, weight) triples, substituting the default weight.
    weighted_edges = graph.edges(data=weight_attribute, default=weight_default)  # type: ignore
    return [
        (str(source), str(target), float(weight))
        for source, target, weight in weighted_edges
    ]


def hierarchical_leiden(
    graph: nx.Graph,
    max_cluster_size: int = 10,
    random_seed: int | None = 0xDEADBEEF,
) -> Any:
    """Run hierarchical leiden on the graph.

    The graph is first flattened with ``_nx_to_edge_list`` (default weight
    attribute and default weight), so node identifiers are handed to the native
    implementation as strings. Whatever ``gn.hierarchical_leiden`` produces is
    returned unchanged.

    Args:
        graph: The graph to cluster.
        max_cluster_size: Forwarded as ``max_cluster_size``.
        random_seed: Forwarded as ``seed``.
    """
    edge_list = _nx_to_edge_list(graph)
    # Everything except the cluster size and the seed is pinned to fixed values.
    return gn.hierarchical_leiden(
        edges=edge_list,
        starting_communities=None,
        resolution=1.0,
        randomness=0.001,
        iterations=1,
        use_modularity=True,
        max_cluster_size=max_cluster_size,
        seed=random_seed,
    )


def modularity(
    graph: nx.Graph,
    partitions: dict[Any, int],
    weight_attribute: str = "weight",
    resolution: float = 1.0,
) -> float:
    """Calculate the modularity of an undirected graph for a vertex -> community id mapping.

    The result is the sum of the per-community contributions computed by
    ``_modularity_components``.
    """
    return sum(
        _modularity_components(
            graph, partitions, weight_attribute, resolution
        ).values()
    )


def _modularity_component(
intra_community_degree: float,
total_community_degree: float,
network_degree_sum: float,
resolution: float,
) -> float:
community_degree_ratio = math.pow(total_community_degree, 2.0) / (
2.0 * network_degree_sum
)
return (intra_community_degree - resolution * community_degree_ratio) / (
2.0 * network_degree_sum
)


def _modularity_components(
    graph: nx.Graph,
    partitions: dict[Any, int],
    weight_attribute: str = "weight",
    resolution: float = 1.0,
) -> dict[int, float]:
    """Compute each community's contribution to the modularity of a partition.

    Args:
        graph: The graph whose edges are scored.
        partitions: Mapping of vertex to community id. Every endpoint of every
            edge must be present.
        weight_attribute: Edge attribute holding the weight. Edges without it
            count as weight 1.0, matching how ``_nx_to_edge_list`` feeds edges
            to the clustering step.
        resolution: Multiplier applied to the expected-degree term.

    Returns
    -------
        Mapping of community id to its modularity contribution, one entry per
        distinct value in ``partitions``.

    Raises
    ------
        KeyError: If an edge endpoint is missing from ``partitions``.
        ZeroDivisionError: If the total edge weight is zero while
            ``partitions`` is non-empty (e.g. a graph without edges).
    """
    total_edge_weight = 0.0
    communities = set(partitions.values())

    degree_sums_within_community: dict[int, float] = defaultdict(float)
    degree_sums_for_community: dict[int, float] = defaultdict(float)

    # Without default=..., networkx yields None for edges lacking the attribute
    # and the accumulations below would fail with a TypeError.
    weighted_edges = graph.edges(data=weight_attribute, default=1.0)  # type: ignore
    for vertex, neighbor_vertex, weight in weighted_edges:
        vertex_community = partitions[vertex]
        neighbor_community = partitions[neighbor_vertex]
        if vertex_community == neighbor_community:
            if vertex == neighbor_vertex:
                # A self-loop is only seen once, so it is not doubled.
                degree_sums_within_community[vertex_community] += weight
            else:
                # An intra-community edge contributes to both of its endpoints.
                degree_sums_within_community[vertex_community] += weight * 2.0
        degree_sums_for_community[vertex_community] += weight
        degree_sums_for_community[neighbor_community] += weight
        total_edge_weight += weight

    return {
        comm: _modularity_component(
            degree_sums_within_community[comm],
            degree_sums_for_community[comm],
            total_edge_weight,
            resolution,
        )
        for comm in communities
    }


def calculate_root_modularity(
graph: nx.Graph,
max_cluster_size: int = 10,
Expand Down Expand Up @@ -147,9 +261,6 @@ def calculate_modularity(
random_seed=random_seed,
use_root_modularity=use_root_modularity,
)
case _:
msg = f"Unknown modularity metric type: {modularity_metric}"
raise ValueError(msg)


def calculate_pmi_edge_weights(
Expand Down
5 changes: 2 additions & 3 deletions graphrag/index/utils/stable_lcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,11 @@

import networkx as nx

from graphrag.index.utils.graphs import largest_connected_component


def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph:
"""Return the largest connected component of the graph, with nodes and edges sorted in a stable way."""
# NOTE: The import is done here to reduce the initial import time of the module
from graspologic.utils import largest_connected_component

graph = graph.copy()
graph = cast("nx.Graph", largest_connected_component(graph))
graph = normalize_node_names(graph)
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ dependencies = [
"tiktoken>=0.9.0",
# Data-Science
"numpy>=1.25.2",
"graspologic>=3.4.1",
"networkx>=3.4.2",
"pandas>=2.2.3",
"pyarrow>=17.0.0",
Expand All @@ -66,6 +65,7 @@ dependencies = [
"tqdm>=4.67.1",
"textblob>=0.18.0.post0",
"spacy>=3.8.4",
"graspologic-native>=1.2.5",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -260,4 +260,4 @@ exclude = ["**/node_modules", "**/__pycache__"]
asyncio_default_fixture_loop_scope = "function"
asyncio_mode = "auto"
timeout = 1000
env_files = [".env"]
env_files = [".env"]
26 changes: 0 additions & 26 deletions tests/unit/indexing/graph/utils/test_stable_lcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,6 @@ def test_undirected_graph_run_twice_produces_same_graph(self):
nx.generate_graphml(graph_out_2)
)

def test_directed_graph_keeps_source_target_intact(self):
    """Edges of a directed graph keep their order and direction through the stable LCC."""

    def describe_edges(graph):
        # Render each edge as "source -> target" so direction is part of the comparison.
        return [
            f"{source} -> {target}" for source, target, _ in graph.edges(data=True)
        ]

    # Build the fixture as a directed graph.
    directed_input = self._create_strongly_connected_graph_with_edges_flipped(
        digraph=True
    )
    directed_output = stable_largest_connected_component(directed_input.copy())

    assert describe_edges(directed_input) == describe_edges(directed_output)

def test_directed_graph_run_twice_produces_same_graph(self):
    """Two runs over copies of the same directed graph serialize to identical GraphML."""
    # Build the fixture as a directed graph.
    directed_input = self._create_strongly_connected_graph_with_edges_flipped(
        digraph=True
    )

    first_run = stable_largest_connected_component(directed_input.copy())
    second_run = stable_largest_connected_component(directed_input.copy())

    first_graphml = "".join(nx.generate_graphml(first_run))
    second_graphml = "".join(nx.generate_graphml(second_run))

    # The serialized output must not vary between runs.
    assert first_graphml == second_graphml

def _create_strongly_connected_graph(self, digraph=False):
graph = nx.Graph() if not digraph else nx.DiGraph()
graph.add_node("1", node_name=1)
Expand Down
Loading
Loading