diff --git a/.semversioner/next-release/patch-20260211211707376370.json b/.semversioner/next-release/patch-20260211211707376370.json
new file mode 100644
index 000000000..0158f4380
--- /dev/null
+++ b/.semversioner/next-release/patch-20260211211707376370.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Move document ID, human_readable_id, and raw_data initialization from create_final_documents into load_input_documents and load_update_documents."
+}
diff --git a/docs/examples_notebooks/api_overview.ipynb b/docs/examples_notebooks/api_overview.ipynb
index abcd7832f..2a0c0f15d 100644
--- a/docs/examples_notebooks/api_overview.ipynb
+++ b/docs/examples_notebooks/api_overview.ipynb
@@ -28,11 +28,10 @@
     "from pathlib import Path\n",
     "from pprint import pprint\n",
     "\n",
+    "import graphrag.api as api\n",
     "import pandas as pd\n",
     "from graphrag.config.load_config import load_config\n",
-    "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n",
-    "\n",
-    "import graphrag.api as api"
+    "from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
    ]
   },
   {
diff --git a/docs/examples_notebooks/input_documents.ipynb b/docs/examples_notebooks/input_documents.ipynb
index 505c0fe1f..5657770ea 100644
--- a/docs/examples_notebooks/input_documents.ipynb
+++ b/docs/examples_notebooks/input_documents.ipynb
@@ -30,11 +30,10 @@
     "from pathlib import Path\n",
     "from pprint import pprint\n",
     "\n",
+    "import graphrag.api as api\n",
     "import pandas as pd\n",
     "from graphrag.config.load_config import load_config\n",
-    "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n",
-    "\n",
-    "import graphrag.api as api"
+    "from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
    ]
   },
   {
diff --git a/packages/graphrag/graphrag/index/workflows/create_final_documents.py b/packages/graphrag/graphrag/index/workflows/create_final_documents.py
index 57c67229e..ccbd96782 100644
--- a/packages/graphrag/graphrag/index/workflows/create_final_documents.py
+++ b/packages/graphrag/graphrag/index/workflows/create_final_documents.py
@@ -65,10 +65,4 @@ def create_final_documents(
         copy=False,
     ).reset_index(drop=True)
 
-    rejoined["id"] = rejoined["id"].astype(str)
-    rejoined["human_readable_id"] = rejoined.index
-
-    if "raw_data" not in rejoined.columns:
-        rejoined["raw_data"] = pd.Series(dtype="object")
-
     return rejoined.loc[:, DOCUMENTS_FINAL_COLUMNS]
diff --git a/packages/graphrag/graphrag/index/workflows/load_input_documents.py b/packages/graphrag/graphrag/index/workflows/load_input_documents.py
index ed7f83c8e..8e27ed0a2 100644
--- a/packages/graphrag/graphrag/index/workflows/load_input_documents.py
+++ b/packages/graphrag/graphrag/index/workflows/load_input_documents.py
@@ -39,4 +39,8 @@ async def run_workflow(
 
 async def load_input_documents(input_reader: InputReader) -> pd.DataFrame:
     """Load and parse input documents into a standard format."""
-    return pd.DataFrame(await input_reader.read_files())
+    output = pd.DataFrame(await input_reader.read_files())
+    output["human_readable_id"] = output.index
+    if "raw_data" not in output.columns:
+        output["raw_data"] = pd.Series(dtype="object")
+    return output
diff --git a/packages/graphrag/graphrag/index/workflows/load_update_documents.py b/packages/graphrag/graphrag/index/workflows/load_update_documents.py
index 3f4417d3e..a61a22849 100644
--- a/packages/graphrag/graphrag/index/workflows/load_update_documents.py
+++ b/packages/graphrag/graphrag/index/workflows/load_update_documents.py
@@ -51,6 +51,9 @@ async def load_update_documents(
 ) -> pd.DataFrame:
     """Load and parse update-only input documents into a standard format."""
     input_documents = pd.DataFrame(await input_reader.read_files())
+    input_documents["human_readable_id"] = input_documents.index
+    if "raw_data" not in input_documents.columns:
+        input_documents["raw_data"] = pd.Series(dtype="object")
     # previous table provider has the output of the previous run
     # we'll use this to diff the input from the prior
     delta_documents = await get_delta_docs(input_documents, previous_table_provider)
diff --git a/tests/verbs/test_create_community_reports.py b/tests/verbs/test_create_community_reports.py
index 5a40b0544..68d8d1be9 100644
--- a/tests/verbs/test_create_community_reports.py
+++ b/tests/verbs/test_create_community_reports.py
@@ -3,14 +3,14 @@
 
 from graphrag.data_model.schemas import COMMUNITY_REPORTS_FINAL_COLUMNS
 
-from graphrag.index.workflows.create_community_reports import (
-    run_workflow,
-)
-
 from graphrag.index.operations.summarize_communities.community_reports_extractor import (
     CommunityReportResponse,
     FindingModel,
 )
+from graphrag.index.workflows.create_community_reports import (
+    run_workflow,
+)
+
 from tests.unit.config.utils import get_default_graphrag_config
 
 from .util import (
diff --git a/unified-search-app/app/app_logic.py b/unified-search-app/app/app_logic.py
index a573b9daa..dc64e0e77 100644
--- a/unified-search-app/app/app_logic.py
+++ b/unified-search-app/app/app_logic.py
@@ -7,6 +7,7 @@
 import logging
 from typing import TYPE_CHECKING
 
+import graphrag.api as api
 import streamlit as st
 from knowledge_loader.data_sources.loader import (
     create_datasource,
@@ -17,8 +18,6 @@
 from state.session_variables import SessionVariables
 from ui.search import display_search_result
 
-import graphrag.api as api
-
 if TYPE_CHECKING:
     import pandas as pd
 