diff --git a/.vscode/launch.json b/.vscode/launch.json index 9f949cb12f..3d563d6f74 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -14,6 +14,18 @@ ], "console": "integratedTerminal" }, + { + "name": "Update", + "type": "debugpy", + "request": "launch", + "module": "graphrag", + "args": [ + "update", + "--root", + "${input:root_folder}" + ], + "console": "integratedTerminal" + }, { "name": "Query", "type": "debugpy", diff --git a/docs/config/yaml.md b/docs/config/yaml.md index 1885db1fe5..ac724b6ce8 100644 --- a/docs/config/yaml.md +++ b/docs/config/yaml.md @@ -85,7 +85,7 @@ Our pipeline can ingest .csv, .txt, or .json data from an input location. See th - `base_dir` **str** - The base directory to write output artifacts to, relative to the root. - `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string. - `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name. - - `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use. + - `account_url` **str** - (blob only) The storage account blob URL to use. - `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use. - `type` **text|csv|json** - The type of input data to load. Default is `text` - `encoding` **str** - The encoding of the input file. Default is `utf-8` @@ -118,10 +118,10 @@ This section controls the storage mechanism used by the pipeline used for export - `base_dir` **str** - The base directory to write output artifacts to, relative to the root. - `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string. - `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name. -- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use. +- `account_url` **str** - (blob only) The storage account blob URL to use. - `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use. 
-### update_index_output +### update_output_storage The section defines a secondary storage location for running incremental indexing, to preserve your original outputs. @@ -131,7 +131,7 @@ The section defines a secondary storage location for running incremental indexin - `base_dir` **str** - The base directory to write output artifacts to, relative to the root. - `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string. - `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name. -- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use. +- `account_url` **str** - (blob only) The storage account blob URL to use. - `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use. ### cache @@ -146,7 +146,7 @@ This section controls the cache mechanism used by the pipeline. This is used to - `base_dir` **str** - The base directory to write output artifacts to, relative to the root. - `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string. - `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name. - - `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use. + - `account_url` **str** - (blob only) The storage account blob URL to use. - `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use. ### reporting @@ -159,7 +159,7 @@ This section controls the reporting mechanism used by the pipeline, for common e - `base_dir` **str** - The base directory to write reports to, relative to the root. - `connection_string` **str** - (blob only) The Azure Storage connection string. - `container_name` **str** - (blob only) The Azure Storage container name. -- `storage_account_blob_url` **str** - The storage account blob URL to use. +- `account_url` **str** - (blob only) The storage account blob URL to use. 
### vector_store diff --git a/packages/graphrag-storage/graphrag_storage/azure_blob_storage.py b/packages/graphrag-storage/graphrag_storage/azure_blob_storage.py index bec1bdb465..68cca2c017 100644 --- a/packages/graphrag-storage/graphrag_storage/azure_blob_storage.py +++ b/packages/graphrag-storage/graphrag_storage/azure_blob_storage.py @@ -27,7 +27,7 @@ class AzureBlobStorage(Storage): _container_name: str _base_dir: str | None _encoding: str - _storage_account_blob_url: str | None + _account_url: str | None _blob_service_client: BlobServiceClient _storage_account_name: str | None @@ -42,7 +42,7 @@ def __init__( ) -> None: """Create a new BlobStorage instance.""" if connection_string is not None and account_url is not None: - msg = "AzureBlobStorage requires only one of connection_string or storage_account_blob_url to be specified, not both." + msg = "AzureBlobStorage requires only one of connection_string or account_url to be specified, not both." logger.error(msg) raise ValueError(msg) @@ -63,7 +63,7 @@ def __init__( credential=DefaultAzureCredential(), ) else: - msg = "AzureBlobStorage requires either a connection_string or storage_account_blob_url to be specified." + msg = "AzureBlobStorage requires either a connection_string or account_url to be specified." 
logger.error(msg) raise ValueError(msg) @@ -71,7 +71,7 @@ def __init__( self._container_name = container_name self._connection_string = connection_string self._base_dir = base_dir - self._storage_account_blob_url = account_url + self._account_url = account_url self._storage_account_name = ( account_url.split("//")[1].split(".")[0] if account_url else None ) @@ -225,7 +225,7 @@ def child(self, name: str | None) -> "Storage": container_name=self._container_name, encoding=self._encoding, base_dir=path, - account_url=self._storage_account_blob_url, + account_url=self._account_url, ) def keys(self) -> list[str]: diff --git a/packages/graphrag/graphrag/config/defaults.py b/packages/graphrag/graphrag/config/defaults.py index cc07438c85..e0ff8f0ac2 100644 --- a/packages/graphrag/graphrag/config/defaults.py +++ b/packages/graphrag/graphrag/config/defaults.py @@ -221,7 +221,7 @@ class StorageDefaults: base_dir: str | None = None azure_connection_string: None = None azure_container_name: None = None - azure_storage_account_blob_url: None = None + azure_account_url: None = None azure_cosmosdb_account_url: None = None diff --git a/packages/graphrag/graphrag/config/models/reporting_config.py b/packages/graphrag/graphrag/config/models/reporting_config.py index 0e33736058..7443ea2961 100644 --- a/packages/graphrag/graphrag/config/models/reporting_config.py +++ b/packages/graphrag/graphrag/config/models/reporting_config.py @@ -28,7 +28,7 @@ class ReportingConfig(BaseModel): description="The reporting container name to use.", default=graphrag_config_defaults.reporting.container_name, ) - storage_account_blob_url: str | None = Field( + account_url: str | None = Field( description="The storage account blob url to use.", default=graphrag_config_defaults.reporting.storage_account_blob_url, ) diff --git a/packages/graphrag/graphrag/index/run/run_pipeline.py b/packages/graphrag/graphrag/index/run/run_pipeline.py index f82e774423..d552acecac 100644 --- 
a/packages/graphrag/graphrag/index/run/run_pipeline.py +++ b/packages/graphrag/graphrag/index/run/run_pipeline.py @@ -160,6 +160,6 @@ async def _copy_previous_output( copy_storage: Storage, ): for file in storage.find(re.compile(r"\.parquet$")): - base_name = file[0].replace(".parquet", "") + base_name = file.replace(".parquet", "") table = await load_table_from_storage(base_name, storage) await write_table_to_storage(table, base_name, copy_storage) diff --git a/packages/graphrag/graphrag/logger/blob_workflow_logger.py b/packages/graphrag/graphrag/logger/blob_workflow_logger.py index ae4893c6e8..fd7ccac4fe 100644 --- a/packages/graphrag/graphrag/logger/blob_workflow_logger.py +++ b/packages/graphrag/graphrag/logger/blob_workflow_logger.py @@ -26,7 +26,7 @@ def __init__( container_name: str | None, blob_name: str = "", base_dir: str | None = None, - storage_account_blob_url: str | None = None, + account_url: str | None = None, level: int = logging.NOTSET, ): """Create a new instance of the BlobWorkflowLogger class.""" @@ -35,24 +35,24 @@ def __init__( if container_name is None: msg = "No container name provided for blob storage." raise ValueError(msg) - if connection_string is None and storage_account_blob_url is None: + if connection_string is None and account_url is None: msg = "No storage account blob url provided for blob storage." raise ValueError(msg) self._connection_string = connection_string - self._storage_account_blob_url = storage_account_blob_url + self._account_url = account_url if self._connection_string: self._blob_service_client = BlobServiceClient.from_connection_string( self._connection_string ) else: - if storage_account_blob_url is None: - msg = "Either connection_string or storage_account_blob_url must be provided." + if account_url is None: + msg = "Either connection_string or account_url must be provided." 
raise ValueError(msg) self._blob_service_client = BlobServiceClient( - storage_account_blob_url, + account_url, credential=DefaultAzureCredential(), ) @@ -107,7 +107,7 @@ def _write_log(self, log: dict[str, Any]): self.__init__( self._connection_string, self._container_name, - storage_account_blob_url=self._storage_account_blob_url, + account_url=self._account_url, ) blob_client = self._blob_service_client.get_blob_client( diff --git a/packages/graphrag/graphrag/logger/factory.py b/packages/graphrag/graphrag/logger/factory.py index 1094d080f3..7b5d28ef5e 100644 --- a/packages/graphrag/graphrag/logger/factory.py +++ b/packages/graphrag/graphrag/logger/factory.py @@ -54,7 +54,7 @@ def create_blob_logger(**kwargs) -> logging.Handler: connection_string=kwargs["connection_string"], container_name=kwargs["container_name"], base_dir=kwargs["base_dir"], - storage_account_blob_url=kwargs["storage_account_blob_url"], + account_url=kwargs["account_url"], ) diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py index c02285125a..9caa37d332 100644 --- a/tests/unit/config/utils.py +++ b/tests/unit/config/utils.py @@ -123,7 +123,7 @@ def assert_reporting_configs( assert actual.base_dir == expected.base_dir assert actual.connection_string == expected.connection_string assert actual.container_name == expected.container_name - assert actual.storage_account_blob_url == expected.storage_account_blob_url + assert actual.account_url == expected.account_url def assert_storage_config(actual: StorageConfig, expected: StorageConfig) -> None: