Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,18 @@
],
"console": "integratedTerminal"
},
{
"name": "Update",
"type": "debugpy",
"request": "launch",
"module": "graphrag",
"args": [
"update",
"--root",
"${input:root_folder}"
],
"console": "integratedTerminal"
},
{
"name": "Query",
"type": "debugpy",
Expand Down
12 changes: 6 additions & 6 deletions docs/config/yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ Our pipeline can ingest .csv, .txt, or .json data from an input location. See th
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `account_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
- `type` **text|csv|json** - The type of input data to load. Default is `text`
- `encoding` **str** - The encoding of the input file. Default is `utf-8`
Expand Down Expand Up @@ -118,10 +118,10 @@ This section controls the storage mechanism used by the pipeline used for export
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `account_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.

### update_index_output
### update_output_storage

This section defines a secondary storage location used when running incremental indexing, so that your original outputs are preserved.

Expand All @@ -131,7 +131,7 @@ The section defines a secondary storage location for running incremental indexin
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `account_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.

### cache
Expand All @@ -146,7 +146,7 @@ This section controls the cache mechanism used by the pipeline. This is used to
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `account_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.

### reporting
Expand All @@ -159,7 +159,7 @@ This section controls the reporting mechanism used by the pipeline, for common e
- `base_dir` **str** - The base directory to write reports to, relative to the root.
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `storage_account_blob_url` **str** - The storage account blob URL to use.
- `account_url` **str** - (blob only) The storage account blob URL to use.

### vector_store

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class AzureBlobStorage(Storage):
_container_name: str
_base_dir: str | None
_encoding: str
_storage_account_blob_url: str | None
_account_url: str | None
_blob_service_client: BlobServiceClient
_storage_account_name: str | None

Expand All @@ -42,7 +42,7 @@ def __init__(
) -> None:
"""Create a new BlobStorage instance."""
if connection_string is not None and account_url is not None:
msg = "AzureBlobStorage requires only one of connection_string or storage_account_blob_url to be specified, not both."
msg = "AzureBlobStorage requires only one of connection_string or account_url to be specified, not both."
logger.error(msg)
raise ValueError(msg)

Expand All @@ -63,15 +63,15 @@ def __init__(
credential=DefaultAzureCredential(),
)
else:
msg = "AzureBlobStorage requires either a connection_string or storage_account_blob_url to be specified."
msg = "AzureBlobStorage requires either a connection_string or account_url to be specified."
logger.error(msg)
raise ValueError(msg)

self._encoding = encoding
self._container_name = container_name
self._connection_string = connection_string
self._base_dir = base_dir
self._storage_account_blob_url = account_url
self._account_url = account_url
self._storage_account_name = (
account_url.split("//")[1].split(".")[0] if account_url else None
)
Expand Down Expand Up @@ -225,7 +225,7 @@ def child(self, name: str | None) -> "Storage":
container_name=self._container_name,
encoding=self._encoding,
base_dir=path,
account_url=self._storage_account_blob_url,
account_url=self._account_url,
)

def keys(self) -> list[str]:
Expand Down
2 changes: 1 addition & 1 deletion packages/graphrag/graphrag/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ class StorageDefaults:
base_dir: str | None = None
azure_connection_string: None = None
azure_container_name: None = None
azure_storage_account_blob_url: None = None
azure_account_url: None = None
azure_cosmosdb_account_url: None = None


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class ReportingConfig(BaseModel):
description="The reporting container name to use.",
default=graphrag_config_defaults.reporting.container_name,
)
storage_account_blob_url: str | None = Field(
account_url: str | None = Field(
description="The storage account blob url to use.",
default=graphrag_config_defaults.reporting.storage_account_blob_url,
)
2 changes: 1 addition & 1 deletion packages/graphrag/graphrag/index/run/run_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,6 @@ async def _copy_previous_output(
copy_storage: Storage,
):
for file in storage.find(re.compile(r"\.parquet$")):
base_name = file[0].replace(".parquet", "")
base_name = file.replace(".parquet", "")
table = await load_table_from_storage(base_name, storage)
await write_table_to_storage(table, base_name, copy_storage)
14 changes: 7 additions & 7 deletions packages/graphrag/graphrag/logger/blob_workflow_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(
container_name: str | None,
blob_name: str = "",
base_dir: str | None = None,
storage_account_blob_url: str | None = None,
account_url: str | None = None,
level: int = logging.NOTSET,
):
"""Create a new instance of the BlobWorkflowLogger class."""
Expand All @@ -35,24 +35,24 @@ def __init__(
if container_name is None:
msg = "No container name provided for blob storage."
raise ValueError(msg)
if connection_string is None and storage_account_blob_url is None:
if connection_string is None and account_url is None:
msg = "No storage account blob url provided for blob storage."
raise ValueError(msg)

self._connection_string = connection_string
self._storage_account_blob_url = storage_account_blob_url
self.account_url = account_url

if self._connection_string:
self._blob_service_client = BlobServiceClient.from_connection_string(
self._connection_string
)
else:
if storage_account_blob_url is None:
msg = "Either connection_string or storage_account_blob_url must be provided."
if account_url is None:
msg = "Either connection_string or account_url must be provided."
raise ValueError(msg)

self._blob_service_client = BlobServiceClient(
storage_account_blob_url,
account_url,
credential=DefaultAzureCredential(),
)

Expand Down Expand Up @@ -107,7 +107,7 @@ def _write_log(self, log: dict[str, Any]):
self.__init__(
self._connection_string,
self._container_name,
storage_account_blob_url=self._storage_account_blob_url,
account_url=self.account_url,
)

blob_client = self._blob_service_client.get_blob_client(
Expand Down
2 changes: 1 addition & 1 deletion packages/graphrag/graphrag/logger/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def create_blob_logger(**kwargs) -> logging.Handler:
connection_string=kwargs["connection_string"],
container_name=kwargs["container_name"],
base_dir=kwargs["base_dir"],
storage_account_blob_url=kwargs["storage_account_blob_url"],
account_url=kwargs["account_url"],
)


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/config/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def assert_reporting_configs(
assert actual.base_dir == expected.base_dir
assert actual.connection_string == expected.connection_string
assert actual.container_name == expected.container_name
assert actual.storage_account_blob_url == expected.storage_account_blob_url
assert actual.account_url == expected.account_url


def assert_storage_config(actual: StorageConfig, expected: StorageConfig) -> None:
Expand Down