diff --git a/.semversioner/next-release/major-20250909010205372690.json b/.semversioner/next-release/major-20250909010205372690.json
new file mode 100644
index 0000000000..537045e947
--- /dev/null
+++ b/.semversioner/next-release/major-20250909010205372690.json
@@ -0,0 +1,4 @@
+{
+    "type": "major",
+    "description": "Remove document filtering option."
+}
diff --git a/docs/config/yaml.md b/docs/config/yaml.md
index 7d7dcdd9e2..adf1c3cabb 100644
--- a/docs/config/yaml.md
+++ b/docs/config/yaml.md
@@ -87,7 +87,6 @@ Our pipeline can ingest .csv, .txt, or .json data from an input location. See th
 - `file_type` **text|csv|json** - The type of input data to load. Default is `text`
 - `encoding` **str** - The encoding of the input file. Default is `utf-8`
 - `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `file_type`, but you can customize it if needed.
-- `file_filter` **dict** - Key/value pairs to filter. Default is None.
 - `text_column` **str** - (CSV/JSON only) The text column name. If unset we expect a column named `text`.
 - `title_column` **str** - (CSV/JSON only) The title column name, filename will be used if unset.
 - `metadata` **list[str]** - (CSV/JSON only) The additional document attributes fields to keep.
diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py
index dc84fc4595..f1ee7d23c5 100644
--- a/graphrag/config/defaults.py
+++ b/graphrag/config/defaults.py
@@ -262,7 +262,6 @@ class InputDefaults:
     file_type: ClassVar[InputFileType] = InputFileType.text
     encoding: str = "utf-8"
     file_pattern: str = ""
-    file_filter: None = None
     text_column: str = "text"
     title_column: None = None
     metadata: None = None
diff --git a/graphrag/config/models/input_config.py b/graphrag/config/models/input_config.py
index 139dd90c46..bc34d9402d 100644
--- a/graphrag/config/models/input_config.py
+++ b/graphrag/config/models/input_config.py
@@ -32,10 +32,6 @@ class InputConfig(BaseModel):
         description="The input file pattern to use.",
         default=graphrag_config_defaults.input.file_pattern,
     )
-    file_filter: dict[str, str] | None = Field(
-        description="The optional file filter for the input files.",
-        default=graphrag_config_defaults.input.file_filter,
-    )
     text_column: str = Field(
         description="The input text column to use.",
         default=graphrag_config_defaults.input.text_column,
diff --git a/graphrag/index/input/csv.py b/graphrag/index/input/csv.py
index eea4864b17..7db033debb 100644
--- a/graphrag/index/input/csv.py
+++ b/graphrag/index/input/csv.py
@@ -22,19 +22,10 @@ async def load_csv(
     """Load csv inputs from a directory."""
     logger.info("Loading csv files from %s", config.storage.base_dir)

-    async def load_file(path: str, group: dict | None) -> pd.DataFrame:
-        if group is None:
-            group = {}
+    async def load_file(path: str) -> pd.DataFrame:
         buffer = BytesIO(await storage.get(path, as_bytes=True))
         data = pd.read_csv(buffer, encoding=config.encoding)
-        additional_keys = group.keys()
-        if len(additional_keys) > 0:
-            data[[*additional_keys]] = data.apply(
-                lambda _row: pd.Series([group[key] for key in additional_keys]), axis=1
-            )
-
         data = process_data_columns(data, config, path)
-
         creation_date = await storage.get_creation_date(path)
         data["creation_date"] = data.apply(lambda _: creation_date, axis=1)

diff --git a/graphrag/index/input/json.py b/graphrag/index/input/json.py
index e2880bc888..41343fd2b2 100644
--- a/graphrag/index/input/json.py
+++ b/graphrag/index/input/json.py
@@ -22,23 +22,13 @@ async def load_json(
     """Load json inputs from a directory."""
     logger.info("Loading json files from %s", config.storage.base_dir)

-    async def load_file(path: str, group: dict | None) -> pd.DataFrame:
-        if group is None:
-            group = {}
+    async def load_file(path: str) -> pd.DataFrame:
         text = await storage.get(path, encoding=config.encoding)
         as_json = json.loads(text)
         # json file could just be a single object, or an array of objects
         rows = as_json if isinstance(as_json, list) else [as_json]
         data = pd.DataFrame(rows)
-
-        additional_keys = group.keys()
-        if len(additional_keys) > 0:
-            data[[*additional_keys]] = data.apply(
-                lambda _row: pd.Series([group[key] for key in additional_keys]), axis=1
-            )
-
         data = process_data_columns(data, config, path)
-
         creation_date = await storage.get_creation_date(path)
         data["creation_date"] = data.apply(lambda _: creation_date, axis=1)

diff --git a/graphrag/index/input/text.py b/graphrag/index/input/text.py
index 1834a532eb..f1fc74352f 100644
--- a/graphrag/index/input/text.py
+++ b/graphrag/index/input/text.py
@@ -22,11 +22,9 @@ async def load_text(
 ) -> pd.DataFrame:
     """Load text inputs from a directory."""

-    async def load_file(path: str, group: dict | None = None) -> pd.DataFrame:
-        if group is None:
-            group = {}
+    async def load_file(path: str) -> pd.DataFrame:
         text = await storage.get(path, encoding=config.encoding)
-        new_item = {**group, "text": text}
+        new_item = {"text": text}
         new_item["id"] = gen_sha512_hash(new_item, new_item.keys())
         new_item["title"] = str(Path(path).name)
         new_item["creation_date"] = await storage.get_creation_date(path)
diff --git a/graphrag/index/input/util.py b/graphrag/index/input/util.py
index cc15183673..457c1864a9 100644
--- a/graphrag/index/input/util.py
+++ b/graphrag/index/input/util.py
@@ -22,12 +22,7 @@ async def load_files(
     storage: PipelineStorage,
 ) -> pd.DataFrame:
     """Load files from storage and apply a loader function."""
-    files = list(
-        storage.find(
-            re.compile(config.file_pattern),
-            file_filter=config.file_filter,
-        )
-    )
+    files = list(storage.find(re.compile(config.file_pattern)))

     if len(files) == 0:
         msg = f"No {config.file_type} files found in {config.storage.base_dir}"
@@ -35,9 +30,9 @@
 
     files_loaded = []

-    for file, group in files:
+    for file in files:
         try:
-            files_loaded.append(await loader(file, group))
+            files_loaded.append(await loader(file))
         except Exception as e:  # noqa: BLE001 (catching Exception is fine here)
             logger.warning("Warning! Error loading file %s. Skipping...", file)
             logger.warning("Error: %s", e)
diff --git a/graphrag/storage/blob_pipeline_storage.py b/graphrag/storage/blob_pipeline_storage.py
index 2a5ba8c41f..5a00af85b5 100644
--- a/graphrag/storage/blob_pipeline_storage.py
+++ b/graphrag/storage/blob_pipeline_storage.py
@@ -101,15 +101,13 @@ def find(
         self,
         file_pattern: re.Pattern[str],
         base_dir: str | None = None,
-        file_filter: dict[str, Any] | None = None,
         max_count=-1,
-    ) -> Iterator[tuple[str, dict[str, Any]]]:
-        """Find blobs in a container using a file pattern, as well as a custom filter function.
+    ) -> Iterator[str]:
+        """Find blobs in a container using a file pattern.

         Params:
             base_dir: The name of the base container.
             file_pattern: The file pattern to use.
-            file_filter: A dictionary of key-value pairs to filter the blobs.
             max_count: The maximum number of blobs to return. If -1, all blobs are returned.

         Returns
@@ -131,14 +129,6 @@ def _blobname(blob_name: str) -> str:
                 blob_name = blob_name[1:]
             return blob_name

-        def item_filter(item: dict[str, Any]) -> bool:
-            if file_filter is None:
-                return True
-
-            return all(
-                re.search(value, item[key]) for key, value in file_filter.items()
-            )
-
         try:
             container_client = self._blob_service_client.get_container_client(
                 self._container_name
@@ -151,14 +141,10 @@ def item_filter(item: dict[str, Any]) -> bool:
             for blob in all_blobs:
                 match = file_pattern.search(blob.name)
                 if match and blob.name.startswith(base_dir):
-                    group = match.groupdict()
-                    if item_filter(group):
-                        yield (_blobname(blob.name), group)
-                        num_loaded += 1
-                        if max_count > 0 and num_loaded >= max_count:
-                            break
-                    else:
-                        num_filtered += 1
+                    yield _blobname(blob.name)
+                    num_loaded += 1
+                    if max_count > 0 and num_loaded >= max_count:
+                        break
                 else:
                     num_filtered += 1
             logger.debug(
@@ -169,10 +155,9 @@ def item_filter(item: dict[str, Any]) -> bool:
             )
         except Exception:  # noqa: BLE001
             logger.warning(
-                "Error finding blobs: base_dir=%s, file_pattern=%s, file_filter=%s",
+                "Error finding blobs: base_dir=%s, file_pattern=%s",
                 base_dir,
                 file_pattern,
-                file_filter,
             )

     async def get(
diff --git a/graphrag/storage/cosmosdb_pipeline_storage.py b/graphrag/storage/cosmosdb_pipeline_storage.py
index 795d071df8..8d2673e89b 100644
--- a/graphrag/storage/cosmosdb_pipeline_storage.py
+++ b/graphrag/storage/cosmosdb_pipeline_storage.py
@@ -121,15 +121,13 @@ def find(
         self,
         file_pattern: re.Pattern[str],
         base_dir: str | None = None,
-        file_filter: dict[str, Any] | None = None,
         max_count=-1,
-    ) -> Iterator[tuple[str, dict[str, Any]]]:
-        """Find documents in a Cosmos DB container using a file pattern regex and custom file filter (optional).
+    ) -> Iterator[str]:
+        """Find documents in a Cosmos DB container using a file pattern regex.

         Params:
             base_dir: The name of the base directory (not used in Cosmos DB context).
             file_pattern: The file pattern to use.
-            file_filter: A dictionary of key-value pairs to filter the documents.
             max_count: The maximum number of documents to return. If -1, all documents are returned.

         Returns
@@ -145,23 +143,12 @@ def find(
         if not self._database_client or not self._container_client:
             return

-        def item_filter(item: dict[str, Any]) -> bool:
-            if file_filter is None:
-                return True
-            return all(
-                re.search(value, item.get(key, ""))
-                for key, value in file_filter.items()
-            )
-
         try:
             query = "SELECT * FROM c WHERE RegexMatch(c.id, @pattern)"
             parameters: list[dict[str, Any]] = [
                 {"name": "@pattern", "value": file_pattern.pattern}
             ]
-            if file_filter:
-                for key, value in file_filter.items():
-                    query += f" AND c.{key} = @{key}"
-                    parameters.append({"name": f"@{key}", "value": value})
+
             items = list(
                 self._container_client.query_items(
                     query=query,
@@ -177,14 +164,10 @@ def item_filter(item: dict[str, Any]) -> bool:
             for item in items:
                 match = file_pattern.search(item["id"])
                 if match:
-                    group = match.groupdict()
-                    if item_filter(group):
-                        yield (item["id"], group)
-                        num_loaded += 1
-                        if max_count > 0 and num_loaded >= max_count:
-                            break
-                    else:
-                        num_filtered += 1
+                    yield item["id"]
+                    num_loaded += 1
+                    if max_count > 0 and num_loaded >= max_count:
+                        break
                 else:
                     num_filtered += 1

diff --git a/graphrag/storage/file_pipeline_storage.py b/graphrag/storage/file_pipeline_storage.py
index a52a92d36d..15445b0d3a 100644
--- a/graphrag/storage/file_pipeline_storage.py
+++ b/graphrag/storage/file_pipeline_storage.py
@@ -41,18 +41,9 @@ def find(
         self,
         file_pattern: re.Pattern[str],
         base_dir: str | None = None,
-        file_filter: dict[str, Any] | None = None,
         max_count=-1,
-    ) -> Iterator[tuple[str, dict[str, Any]]]:
-        """Find files in the storage using a file pattern, as well as a custom filter function."""
-
-        def item_filter(item: dict[str, Any]) -> bool:
-            if file_filter is None:
-                return True
-            return all(
-                re.search(value, item[key]) for key, value in file_filter.items()
-            )
-
+    ) -> Iterator[str]:
+        """Find files in the storage using a file pattern."""
         search_path = Path(self._root_dir) / (base_dir or "")
         logger.info(
             "search %s for files matching %s", search_path, file_pattern.pattern
@@ -64,17 +55,13 @@ def item_filter(item: dict[str, Any]) -> bool:
         for file in all_files:
             match = file_pattern.search(f"{file}")
             if match:
-                group = match.groupdict()
-                if item_filter(group):
-                    filename = f"{file}".replace(self._root_dir, "")
-                    if filename.startswith(os.sep):
-                        filename = filename[1:]
-                    yield (filename, group)
-                    num_loaded += 1
-                    if max_count > 0 and num_loaded >= max_count:
-                        break
-                else:
-                    num_filtered += 1
+                filename = f"{file}".replace(self._root_dir, "")
+                if filename.startswith(os.sep):
+                    filename = filename[1:]
+                yield filename
+                num_loaded += 1
+                if max_count > 0 and num_loaded >= max_count:
+                    break
             else:
                 num_filtered += 1
         logger.debug(
diff --git a/graphrag/storage/pipeline_storage.py b/graphrag/storage/pipeline_storage.py
index ed117a577d..ba3ab86e97 100644
--- a/graphrag/storage/pipeline_storage.py
+++ b/graphrag/storage/pipeline_storage.py
@@ -18,10 +18,9 @@ def find(
         self,
         file_pattern: re.Pattern[str],
         base_dir: str | None = None,
-        file_filter: dict[str, Any] | None = None,
         max_count=-1,
-    ) -> Iterator[tuple[str, dict[str, Any]]]:
-        """Find files in the storage using a file pattern, as well as a custom filter function."""
+    ) -> Iterator[str]:
+        """Find files in the storage using a file pattern."""

     @abstractmethod
     async def get(
diff --git a/tests/integration/storage/test_blob_pipeline_storage.py b/tests/integration/storage/test_blob_pipeline_storage.py
index 24e380d20e..f99a74ff74 100644
--- a/tests/integration/storage/test_blob_pipeline_storage.py
+++ b/tests/integration/storage/test_blob_pipeline_storage.py
@@ -21,7 +21,6 @@ async def test_find():
         items = list(
             storage.find(base_dir="input", file_pattern=re.compile(r".*\.txt$"))
         )
-        items = [item[0] for item in items]
         assert items == []

         await storage.set(
@@ -30,12 +29,10 @@ async def test_find():
         items = list(
             storage.find(base_dir="input", file_pattern=re.compile(r".*\.txt$"))
         )
-        items = [item[0] for item in items]
         assert items == ["input/christmas.txt"]

         await storage.set("test.txt", "Hello, World!", encoding="utf-8")
         items = list(storage.find(file_pattern=re.compile(r".*\.txt$")))
-        items = [item[0] for item in items]
         assert items == ["input/christmas.txt", "test.txt"]

         output = await storage.get("test.txt")
@@ -57,7 +54,6 @@ async def test_dotprefix():
     try:
         await storage.set("input/christmas.txt", "Merry Christmas!", encoding="utf-8")
         items = list(storage.find(file_pattern=re.compile(r".*\.txt$")))
-        items = [item[0] for item in items]
         assert items == ["input/christmas.txt"]
     finally:
         storage._delete_container()  # noqa: SLF001
@@ -91,12 +87,10 @@ async def test_child():
         storage = parent.child("input")
         await storage.set("christmas.txt", "Merry Christmas!", encoding="utf-8")
         items = list(storage.find(re.compile(r".*\.txt$")))
-        items = [item[0] for item in items]
         assert items == ["christmas.txt"]

         await storage.set("test.txt", "Hello, World!", encoding="utf-8")
         items = list(storage.find(re.compile(r".*\.txt$")))
-        items = [item[0] for item in items]
         print("FOUND", items)
         assert items == ["christmas.txt", "test.txt"]

@@ -104,7 +98,6 @@
         output = await storage.get("test.txt")
         assert output == "Hello, World!"
         items = list(parent.find(re.compile(r".*\.txt$")))
-        items = [item[0] for item in items]
         print("FOUND ITEMS", items)
         assert items == ["input/christmas.txt", "input/test.txt"]
     finally:
diff --git a/tests/integration/storage/test_cosmosdb_storage.py b/tests/integration/storage/test_cosmosdb_storage.py
index 529be52256..af08972fb3 100644
--- a/tests/integration/storage/test_cosmosdb_storage.py
+++ b/tests/integration/storage/test_cosmosdb_storage.py
@@ -30,7 +30,6 @@ async def test_find():
     try:
         try:
             items = list(storage.find(file_pattern=re.compile(r".*\.json$")))
-            items = [item[0] for item in items]
             assert items == []

             json_content = {
@@ -40,7 +39,6 @@ async def test_find():
                 "christmas.json", json.dumps(json_content), encoding="utf-8"
             )
             items = list(storage.find(file_pattern=re.compile(r".*\.json$")))
-            items = [item[0] for item in items]
             assert items == ["christmas.json"]

             json_content = {
@@ -48,7 +46,6 @@ async def test_find():
             }
             await storage.set("test.json", json.dumps(json_content), encoding="utf-8")
             items = list(storage.find(file_pattern=re.compile(r".*\.json$")))
-            items = [item[0] for item in items]
             assert items == ["christmas.json", "test.json"]

             output = await storage.get("test.json")
diff --git a/tests/integration/storage/test_factory.py b/tests/integration/storage/test_factory.py
index 3d21e50e9f..0b59366ef9 100644
--- a/tests/integration/storage/test_factory.py
+++ b/tests/integration/storage/test_factory.py
@@ -117,9 +117,8 @@ def find(
         self,
         file_pattern: re.Pattern[str],
         base_dir: str | None = None,
-        file_filter: dict[str, Any] | None = None,
         max_count=-1,
-    ) -> Iterator[tuple[str, dict[str, Any]]]:
+    ) -> Iterator[str]:
         return iter([])

     async def get(
diff --git a/tests/integration/storage/test_file_pipeline_storage.py b/tests/integration/storage/test_file_pipeline_storage.py
index be58476480..cc5b3f7c83 100644
--- a/tests/integration/storage/test_file_pipeline_storage.py
+++ b/tests/integration/storage/test_file_pipeline_storage.py
@@ -18,12 +18,10 @@ async def test_find():
     storage = FilePipelineStorage()
     items = list(
         storage.find(
-            base_dir="tests/fixtures/text/input",
-            file_pattern=re.compile(r".*\.txt$"),
-            file_filter=None,
+            base_dir="tests/fixtures/text/input", file_pattern=re.compile(r".*\.txt$")
         )
     )
-    assert items == [(str(Path("tests/fixtures/text/input/dulce.txt")), {})]
+    assert items == [str(Path("tests/fixtures/text/input/dulce.txt"))]

     output = await storage.get("tests/fixtures/text/input/dulce.txt")
     assert len(output) > 0
@@ -52,7 +50,7 @@ async def test_child():
     storage = FilePipelineStorage()
     storage = storage.child("tests/fixtures/text/input")
     items = list(storage.find(re.compile(r".*\.txt$")))
-    assert items == [(str(Path("dulce.txt")), {})]
+    assert items == [str(Path("dulce.txt"))]

     output = await storage.get("dulce.txt")
     assert len(output) > 0
diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py
index 2fa5b14107..1cfc736372 100644
--- a/tests/unit/config/utils.py
+++ b/tests/unit/config/utils.py
@@ -175,7 +175,6 @@ def assert_input_configs(actual: InputConfig, expected: InputConfig) -> None:
     assert actual.storage.container_name == expected.storage.container_name
     assert actual.encoding == expected.encoding
     assert actual.file_pattern == expected.file_pattern
-    assert actual.file_filter == expected.file_filter
     assert actual.text_column == expected.text_column
     assert actual.title_column == expected.title_column
     assert actual.metadata == expected.metadata
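
Reviewer note (not part of the patch): the API change this diff makes is that `PipelineStorage.find()` now yields plain path strings instead of `(path, regex_groups)` tuples, and the `file_filter` keyword is gone from every backend and from `InputConfig`. A minimal call-site migration sketch, assuming any concrete `PipelineStorage` implementation; `collect_texts` is a hypothetical helper, not code from this PR:

```python
import re

from graphrag.storage.pipeline_storage import PipelineStorage


async def collect_texts(storage: PipelineStorage) -> list[str]:
    """Hypothetical helper: read every .txt file visible to a storage backend."""
    # Before this change, find() yielded (path, match.groupdict()) tuples and
    # accepted an optional file_filter dict, so callers unpacked:
    #     for path, _group in storage.find(pattern, file_filter={...}): ...
    # After this change, find() yields bare path strings.
    pattern = re.compile(r".*\.txt$")
    return [
        await storage.get(path, encoding="utf-8") for path in storage.find(pattern)
    ]
```

Filtering previously expressed as `file_filter` key/value regexes over named pattern groups can usually be folded into `file_pattern` itself, since the pattern already gates which paths are yielded. Configurations that still set `input.file_filter` will need that key removed after upgrading, which is why this lands as a major (breaking) change.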