-
Notifications
You must be signed in to change notification settings - Fork 201
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: filesystem delete old pipeline state files #1838
Changes from all commits
4e11f00
577d604
8baaf0b
d235052
777dddd
350a5da
aea5142
177dc6c
f625c22
4b1ca4f
2968371
67b3b3b
4a390ba
a91d3bd
c47b149
06bf3f8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
import base64 | ||
|
||
from types import TracebackType | ||
from typing import Dict, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast | ||
from typing import Dict, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast, Any | ||
from fsspec import AbstractFileSystem | ||
from contextlib import contextmanager | ||
|
||
|
@@ -479,7 +479,9 @@ def _to_path_safe_string(self, s: str) -> str: | |
"""for base64 strings""" | ||
return base64.b64decode(s).hex() if s else None | ||
|
||
def _list_dlt_table_files(self, table_name: str) -> Iterator[Tuple[str, List[str]]]: | ||
def _list_dlt_table_files( | ||
self, table_name: str, pipeline_name: str = None | ||
) -> Iterator[Tuple[str, List[str]]]: | ||
dirname = self.get_table_dir(table_name) | ||
if not self.fs_client.exists(self.pathlib.join(dirname, INIT_FILE_NAME)): | ||
raise DestinationUndefinedEntity({"dir": dirname}) | ||
|
@@ -488,7 +490,9 @@ def _list_dlt_table_files(self, table_name: str) -> Iterator[Tuple[str, List[str | |
fileparts = filename.split(FILENAME_SEPARATOR) | ||
if len(fileparts) != 3: | ||
continue | ||
yield filepath, fileparts | ||
# Filters only if pipeline_name provided | ||
if pipeline_name is None or fileparts[0] == pipeline_name: | ||
yield filepath, fileparts | ||
|
||
def _store_load(self, load_id: str) -> None: | ||
# write entry to load "table" | ||
|
@@ -523,6 +527,31 @@ def _get_state_file_name(self, pipeline_name: str, version_hash: str, load_id: s | |
f"{pipeline_name}{FILENAME_SEPARATOR}{load_id}{FILENAME_SEPARATOR}{self._to_path_safe_string(version_hash)}.jsonl", | ||
) | ||
|
||
def _cleanup_pipeline_states(self, pipeline_name: str) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the ticket it says that we should make sure to not delete state files attached to failed loads, but we are not saving state on failed loads, so we should be good here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, indeed. @rudolfix specified:
Will partial loads store a state file? If not, we can keep the code as it is. |
||
state_table_files = list( | ||
self._list_dlt_table_files(self.schema.state_table_name, pipeline_name) | ||
) | ||
|
||
if len(state_table_files) > self.config.max_state_files: | ||
# filter and collect a list of state files | ||
state_file_info: List[Dict[str, Any]] = [ | ||
{ | ||
"load_id": float(fileparts[1]), # convert load_id to float for comparison | ||
"filepath": filepath, | ||
} | ||
for filepath, fileparts in state_table_files | ||
] | ||
|
||
# sort state file info by load_id in descending order | ||
state_file_info.sort(key=lambda x: x["load_id"], reverse=True) | ||
|
||
# keeping only the most recent MAX_STATE_HISTORY files | ||
files_to_delete = state_file_info[self.config.max_state_files :] | ||
|
||
# delete the old files | ||
for file_info in files_to_delete: | ||
donotpush marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self._delete_file(file_info["filepath"]) | ||
|
||
def _store_current_state(self, load_id: str) -> None: | ||
# don't save the state this way when used as staging | ||
if self.config.as_staging_destination: | ||
|
@@ -542,6 +571,10 @@ def _store_current_state(self, load_id: str) -> None: | |
# write | ||
self._write_to_json_file(hash_path, cast(DictStrAny, pipeline_state_doc)) | ||
|
||
# perform state cleanup only if max_state_files is set to a positive value | ||
if self.config.max_state_files >= 1: | ||
self._cleanup_pipeline_states(pipeline_name) | ||
|
||
def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: | ||
# search newest state | ||
selected_path = None | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you have changed the behavior of this function but not updated the other places where it is used, this will no longer just list the files of the current pipeline by default, so please double check that there are no surprising side effects :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The function behaviour hasn't changed for the other places because
pipeline_name
is set to None by default. And the condition will always be True when pipeline_name is None - returning all the files without filtering. The functions
_iter_stored_schema_files
and_list_dlt_table_files
aren't affected by it.