Skip to content

Commit eb08bba

Browse files
mpcallanan authored and tensorflower-gardener committed
Create class to manage tf.data service dispatcher snapshots.
This is mostly a 1:1 restructuring with the following changes:
1) Added simple snapshot recovery from on-disk state.
2) Removed all members tracking snapshot, stream, and source completion. I think these may have been structured incorrectly, and either way they weren't tested or used. I'll reevaluate when stream completion is implemented.
3) Removed some validations that weren't tested and/or were related to #1. Will add back after addressing #1.
4) Renamed directory -> path.

PiperOrigin-RevId: 502934739
1 parent 711cdfd commit eb08bba

17 files changed

+654
-303
lines changed

tensorflow/core/data/service/BUILD

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -394,12 +394,15 @@ cc_library(
394394
":journal_proto_cc",
395395
":split_provider",
396396
":task_remover",
397+
":utils",
397398
":validate_utils",
398399
":worker_cc_grpc_proto",
399400
"@com_google_absl//absl/container:flat_hash_map",
400401
"@com_google_absl//absl/container:flat_hash_set",
401402
"@com_google_absl//absl/memory",
403+
"@com_google_absl//absl/strings",
402404
"@com_google_absl//absl/time",
405+
"//tensorflow/core/data/service/snapshot:snapshot_manager",
403406
"//tensorflow/core:core_cpu",
404407
"//tensorflow/core:core_cpu_internal",
405408
"//tensorflow/core:framework",

tensorflow/core/data/service/dispatcher.proto

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -253,8 +253,8 @@ message SnapshotRequest {
253253
// The dataset to snapshot.
254254
DatasetDef dataset = 1;
255255

256-
// The directory to which to materialize the snapshot.
257-
string directory = 2;
256+
// The path to which to materialize the snapshot.
257+
string path = 2;
258258

259259
// The metadata for the snapshot.
260260
experimental.DistributedSnapshotMetadata metadata = 3;
@@ -265,8 +265,8 @@ message SnapshotResponse {}
265265

266266
// Next tag: 4
267267
message GetSnapshotSplitRequest {
268-
// The directory in which the snapshot is being materialized.
269-
string directory = 1;
268+
// The base path of the snapshot materialization.
269+
string base_path = 1;
270270

271271
// The index of the snapshot stream from which to get the split.
272272
int64 stream_index = 2;

tensorflow/core/data/service/dispatcher_client.cc

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -115,13 +115,13 @@ Status DataServiceDispatcherClient::GetSplit(int64_t iteration_id,
115115
}
116116

117117
Status DataServiceDispatcherClient::Snapshot(
118-
const DatasetDef& dataset, const std::string& directory,
118+
const DatasetDef& dataset, const std::string& path,
119119
const experimental::DistributedSnapshotMetadata& metadata) {
120120
TF_RETURN_IF_ERROR(EnsureInitialized());
121121

122122
SnapshotRequest req;
123123
*req.mutable_dataset() = dataset;
124-
req.set_directory(directory);
124+
req.set_path(path);
125125
*req.mutable_metadata() = metadata;
126126

127127
SnapshotResponse resp;
@@ -134,12 +134,12 @@ Status DataServiceDispatcherClient::Snapshot(
134134
}
135135

136136
Status DataServiceDispatcherClient::GetSnapshotSplit(
137-
const std::string& directory, int64_t stream_index, int64_t source_index,
137+
const std::string& base_path, int64_t stream_index, int64_t source_index,
138138
Tensor& split, bool& end_of_splits) {
139139
TF_RETURN_IF_ERROR(EnsureInitialized());
140140

141141
GetSnapshotSplitRequest req;
142-
req.set_directory(directory);
142+
req.set_base_path(base_path);
143143
req.set_stream_index(stream_index);
144144
req.set_source_index(source_index);
145145

tensorflow/core/data/service/dispatcher_client.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -65,14 +65,14 @@ class DataServiceDispatcherClient : public DataServiceClientBase {
6565
bool& end_of_splits);
6666

6767
// Gets the next split for the specified source of a stream of the snapshot in
68-
// `directory`. If `end_of_splits` returns true, then there are no more splits
68+
// `base_path`. If `end_of_splits` returns true, then there are no more splits
6969
// to be processed for the specified stream source.
70-
Status GetSnapshotSplit(const std::string& directory, int64_t stream_index,
70+
Status GetSnapshotSplit(const std::string& base_path, int64_t stream_index,
7171
int64_t source_index, Tensor& split,
7272
bool& end_of_splits);
7373

74-
// Initiates the process of materializing `dataset`'s output to `directory`.
75-
Status Snapshot(const DatasetDef& dataset, const std::string& directory,
74+
// Initiates the process of materializing `dataset`'s output to `path`.
75+
Status Snapshot(const DatasetDef& dataset, const std::string& path,
7676
const experimental::DistributedSnapshotMetadata& metadata);
7777

7878
// Registers a dataset with the tf.data service, and stores the generated

tensorflow/core/data/service/dispatcher_client_test.cc

Lines changed: 15 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -139,44 +139,43 @@ TEST_F(DispatcherClientTest, GetDataServiceConfig) {
139139
EXPECT_EQ(config.deployment_mode(), DEPLOYMENT_MODE_COLOCATED);
140140
}
141141

142-
TEST_F(DispatcherClientTest, SnapshotMetadataAndDatasetDefWritten) {
143-
TF_ASSERT_OK_AND_ASSIGN(absl::flat_hash_set<std::string> directories,
142+
TEST_F(DispatcherClientTest, SkeletonWritten) {
143+
TF_ASSERT_OK_AND_ASSIGN(absl::flat_hash_set<std::string> paths,
144144
StartDummySnapshots());
145-
for (const auto& directory : directories) {
146-
TF_ASSERT_OK(Env::Default()->FileExists(
147-
io::JoinPath(directory, "snapshot.metadata")));
148-
TF_ASSERT_OK(Env::Default()->FileExists(
149-
io::JoinPath(directory, "dataset_def.proto")));
145+
for (const auto& path : paths) {
146+
TF_ASSERT_OK(Env::Default()->FileExists(CommittedChunksDirectory(path)));
147+
TF_ASSERT_OK(Env::Default()->FileExists(StreamsDirectory(path)));
150148
}
151149
}
152150

153-
TEST_F(DispatcherClientTest, CreateCommittedChunksDirectory) {
154-
TF_ASSERT_OK_AND_ASSIGN(absl::flat_hash_set<std::string> directories,
151+
TEST_F(DispatcherClientTest, SnapshotMetadataAndDatasetDefWritten) {
152+
TF_ASSERT_OK_AND_ASSIGN(absl::flat_hash_set<std::string> paths,
155153
StartDummySnapshots());
156-
for (const auto& directory : directories) {
154+
for (const auto& path : paths) {
155+
TF_ASSERT_OK(
156+
Env::Default()->FileExists(io::JoinPath(path, "snapshot.metadata")));
157157
TF_ASSERT_OK(
158-
Env::Default()->FileExists(CommittedChunksDirectory(directory)));
158+
Env::Default()->FileExists(io::JoinPath(path, "dataset_def.proto")));
159159
}
160160
}
161161

162162
TEST_F(DispatcherClientTest, SnapshotsInHeartbeat) {
163-
TF_ASSERT_OK_AND_ASSIGN(absl::flat_hash_set<std::string> directories,
163+
TF_ASSERT_OK_AND_ASSIGN(absl::flat_hash_set<std::string> paths,
164164
StartDummySnapshots());
165165
WorkerHeartbeatRequest worker_heartbeat_request;
166166
worker_heartbeat_request.set_worker_address(test_cluster_->WorkerAddress(0));
167167
TF_ASSERT_OK_AND_ASSIGN(
168168
WorkerHeartbeatResponse worker_heartbeat_response,
169169
dispatcher_client_->WorkerHeartbeat(worker_heartbeat_request));
170-
ASSERT_EQ(worker_heartbeat_response.snapshot_tasks_size(),
171-
directories.size());
170+
ASSERT_EQ(worker_heartbeat_response.snapshot_tasks_size(), paths.size());
172171
for (const auto& snapshot_task : worker_heartbeat_response.snapshot_tasks()) {
173-
ASSERT_TRUE(directories.count(snapshot_task.base_path()));
172+
ASSERT_TRUE(paths.count(snapshot_task.base_path()));
174173
ASSERT_EQ(snapshot_task.stream_index(), 0);
175174
}
176175
}
177176

178177
TEST_F(DispatcherClientTest, GetSnapshotSplit) {
179-
TF_ASSERT_OK_AND_ASSIGN(absl::flat_hash_set<std::string> directories,
178+
TF_ASSERT_OK_AND_ASSIGN(absl::flat_hash_set<std::string> paths,
180179
StartDummySnapshots());
181180
WorkerHeartbeatRequest worker_heartbeat_request;
182181
worker_heartbeat_request.set_worker_address(test_cluster_->WorkerAddress(0));

0 commit comments

Comments
 (0)