From c9a86dbe999a254d4f174aa3c2bdd259831d934b Mon Sep 17 00:00:00 2001
From: Youngteac Hong <susukang98@gmail.com>
Date: Wed, 11 Dec 2024 13:57:45 +0900
Subject: [PATCH] Add missing MongoDB sharding configuration for version
 vectors (#1097)

This commit adds the missing MongoDB sharding configuration for
versionvectors collection introduced in PR #1047, as well as updates
the related documentation to reflect these changes.
---
 .../charts/yorkie-mongodb/values.yaml         |  5 ++
 build/docker/sharding/scripts/init-mongos1.js | 59 +++++++++++--------
 design/mongodb-sharding.md                    | 36 ++++++-----
 server/backend/database/mongo/indexes.go      |  2 +-
 4 files changed, 61 insertions(+), 41 deletions(-)

diff --git a/build/charts/yorkie-cluster/charts/yorkie-mongodb/values.yaml b/build/charts/yorkie-cluster/charts/yorkie-mongodb/values.yaml
index 8a127e542..a271b293a 100644
--- a/build/charts/yorkie-cluster/charts/yorkie-mongodb/values.yaml
+++ b/build/charts/yorkie-cluster/charts/yorkie-mongodb/values.yaml
@@ -57,6 +57,11 @@ sharded:
           - name: doc_id
             method: "1"
         unique: false
+      - collectionName: versionvectors
+        shardKeys:
+          - name: doc_id
+            method: "1"
+        unique: false
 
 # Configuration for manual dmongodb sharded stack
 mongodb-sharded:
diff --git a/build/docker/sharding/scripts/init-mongos1.js b/build/docker/sharding/scripts/init-mongos1.js
index 8942d777f..2fa37b47e 100644
--- a/build/docker/sharding/scripts/init-mongos1.js
+++ b/build/docker/sharding/scripts/init-mongos1.js
@@ -1,39 +1,46 @@
-sh.addShard("shard-rs-1/shard1-1:27017")
-sh.addShard("shard-rs-2/shard2-1:27017")
+sh.addShard("shard-rs-1/shard1-1:27017");
+sh.addShard("shard-rs-2/shard2-1:27017");
 
 function findAnotherShard(shard) {
-    if (shard == "shard-rs-1") {
-        return "shard-rs-2"
-    } else {
-        return "shard-rs-1"
-    }
+  if (shard == "shard-rs-1") {
+    return "shard-rs-2";
+  } else {
+    return "shard-rs-1";
+  }
 }
 
 function shardOfChunk(minKeyOfChunk) {
-    return db.getSiblingDB("config").chunks.findOne({ min: { project_id: minKeyOfChunk } }).shard
+  return db
+    .getSiblingDB("config")
+    .chunks.findOne({ min: { project_id: minKeyOfChunk } }).shard;
 }
 
 // Shard the database for the mongo client test
-const mongoClientDB = "test-yorkie-meta-mongo-client"
-sh.enableSharding(mongoClientDB)
-sh.shardCollection(mongoClientDB + ".clients", { project_id: 1 })
-sh.shardCollection(mongoClientDB + ".documents", { project_id: 1 })
-sh.shardCollection(mongoClientDB + ".changes", { doc_id: 1 })
-sh.shardCollection(mongoClientDB + ".snapshots", { doc_id: 1 })
-sh.shardCollection(mongoClientDB + ".syncedseqs", { doc_id: 1 })
+const mongoClientDB = "test-yorkie-meta-mongo-client";
+sh.enableSharding(mongoClientDB);
+sh.shardCollection(mongoClientDB + ".clients", { project_id: 1 });
+sh.shardCollection(mongoClientDB + ".documents", { project_id: 1 });
+sh.shardCollection(mongoClientDB + ".changes", { doc_id: 1 });
+sh.shardCollection(mongoClientDB + ".snapshots", { doc_id: 1 });
+sh.shardCollection(mongoClientDB + ".syncedseqs", { doc_id: 1 });
+sh.shardCollection(mongoClientDB + ".versionvectors", { doc_id: 1 });
 
 // Split the inital range at `splitPoint` to allow doc_ids duplicate in different shards.
-const splitPoint = ObjectId("500000000000000000000000")
-sh.splitAt(mongoClientDB + ".documents", { project_id: splitPoint })
+const splitPoint = ObjectId("500000000000000000000000");
+sh.splitAt(mongoClientDB + ".documents", { project_id: splitPoint });
 // Move the chunk to another shard.
-db.adminCommand({ moveChunk: mongoClientDB + ".documents", find: { project_id: splitPoint }, to: findAnotherShard(shardOfChunk(splitPoint)) })
+db.adminCommand({
+  moveChunk: mongoClientDB + ".documents",
+  find: { project_id: splitPoint },
+  to: findAnotherShard(shardOfChunk(splitPoint)),
+});
 
 // Shard the database for the server test
-const serverDB = "test-yorkie-meta-server"
-sh.enableSharding(serverDB)
-sh.shardCollection(serverDB + ".clients", { project_id: 1 })
-sh.shardCollection(serverDB + ".documents", { project_id: 1 })
-sh.shardCollection(serverDB + ".changes", { doc_id: 1 })
-sh.shardCollection(serverDB + ".snapshots", { doc_id: 1 })
-sh.shardCollection(serverDB + ".syncedseqs", { doc_id: 1 })
-
+const serverDB = "test-yorkie-meta-server";
+sh.enableSharding(serverDB);
+sh.shardCollection(serverDB + ".clients", { project_id: 1 });
+sh.shardCollection(serverDB + ".documents", { project_id: 1 });
+sh.shardCollection(serverDB + ".changes", { doc_id: 1 });
+sh.shardCollection(serverDB + ".snapshots", { doc_id: 1 });
+sh.shardCollection(serverDB + ".syncedseqs", { doc_id: 1 });
+sh.shardCollection(serverDB + ".versionvectors", { doc_id: 1 });
diff --git a/design/mongodb-sharding.md b/design/mongodb-sharding.md
index 460c49645..92da5833d 100644
--- a/design/mongodb-sharding.md
+++ b/design/mongodb-sharding.md
@@ -1,6 +1,6 @@
 ---
 title: mongodb-sharding
-target-version: 0.4.14
+target-version: 0.5.7
 ---
 
 # MongoDB Sharding
@@ -29,11 +29,10 @@ This document will only explain the core concepts of the selected sharding strat
 
 1. Cluster-wide: `users`, `projects`
 2. Project-wide: `documents`, `clients`
-3. Document-wide: `changes`, `snapshots`, `syncedseqs` 
+3. Document-wide: `changes`, `snapshots`, `syncedseqs`, `versionvectors`
 
 <img src="media/mongodb-sharding-prev-relation.png">
 
-
 **Sharding Goals**
 
 Shard Project-wide and Document-wide collections due to the large number of data count in each collection
@@ -49,6 +48,7 @@ Shard Project-wide and Document-wide collections due to the large number of data
 3. `Changes`: `(doc_id, server_seq)`
 4. `Snapshots`: `(doc_id, server_seq)`
 5. `Syncedseqs`: `(doc_id, client_id)`
+6. `Versionvectors`: `(doc_id, client_id)`
 
 **Main Query Patterns**
 
@@ -64,6 +64,7 @@ cursor, err := c.collection(ColClients).Find(ctx, bson.M{
     },
 }, options.Find().SetLimit(int64(candidatesLimit)))
 ```
+
 ```go
 // Documents
 filter := bson.M{
@@ -106,6 +107,7 @@ cursor, err := c.collection(colChanges).Find(ctx, bson.M{
     },
 }, options.Find())
 ```
+
 ```go
 // Snapshots
 result := c.collection(colSnapshots).FindOne(ctx, bson.M{
@@ -132,6 +134,7 @@ Every unique constraint can be satisfied because each has the shard key as a pre
 3. `Changes`: `(doc_id, server_seq)`
 4. `Snapshots`: `(doc_id, server_seq)`
 5. `Syncedseqs`: `(doc_id, client_id)`
+6. `Versionvectors`: `(doc_id, client_id)`
 
 **Changes of Reference Keys**
 
@@ -142,6 +145,7 @@ Since the uniqueness of `_id` isn't guaranteed across shards, reference keys to
 3. `Changes`: `_id` -> `(project_id, doc_id, server_seq)`
 4. `Snapshots`: `_id` -> `(project_id, doc_id, server_seq)`
 5. `Syncedseqs`: `_id` -> `(project_id, doc_id, client_id)`
+6. `Versionvectors`: `_id` -> `(project_id, doc_id, client_id)`
 
 Considering that MongoDB ensures the uniqueness of `_id` per shard, `Documents` and `Clients` can be identified with the combination of `project_id` and `_id`. Note that the reference keys of document-wide collections are also subsequently changed.
 
@@ -155,12 +159,12 @@ Considering that MongoDB ensures the uniqueness of `_id` per shard, `Documents`
 
 For a production deployment, consider the following to ensure data redundancy and system availability.
 
-* Config Server (3 member replica set): `config1`,`config2`,`config3`
-* 3 Shards (each a 3 member replica set):
-	* `shard1-1`,`shard1-2`, `shard1-3`
-	* `shard2-1`,`shard2-2`, `shard2-3`
-	* `shard3-1`,`shard3-2`, `shard3-3`
-* 2 Mongos: `mongos1`, `mongos2`
+- Config Server (3 member replica set): `config1`,`config2`,`config3`
+- 3 Shards (each a 3 member replica set):
+  - `shard1-1`,`shard1-2`, `shard1-3`
+  - `shard2-1`,`shard2-2`, `shard2-3`
+  - `shard3-1`,`shard3-2`, `shard3-3`
+- 2 Mongos: `mongos1`, `mongos2`
 
 ![Cluster architecture](media/mongodb-sharding-cluster-arch.png)
 
@@ -178,11 +182,12 @@ Using a composite shard key like `(project_id, key)` can resolve this issue. Aft
 However, this change of shard keys can lead to the value duplication of either `actor_id` or `owner`, which uses `client_id` as a value. Now the values of `client_id` can be duplicated, contrary to the current sharding strategy where locating every client in the same project into the same shard prevents this kind of duplications.
 
 The duplication of `client_id` values can devastate the consistency of documents. There are three expected approaches to resolve this issue:
+
 1. Use `client_key + client_id` as a value.
-    * this may increase the size of CRDT metadata and the size of document snapshots.
+   - this may increase the size of CRDT metadata and the size of document snapshots.
 2. Introduce a cluster-level GUID generator.
 3. Depend on the low possibility of duplication of MongoDB ObjectID.
-    * see details in the following contents.
+   - see details in the following contents.
 
 **Duplicate MongoDB ObjectID**
 
@@ -192,6 +197,7 @@ Both `client_id` and `doc_id` are currently using MongoDB ObjectID as a value. W
 
 However, the possibility of duplicate ObjectIDs is **extremely low in practical use cases** due to its mechanism.
 ObjectID uses the following format:
+
 ```
 TimeStamp(4 bytes) + MachineId(3 bytes) + ProcessId(2 bytes) + Counter(3 bytes)
 ```
@@ -199,10 +205,12 @@ TimeStamp(4 bytes) + MachineId(3 bytes) + ProcessId(2 bytes) + Counter(3 bytes)
 The condition for duplicate ObjectIDs is that more than `16,777,216` documents/clients are created every single second in a single machine and process. Considering Google processes over `99,000` searches every single second, it is unlikely to occur.
 
 When we have to meet that amount of traffic in the future, consider the following options:
+
 1. Introduce a cluster-level GUID generator.
 2. Disable auto-balancing chunks of documents and clients.
-   * Just isolate each shard for a single project.
+   - Just isolate each shard for a single project.
 
 ## References
-* [Implementation of ObjectID generator in golang driver](https://github.com/mongodb/mongo-go-driver/blob/v0.0.18/bson/objectid/objectid.go#L40)
-* [Generating globally unique identifiers for use with MongoDB](https://www.mongodb.com/blog/post/generating-globally-unique-identifiers-for-use-with-mongodb)
+
+- [Implementation of ObjectID generator in golang driver](https://github.com/mongodb/mongo-go-driver/blob/v0.0.18/bson/objectid/objectid.go#L40)
+- [Generating globally unique identifiers for use with MongoDB](https://www.mongodb.com/blog/post/generating-globally-unique-identifiers-for-use-with-mongodb)
diff --git a/server/backend/database/mongo/indexes.go b/server/backend/database/mongo/indexes.go
index 578c8bf38..2af28b84a 100644
--- a/server/backend/database/mongo/indexes.go
+++ b/server/backend/database/mongo/indexes.go
@@ -174,8 +174,8 @@ var collectionInfos = []collectionInfo{
 		name: ColVersionVectors,
 		indexes: []mongo.IndexModel{{
 			Keys: bsonx.Doc{
+				{Key: "doc_id", Value: bsonx.Int32(1)}, // shard key
 				{Key: "project_id", Value: bsonx.Int32(1)},
-				{Key: "doc_id", Value: bsonx.Int32(1)},
 				{Key: "client_id", Value: bsonx.Int32(1)},
 			},
 			Options: options.Index().SetUnique(true),