From e0b9ea741a57ac16a4b43f1f101dbe2f66da9da3 Mon Sep 17 00:00:00 2001 From: Vadzim Ramanenka Date: Wed, 20 Sep 2023 18:27:31 +0200 Subject: [PATCH] CXP-2809: Extend and improve integration tests of kafka connect s3 --- .../kafka/connect/s3/sink/S3SinkTask.java | 6 +- .../kafka/connect/s3/S3SinkConnectorIT.java | 235 +++++++++++++++++- 2 files changed, 230 insertions(+), 11 deletions(-) diff --git a/sink/src/main/java/com/spredfast/kafka/connect/s3/sink/S3SinkTask.java b/sink/src/main/java/com/spredfast/kafka/connect/s3/sink/S3SinkTask.java index b2c8cd1..8de9ad2 100644 --- a/sink/src/main/java/com/spredfast/kafka/connect/s3/sink/S3SinkTask.java +++ b/sink/src/main/java/com/spredfast/kafka/connect/s3/sink/S3SinkTask.java @@ -282,12 +282,12 @@ public boolean shouldFlush() { long timeSinceFirstRecordProduced = now - firstRecord.timestamp(); long timeSinceLastRecordReceived = now - lastRecordReceiveTime; - boolean doWallClockFlush = + boolean doWallTimeFlush = flushIntervalMs != -1 && timeSinceFirstRecordProduced >= (flushIntervalMs + gracePeriodMs) && timeSinceLastRecordReceived > gracePeriodMs; - if (doWallClockFlush) { - log.debug("{} performing a wall clock flush on {}", name(), tp); + if (doWallTimeFlush) { + log.debug("{} performing a wall time flush on {}", name(), tp); return true; } diff --git a/sink/src/test/java/com/spredfast/kafka/connect/s3/S3SinkConnectorIT.java b/sink/src/test/java/com/spredfast/kafka/connect/s3/S3SinkConnectorIT.java index 8baf1f8..8935b24 100644 --- a/sink/src/test/java/com/spredfast/kafka/connect/s3/S3SinkConnectorIT.java +++ b/sink/src/test/java/com/spredfast/kafka/connect/s3/S3SinkConnectorIT.java @@ -2,6 +2,7 @@ import static net.mguenther.kafka.junit.ObserveKeyValues.on; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import com.amazonaws.SdkClientException; import com.amazonaws.auth.AWSStaticCredentialsProvider; @@ -9,6 +10,7 @@ import com.amazonaws.client.builder.AwsClientBuilder; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import com.amazonaws.services.s3.model.DeleteObjectsRequest; import com.amazonaws.services.s3.model.S3Object; import io.debezium.testing.testcontainers.Connector.State; import io.debezium.testing.testcontainers.ConnectorConfiguration; @@ -23,6 +25,8 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -34,6 +38,14 @@ import net.mguenther.kafka.junit.TopicConfig; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.lang.StringUtils; +import org.apache.kafka.clients.admin.Admin; +import org.apache.kafka.clients.admin.AdminClientConfig; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.serialization.StringSerializer; import org.awaitility.Awaitility; import org.junit.AfterClass; import org.junit.Before; @@ -86,6 +98,9 @@ public class S3SinkConnectorIT { private ExternalKafkaCluster kafkaCluster = ExternalKafkaCluster.at(kafkaContainer.getBootstrapServers()); + private Admin kafkaAdmin; + private KafkaProducer 
<String, String> kafkaProducer;
+
 @BeforeClass public static void startContainers() { Startables.deepStart(Stream.of(kafkaContainer, localStackContainer, kafkaConnectContainer)) @@ -107,6 +122,23 @@ public void setup() { .build(); kafkaCluster = ExternalKafkaCluster.at(kafkaContainer.getBootstrapServers());
+ kafkaAdmin =
+ Admin.create(
+ Map.of(
+ AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, kafkaContainer.getBootstrapServers()));
+ kafkaProducer =
+ new KafkaProducer<>(
+ Map.of(
+ ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,
+ kafkaContainer.getBootstrapServers(),
+ ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
+ StringSerializer.class,
+ ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
+ StringSerializer.class,
+ ProducerConfig.ACKS_CONFIG,
+ "all",
+ ProducerConfig.RETRIES_CONFIG,
+ "1")); }
 @AfterClass @@ -134,6 +166,8 @@ public void testSinkWithGzipFileThreshold() throws Exception { kafkaConnectContainer.registerConnector(connectorName, connectorConfiguration); kafkaConnectContainer.ensureConnectorTaskState(connectorName, 0, State.RUNNING);
+ assertEquals(-1, getConsumerOffset("connect-" + connectorName, topicName, 0));
+
 // Produce messages to the topic (uncompressed: 3190, compressed: ~340) StringBuilder sb = new StringBuilder(); for (int i = 0; i < 95; i++) { @@ -176,6 +210,10 @@ public void testSinkWithGzipFileThreshold() throws Exception { objectContents = getS3FileOutput(bucketName, indexObjectKey); assertEquals(expectedIndexObjectContents, objectContents);
+ Awaitility.await()
+ .atMost(60, TimeUnit.SECONDS)
+ .until(() -> getConsumerOffset("connect-" + connectorName, topicName, 0) == 95);
+
 // It's not enough to trigger another flush for (int i = 100; i < 150; i++) { kafkaCluster.send( @@ -191,6 +229,9 @@ public void testSinkWithGzipFileThreshold() throws Exception { .atMost(30, TimeUnit.SECONDS) .until(() -> !objectKeyExists(bucketName, indexObjectKey2));
+ // Make sure the committed offset didn't change as there were no new file uploads
+ assertEquals(95, getConsumerOffset("connect-" + connectorName, topicName, 0));
+
 // Delete the connector deleteConnector(connectorName); } @@ -207,6 +248,7 @@ public void testSinkWithBigFlushInterval() throws Exception { kafkaCluster.createTopic(TopicConfig.withName(topicName).withNumberOfPartitions(1)); // Define, register, and start the connector
+ final long flushIntervalMs = 12 * 3600 * 1000;
 ConnectorConfiguration connectorConfiguration = getConnectorConfiguration( connectorName, @@ -215,31 +257,60 @@ public void testSinkWithBigFlushInterval() throws Exception { topicName, 67108864, 100 * 1024 * 1024,
- 12 * 3600 * 1000);
+ flushIntervalMs);
 kafkaConnectContainer.registerConnector(connectorName, connectorConfiguration); kafkaConnectContainer.ensureConnectorTaskState(connectorName, 0, State.RUNNING); // Produce messages to the topic
+ StringBuilder sb = new StringBuilder();
+ long ts = System.currentTimeMillis();
 for (int i = 0; i < 100; i++) { String json = String.format("{{\"foo\": \"bar\", \"counter\": %d}}", i);
- kafkaCluster.send(SendValues.to(topicName, json));
+ kafkaProducer.send(new ProducerRecord<>(topicName, 0, ts, "", json));
+ sb.append(json).append("\n");
 } LocalDate today = LocalDate.now(ZoneOffset.UTC); // Define the expected files
- List<String> expectedObjects =
- List.of(
- String.format("%s/%s/%s-00000-000000000000.index.json", prefix, today, topicName),
- String.format("%s/%s/%s-00000-000000000000.gz", prefix, today, topicName),
- String.format("%s/last_chunk_index.%s-00000.txt", prefix, topicName));
+ String dataObjectKey =
String.format("%s/%s/%s-00000-000000000000.gz", prefix, today, topicName); + String indexObjectKey = + String.format("%s/%s/%s-00000-000000000000.index.json", prefix, today, topicName); + String lastChunkIndexObjectKey = + String.format("%s/last_chunk_index.%s-00000.txt", prefix, topicName); + List expectedObjects = List.of(indexObjectKey, dataObjectKey, lastChunkIndexObjectKey); - // Await for all the files to be produced by all tasks + // Make sure no files are produced yet as the timestamp based flushing threshold is not reached Awaitility.await() .pollDelay(5, TimeUnit.SECONDS) .during(20, TimeUnit.SECONDS) .atMost(30, TimeUnit.SECONDS) .until(() -> expectedObjects.stream().noneMatch(key -> objectKeyExists(bucketName, key))); + assertEquals(-1, getConsumerOffset("connect-" + connectorName, topicName, 0)); + + ts += flushIntevalMs + 60 * 1000; + for (int i = 100; i < 200; i++) { + String json = String.format("{{\"foo\": \"bar\", \"counter\": %d}}", i); + kafkaProducer.send(new ProducerRecord<>(topicName, 0, ts, "", json)); + } + + // Await for all the files to be produced by all tasks + Awaitility.await() + .atMost(30, TimeUnit.SECONDS) + .until(() -> expectedObjects.stream().allMatch(key -> objectKeyExists(bucketName, key))); + + // Check the data object contains expected records + // in particular, it shouldn't contain the record that triggered flushing + String objectContents = getS3FileOutput(bucketName, dataObjectKey); + assertEquals(sb.toString(), objectContents); + + // Check that the committed offset corresponds to the message offset that triggered timestamp + // based flushing + Awaitility.await() + .atMost(30, TimeUnit.SECONDS) + .until(() -> getConsumerOffset("connect-" + connectorName, topicName, 0) == 100); + // Delete the connector deleteConnector(connectorName); } @@ -404,6 +475,143 @@ public void testSinkWithRestart() throws Exception { deleteConnector(connectorName); } + @Test + public void testSinkWithWallTimeFlushingAndRewind() throws Exception { + String bucketName = "connect-system-test-wall-time"; + String prefix = "systest"; + String topicName = "system_test_wall_time"; + String connectorName = "s3-sink-wall-time"; + s3.createBucket(bucketName); + + // Create the test topic + kafkaCluster.createTopic(TopicConfig.withName(topicName).withNumberOfPartitions(1)); + + // Define, register, and start the connector + final long flushIntevalMs = 30 * 1000; + ConnectorConfiguration connectorConfiguration = + getConnectorConfiguration( + connectorName, + bucketName, + prefix, + topicName, + 67108864, + 100 * 1024 * 1024, + flushIntevalMs); + kafkaConnectContainer.registerConnector(connectorName, connectorConfiguration); + kafkaConnectContainer.ensureConnectorTaskState(connectorName, 0, State.RUNNING); + + assertEquals(-1, getConsumerOffset("connect-" + connectorName, topicName, 0)); + + // Produce messages to the topic + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 100; i++) { + String json = String.format("{{\"foo\": \"bar\", \"counter\": %d}}", i); + kafkaCluster.send(SendValues.to(topicName, json)); + sb.append(json).append("\n"); + } + String firstObjectExpectedContent = sb.toString(); + LocalDate today = LocalDate.now(ZoneOffset.UTC); + + // Define the expected files + String dataObjectKey = + String.format("%s/%s/%s-00000-000000000000.gz", prefix, today, topicName); + String indexObjectKey = + String.format("%s/%s/%s-00000-000000000000.index.json", prefix, today, topicName); + String lastChunkIndexObjectKey = + 
String.format("%s/last_chunk_index.%s-00000.txt", prefix, topicName); + List expectedObjects = List.of(indexObjectKey, dataObjectKey, lastChunkIndexObjectKey); + + // Make sure files are produced using wall time flushing + Awaitility.await() + .atMost(flushIntevalMs * 3, TimeUnit.MILLISECONDS) + .until(() -> expectedObjects.stream().allMatch(key -> objectKeyExists(bucketName, key))); + + assertEquals(100, getConsumerOffset("connect-" + connectorName, topicName, 0)); + + String objectContents = getS3FileOutput(bucketName, dataObjectKey); + assertEquals(firstObjectExpectedContent, objectContents); + + sb = new StringBuilder(); + for (int i = 100; i < 200; i++) { + String json = String.format("{{\"foo\": \"bar\", \"counter\": %d}}", i); + kafkaCluster.send(SendValues.to(topicName, json)); + sb.append(json).append("\n"); + } + String secondObjectExpectedContent = sb.toString(); + + String dataObjectKey2 = + String.format("%s/%s/%s-00000-000000000100.gz", prefix, today, topicName); + String indexObjectKey2 = + String.format("%s/%s/%s-00000-000000000100.index.json", prefix, today, topicName); + List expectedObjects2 = + List.of(indexObjectKey2, dataObjectKey2, lastChunkIndexObjectKey); + + // Make sure files are produced using wall time flushing + Awaitility.await() + .atMost(flushIntevalMs * 3, TimeUnit.MILLISECONDS) + .until(() -> expectedObjects2.stream().allMatch(key -> objectKeyExists(bucketName, key))); + + assertEquals(200, getConsumerOffset("connect-" + connectorName, topicName, 0)); + + objectContents = getS3FileOutput(bucketName, dataObjectKey2); + assertEquals(secondObjectExpectedContent, objectContents); + + deleteConnector(connectorName); + + for (int i = 200; i < 300; i++) { + String json = String.format("{{\"foo\": \"bar\", \"counter\": %d}}", i); + kafkaCluster.send(SendValues.to(topicName, json)); + } + + s3.deleteObjects( + new DeleteObjectsRequest(bucketName) + .withKeys( + lastChunkIndexObjectKey, + dataObjectKey, + indexObjectKey, + dataObjectKey2, + indexObjectKey2)); + s3.deleteBucket(bucketName); + s3.createBucket(bucketName); + + assertTrue(expectedObjects.stream().noneMatch(key -> objectKeyExists(bucketName, key))); + assertTrue(expectedObjects2.stream().noneMatch(key -> objectKeyExists(bucketName, key))); + + // Rewind the consumer to the first record + kafkaAdmin + .alterConsumerGroupOffsets( + "connect-" + connectorName, + Map.of(new TopicPartition(topicName, 0), new OffsetAndMetadata(0))) + .all() + .get(); + + assertEquals(0, getConsumerOffset("connect-" + connectorName, topicName, 0)); + + kafkaConnectContainer.registerConnector(connectorName, connectorConfiguration); + kafkaConnectContainer.ensureConnectorTaskState(connectorName, 0, State.RUNNING); + + // Check files are produced again shortly after the connector restart + Awaitility.await() + .atMost(5, TimeUnit.SECONDS) + .until( + () -> + Stream.concat(expectedObjects.stream(), expectedObjects2.stream()) + .allMatch(key -> objectKeyExists(bucketName, key))); + + Awaitility.await() + .atMost(30, TimeUnit.SECONDS) + .until(() -> getConsumerOffset("connect-" + connectorName, topicName, 0) == 200); + + // Check same files with same content are produced again + objectContents = getS3FileOutput(bucketName, dataObjectKey); + assertEquals(firstObjectExpectedContent, objectContents); + + objectContents = getS3FileOutput(bucketName, dataObjectKey2); + assertEquals(secondObjectExpectedContent, objectContents); + + deleteConnector(connectorName); + } + @Test public void testSinkWithBinaryFormat() throws Exception { String 
bucketName = "connect-system-test"; @@ -548,6 +756,17 @@ private boolean objectKeyExists(String bucketName, String objectKey) { return returnValue; } + private long getConsumerOffset(String groupId, String topic, int partition) + throws ExecutionException, InterruptedException { + var offset = + kafkaAdmin + .listConsumerGroupOffsets(groupId) + .partitionsToOffsetAndMetadata() + .get() + .get(new TopicPartition(topic, partition)); + return offset == null ? -1 : offset.offset(); + } + private String getS3FileOutput(String bucketName, String objectKey) throws IOException { StringBuilder sb = new StringBuilder(); S3Object s3Object = s3.getObject(bucketName, objectKey);