diff --git a/CHANGES.md b/CHANGES.md index 93353245042a..20f3ea3a5ec2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -69,6 +69,7 @@ * Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). * Upgraded GoogleAdsAPI to v19 for GoogleAdsIO (Java) ([#34497](https://github.com/apache/beam/pull/34497)). Changed PTransform method from version-specified (`v17()`) to `current()` for better backward compatibility in the future. +* Added support for writing to Pubsub with ordering keys (Java) ([#21162](https://github.com/apache/beam/issues/21162)) ## New Features / Improvements diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java index 6ff804590a85..66cabecb695b 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java @@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.beam.sdk.io.gcp.pubsub.PubsubIO.ENABLE_CUSTOM_PUBSUB_SINK; import static org.apache.beam.sdk.io.gcp.pubsub.PubsubIO.ENABLE_CUSTOM_PUBSUB_SOURCE; +import static org.apache.beam.sdk.options.ExperimentalOptions.hasExperiment; import static org.apache.beam.sdk.util.CoderUtils.encodeToByteArray; import static org.apache.beam.sdk.util.SerializableUtils.serializeToByteArray; import static org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString; @@ -2150,6 +2151,15 @@ private static void translate( PubsubUnboundedSink overriddenTransform, StepTranslationContext stepContext, PCollection input) { + if (overriddenTransform.getPublishBatchWithOrderingKey()) { + throw new UnsupportedOperationException( + String.format( + "The DataflowRunner does not currently support publishing to Pubsub with ordering keys. " + + "%s is required to support publishing with ordering keys. " + + "Set the pipeline option --experiments=%s to use this PTransform. " + + "See https://issuetracker.google.com/issues/200955424 for current status.", + PubsubUnboundedSink.class.getSimpleName(), ENABLE_CUSTOM_PUBSUB_SINK)); + } stepContext.addInput(PropertyNames.FORMAT, "pubsub"); if (overriddenTransform.getTopicProvider() != null) { if (overriddenTransform.getTopicProvider().isAccessible()) { diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java index 591a83600561..cf7ad3de7b7b 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java @@ -829,7 +829,7 @@ public Row attachValues(List<@Nullable Object> attachedValues) { return new RowWithStorage(schema, attachedValues); } - public Row attachValues(Object... values) { + public Row attachValues(@Nullable Object... values) { return attachValues(Arrays.asList(values)); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index fdfa4e8d0a3e..fb096e382994 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -29,19 +29,25 @@ import org.apache.beam.sdk.transforms.windowing.PaneInfo; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.ValueInSingleWindow; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class PreparePubsubWriteDoFn extends DoFn { + private static final Logger LOG = LoggerFactory.getLogger(PreparePubsubWriteDoFn.class); // See https://cloud.google.com/pubsub/quotas#resource_limits. private static final int PUBSUB_MESSAGE_DATA_MAX_BYTES = 10 << 20; private static final int PUBSUB_MESSAGE_MAX_ATTRIBUTES = 100; private static final int PUBSUB_MESSAGE_ATTRIBUTE_MAX_KEY_BYTES = 256; private static final int PUBSUB_MESSAGE_ATTRIBUTE_MAX_VALUE_BYTES = 1024; + private static final int ORDERING_KEY_MAX_BYTE_SIZE = 1024; // The amount of bytes that each attribute entry adds up to the request private static final int PUBSUB_MESSAGE_ATTRIBUTE_ENCODE_ADDITIONAL_BYTES = 6; + private final boolean usesOrderingKey; private int maxPublishBatchSize; - + private boolean logOrderingKeyUnconfigured = false; private SerializableFunction, PubsubMessage> formatFunction; @Nullable SerializableFunction, PubsubIO.PubsubTopic> topicFunction; /** Last TopicPath that reported Lineage. */ @@ -66,6 +72,20 @@ static int validatePubsubMessageSize(PubsubMessage message, int maxPublishBatchS } int totalSize = payloadSize; + @Nullable String orderingKey = message.getOrderingKey(); + if (orderingKey != null) { + int orderingKeySize = orderingKey.getBytes(StandardCharsets.UTF_8).length; + if (orderingKeySize > ORDERING_KEY_MAX_BYTE_SIZE) { + throw new SizeLimitExceededException( + "Pubsub message ordering key of length " + + orderingKeySize + + " exceeds maximum of " + + ORDERING_KEY_MAX_BYTE_SIZE + + " bytes. See https://cloud.google.com/pubsub/quotas#resource_limits"); + } + totalSize += orderingKeySize; + } + @Nullable Map attributes = message.getAttributeMap(); if (attributes != null) { if (attributes.size() > PUBSUB_MESSAGE_MAX_ATTRIBUTES) { @@ -125,12 +145,14 @@ static int validatePubsubMessageSize(PubsubMessage message, int maxPublishBatchS SerializableFunction, PubsubMessage> formatFunction, @Nullable SerializableFunction, PubsubIO.PubsubTopic> topicFunction, + boolean usesOrderingKey, int maxPublishBatchSize, BadRecordRouter badRecordRouter, Coder inputCoder, TupleTag outputTag) { this.formatFunction = formatFunction; this.topicFunction = topicFunction; + this.usesOrderingKey = usesOrderingKey; this.maxPublishBatchSize = maxPublishBatchSize; this.badRecordRouter = badRecordRouter; this.inputCoder = inputCoder; @@ -179,6 +201,16 @@ public void process( null); reportedLineage = topic; } + if (!usesOrderingKey && !Strings.isNullOrEmpty(message.getOrderingKey())) { + if (!logOrderingKeyUnconfigured) { + LOG.warn( + "Encountered Pubsub message with ordering key but this sink was not configured to " + + "retain ordering keys, so they will be dropped. Please set #withOrderingKeys()."); + + logOrderingKeyUnconfigured = true; + } + message = message.withOrderingKey(null); + } try { validatePubsubMessageSize(message, maxPublishBatchSize); } catch (SizeLimitExceededException e) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index ebc9122e3c35..3c08bcbf2819 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -22,7 +22,6 @@ import com.google.api.client.util.Clock; import com.google.auto.value.AutoValue; -import com.google.protobuf.ByteString; import com.google.protobuf.Descriptors.Descriptor; import com.google.protobuf.DynamicMessage; import com.google.protobuf.InvalidProtocolBufferException; @@ -1378,6 +1377,8 @@ public abstract static class Write extends PTransform, PDone> abstract @Nullable String getPubsubRootUrl(); + abstract boolean getPublishWithOrderingKey(); + abstract BadRecordRouter getBadRecordRouter(); abstract ErrorHandler getBadRecordErrorHandler(); @@ -1393,6 +1394,7 @@ static Builder newBuilder( builder.setFormatFn(formatFn); builder.setBadRecordRouter(BadRecordRouter.THROWING_ROUTER); builder.setBadRecordErrorHandler(new DefaultErrorHandler<>()); + builder.setPublishWithOrderingKey(false); builder.setValidate(false); return builder; } @@ -1425,6 +1427,8 @@ abstract Builder setFormatFn( abstract Builder setPubsubRootUrl(String pubsubRootUrl); + abstract Builder setPublishWithOrderingKey(boolean publishWithOrderingKey); + abstract Builder setBadRecordRouter(BadRecordRouter badRecordRouter); abstract Builder setBadRecordErrorHandler( @@ -1510,6 +1514,19 @@ public Write withMaxBatchBytesSize(int maxBatchBytesSize) { return toBuilder().setMaxBatchBytesSize(maxBatchBytesSize).build(); } + /** + * Writes to Pub/Sub with each record's ordering key. A subscription with message ordering + * enabled will receive messages published in the same region with the same ordering key in the + * order in which they were received by the service. Note that the order in which Beam publishes + * records to the service remains unspecified. + * + * @see Pub/Sub documentation on message + * ordering + */ + public Write withOrderingKey() { + return toBuilder().setPublishWithOrderingKey(true).build(); + } + /** * Writes to Pub/Sub and adds each record's timestamp to the published messages in an attribute * with the specified name. The value of the attribute will be a number representing the number @@ -1586,6 +1603,7 @@ public PDone expand(PCollection input) { new PreparePubsubWriteDoFn<>( getFormatFn(), topicFunction, + getPublishWithOrderingKey(), maxMessageSize, getBadRecordRouter(), input.getCoder(), @@ -1597,8 +1615,12 @@ public PDone expand(PCollection input) { pubsubMessageTuple .get(BAD_RECORD_TAG) .setCoder(BadRecord.getCoder(input.getPipeline()))); - PCollection pubsubMessages = - pubsubMessageTuple.get(pubsubMessageTupleTag).setCoder(PubsubMessageWithTopicCoder.of()); + PCollection pubsubMessages = pubsubMessageTuple.get(pubsubMessageTupleTag); + if (getPublishWithOrderingKey()) { + pubsubMessages.setCoder(PubsubMessageSchemaCoder.getSchemaCoder()); + } else { + pubsubMessages.setCoder(PubsubMessageWithTopicCoder.of()); + } switch (input.isBounded()) { case BOUNDED: pubsubMessages.apply( @@ -1618,6 +1640,7 @@ public PDone expand(PCollection input) { getTimestampAttribute(), getIdAttribute(), 100 /* numShards */, + getPublishWithOrderingKey(), MoreObjects.firstNonNull( getMaxBatchSize(), PubsubUnboundedSink.DEFAULT_PUBLISH_BATCH_SIZE), MoreObjects.firstNonNull( @@ -1679,7 +1702,9 @@ private class OutgoingData { } } - private transient Map output; + // NOTE: A single publish request may only write to one ordering key. + // See https://cloud.google.com/pubsub/docs/publisher#using-ordering-keys for details. + private transient Map, OutgoingData> output; private transient PubsubClient pubsubClient; @@ -1710,51 +1735,47 @@ public void startBundle(StartBundleContext c) throws IOException { public void processElement(@Element PubsubMessage message, @Timestamp Instant timestamp) throws IOException, SizeLimitExceededException { // Validate again here just as a sanity check. + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 + // - Size validation makes no distinction between JSON and Protobuf encoding + // - Accounting for HTTP to gRPC transcoding is non-trivial PreparePubsubWriteDoFn.validatePubsubMessageSize(message, maxPublishBatchByteSize); - byte[] payload = message.getPayload(); - int messageSize = payload.length; - - PubsubTopic pubsubTopic; + // NOTE: The record id is always null since it will be assigned by Pub/Sub. + final OutgoingMessage msg = + OutgoingMessage.of(message, timestamp.getMillis(), null, message.getTopic()); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 + // - Size validation makes no distinction between JSON and Protobuf encoding + // - Accounting for HTTP to gRPC transcoding is non-trivial + final int messageSize = msg.getMessage().getData().size(); + + final PubsubTopic pubsubTopic; if (getTopicProvider() != null) { pubsubTopic = getTopicProvider().get(); } else { - pubsubTopic = - PubsubTopic.fromPath(Preconditions.checkArgumentNotNull(message.getTopic())); - } - // Checking before adding the message stops us from violating max batch size or bytes - OutgoingData currentTopicOutput = - output.computeIfAbsent(pubsubTopic, t -> new OutgoingData()); - if (currentTopicOutput.messages.size() >= maxPublishBatchSize - || (!currentTopicOutput.messages.isEmpty() - && (currentTopicOutput.bytes + messageSize) >= maxPublishBatchByteSize)) { - publish(pubsubTopic, currentTopicOutput.messages); - currentTopicOutput.messages.clear(); - currentTopicOutput.bytes = 0; + pubsubTopic = PubsubTopic.fromPath(Preconditions.checkArgumentNotNull(msg.topic())); } - Map attributes = message.getAttributeMap(); - String orderingKey = message.getOrderingKey(); - - com.google.pubsub.v1.PubsubMessage.Builder msgBuilder = - com.google.pubsub.v1.PubsubMessage.newBuilder() - .setData(ByteString.copyFrom(payload)) - .putAllAttributes(attributes); - - if (orderingKey != null) { - msgBuilder.setOrderingKey(orderingKey); + // Checking before adding the message stops us from violating max batch size or bytes + String orderingKey = getPublishWithOrderingKey() ? msg.getMessage().getOrderingKey() : ""; + final OutgoingData currentTopicAndOrderingKeyOutput = + output.computeIfAbsent(KV.of(pubsubTopic, orderingKey), t -> new OutgoingData()); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 + if (currentTopicAndOrderingKeyOutput.messages.size() >= maxPublishBatchSize + || (!currentTopicAndOrderingKeyOutput.messages.isEmpty() + && (currentTopicAndOrderingKeyOutput.bytes + messageSize) + >= maxPublishBatchByteSize)) { + publish(pubsubTopic, currentTopicAndOrderingKeyOutput.messages); + currentTopicAndOrderingKeyOutput.messages.clear(); + currentTopicAndOrderingKeyOutput.bytes = 0; } - // NOTE: The record id is always null. - currentTopicOutput.messages.add( - OutgoingMessage.of( - msgBuilder.build(), timestamp.getMillis(), null, message.getTopic())); - currentTopicOutput.bytes += messageSize; + currentTopicAndOrderingKeyOutput.messages.add(msg); + currentTopicAndOrderingKeyOutput.bytes += messageSize; } @FinishBundle public void finishBundle() throws IOException { - for (Map.Entry entry : output.entrySet()) { - publish(entry.getKey(), entry.getValue().messages); + for (Map.Entry, OutgoingData> entry : output.entrySet()) { + publish(entry.getKey().getKey(), entry.getValue().messages); } output = null; pubsubClient.close(); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java index a12c64ff9a9b..f36e94360996 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java @@ -226,11 +226,11 @@ public List pull( com.google.pubsub.v1.PubsubMessage.newBuilder(); protoMessage.setData(ByteString.copyFrom(elementBytes)); protoMessage.putAllAttributes(attributes); - // PubsubMessage uses `null` to represent no ordering key where we want a default of "". + // {@link PubsubMessage} uses `null` or empty string to represent no ordering key. + // {@link com.google.pubsub.v1.PubsubMessage} does not track string field presence and uses + // empty string as a default. if (pubsubMessage.getOrderingKey() != null) { protoMessage.setOrderingKey(pubsubMessage.getOrderingKey()); - } else { - protoMessage.setOrderingKey(""); } incomingMessages.add( IncomingMessage.of( diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.java index d57ac3d80207..779a9b847874 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.java @@ -112,6 +112,16 @@ public byte[] getPayload() { return impl.getMessageId(); } + public PubsubMessage withOrderingKey(@Nullable String orderingKey) { + return new PubsubMessage( + Impl.create( + impl.getTopic(), + impl.getPayload(), + impl.getAttributeMap(), + impl.getMessageId(), + orderingKey)); + } + /** Returns the ordering key of the message. */ public @Nullable String getOrderingKey() { return impl.getOrderingKey(); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java new file mode 100644 index 000000000000..e3df56a98185 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.gcp.pubsub; + +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** + * Provides a {@link SchemaCoder} for {@link PubsubMessage}, including the topic and all fields of a + * PubSub message from server. + * + *

{@link SchemaCoder} is used so that fields can be added in the future without breaking update + * compatibility. Maintainers should prefer this coder when adding new features to {@link PubsubIO}. + */ +public class PubsubMessageSchemaCoder { + // NOTE: Fields must not be reordered. + private static final Schema PUBSUB_MESSAGE_SCHEMA = + Schema.builder() + .addByteArrayField("payload") + .addNullableStringField("topic") + .addNullableMapField("attributes", Schema.FieldType.STRING, Schema.FieldType.STRING) + .addNullableStringField("message_id") + .addNullableStringField("ordering_key") + .build(); + + private static final SerializableFunction TO_ROW = + (PubsubMessage message) -> { + // NOTE: The row's value attachment order must match the schema's definition order. + return Row.withSchema(PUBSUB_MESSAGE_SCHEMA) + .attachValues( + message.getPayload(), + message.getTopic(), + message.getAttributeMap(), + message.getMessageId(), + message.getOrderingKey()); + }; + + private static final SerializableFunction FROM_ROW = + (Row row) -> { + PubsubMessage message = + new PubsubMessage( + Preconditions.checkNotNull(row.getBytes("payload")), + row.getMap("attributes"), + row.getString("message_id"), + row.getString("ordering_key")); + + @Nullable String topic = row.getString("topic"); + if (topic != null) { + message = message.withTopic(topic); + } + return message; + }; + + public static SchemaCoder getSchemaCoder() { + return SchemaCoder.of( + PUBSUB_MESSAGE_SCHEMA, TypeDescriptor.of(PubsubMessage.class), TO_ROW, FROM_ROW); + } +} diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdAndOrderingKeyCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdAndOrderingKeyCoder.java index 7c2a4250e87c..968d76584bc3 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdAndOrderingKeyCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdAndOrderingKeyCoder.java @@ -31,7 +31,12 @@ import org.apache.beam.sdk.extensions.protobuf.ProtoCoder; import org.apache.beam.sdk.values.TypeDescriptor; -/** A coder for PubsubMessage including all fields of a PubSub message from server. */ +/** + * A coder for PubsubMessage including all fields of a PubSub message from server. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. + */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdCoder.java index f6fa8a0cc973..803ff83583ce 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdCoder.java @@ -29,7 +29,12 @@ import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.values.TypeDescriptor; -/** A coder for PubsubMessage including attributes and the message id from the PubSub server. */ +/** + * A coder for PubsubMessage including attributes and the message id from the PubSub server. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. + */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesCoder.java index 3be321891d33..9dd3ab56a72a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesCoder.java @@ -29,7 +29,12 @@ import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.values.TypeDescriptor; -/** A coder for PubsubMessage including attributes. */ +/** + * A coder for PubsubMessage including attributes. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. + */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java index 95bd43c53a66..7016146b3671 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java @@ -29,6 +29,9 @@ /** * A coder for PubsubMessage treating the raw bytes being decoded as the message's payload, with the * message id from the PubSub server. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java index 768aebe54e65..c3270625fce1 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java @@ -30,7 +30,12 @@ import org.apache.beam.sdk.values.TypeDescriptor; import org.checkerframework.checker.nullness.qual.Nullable; -/** A coder for PubsubMessage including the topic from the PubSub server. */ +/** + * A coder for PubsubMessage including the topic from the PubSub server. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. + */ public class PubsubMessageWithTopicCoder extends CustomCoder { // A message's payload cannot be null private static final Coder PAYLOAD_CODER = ByteArrayCoder.of(); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java index 3d5a879fce15..7bf342eee8c6 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java @@ -32,6 +32,7 @@ import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.checkerframework.checker.nullness.qual.Nullable; /** @@ -304,12 +305,23 @@ private static void activate(Runnable setStateValues) { private static void deactivate(Runnable runFinalChecks) { synchronized (STATE) { checkState(STATE.isActive, "No test still in flight"); - runFinalChecks.run(); - STATE.remainingExpectedOutgoingMessages = null; - STATE.remainingPendingIncomingMessages = null; - STATE.pendingAckIncomingMessages = null; - STATE.ackDeadline = null; - STATE.isActive = false; + try { + runFinalChecks.run(); + } finally { + STATE.isPublish = false; + STATE.expectedTopic = null; + STATE.remainingExpectedOutgoingMessages = null; + STATE.remainingFailingOutgoingMessages = null; + STATE.clock = null; + STATE.expectedSubscription = null; + STATE.ackTimeoutSec = 0; + STATE.remainingPendingIncomingMessages = null; + STATE.pendingAckIncomingMessages = null; + STATE.ackDeadline = null; + STATE.expectedSchemaPath = null; + STATE.expectedSchema = null; + STATE.isActive = false; + } } } @@ -461,7 +473,12 @@ public int publish(TopicPath topic, List outgoingMessages) thro topic, STATE.expectedTopic); } + @MonotonicNonNull String batchOrderingKey = null; for (OutgoingMessage outgoingMessage : outgoingMessages) { + if (batchOrderingKey == null) { + batchOrderingKey = outgoingMessage.getMessage().getOrderingKey(); + } + checkState(outgoingMessage.getMessage().getOrderingKey().equals(batchOrderingKey)); if (isDynamic) { checkState(outgoingMessage.topic().equals(topic.getPath())); } else { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index 00d563772bfd..3fe7d51aec1e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -23,8 +23,11 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.concurrent.ThreadLocalRandom; import org.apache.beam.sdk.coders.AtomicCoder; @@ -69,7 +72,9 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.hash.Hashing; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; import org.joda.time.Instant; @@ -201,8 +206,17 @@ public void processElement( break; } + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31828 + // NOTE: Null and empty ordering keys are treated as equivalent. @Nullable String topic = dynamicTopicFn.apply(element); - K key = keyFunction.apply(ThreadLocalRandom.current().nextInt(numShards), topic); + @Nullable String orderingKey = message.getOrderingKey(); + int shard = + Strings.isNullOrEmpty(orderingKey) + ? ThreadLocalRandom.current().nextInt(numShards) + : Hashing.consistentHash( + Hashing.farmHashFingerprint64().hashString(orderingKey, StandardCharsets.UTF_8), + numShards); + K key = keyFunction.apply(shard, topic); o.output(KV.of(key, OutgoingMessage.of(message, timestampMsSinceEpoch, recordId, topic))); } @@ -220,11 +234,20 @@ public void populateDisplayData(DisplayData.Builder builder) { /** Publish messages to Pubsub in batches. */ @VisibleForTesting static class WriterFn extends DoFn, Void> { + private static class OutgoingData { + int bytes; + List messages; + + OutgoingData() { + this.bytes = 0; + this.messages = new ArrayList<>(); + } + } + private final PubsubClientFactory pubsubFactory; private final @Nullable ValueProvider topic; private final String timestampAttribute; private final String idAttribute; - private final int publishBatchSize; private final int publishBatchBytes; private final String pubsubRootUrl; @@ -247,7 +270,6 @@ static class WriterFn extends DoFn, Void> { this.topic = topic; this.timestampAttribute = timestampAttribute; this.idAttribute = idAttribute; - this.publishBatchSize = publishBatchSize; this.publishBatchBytes = publishBatchBytes; this.pubsubRootUrl = null; } @@ -257,14 +279,12 @@ static class WriterFn extends DoFn, Void> { @Nullable ValueProvider topic, String timestampAttribute, String idAttribute, - int publishBatchSize, int publishBatchBytes, String pubsubRootUrl) { this.pubsubFactory = pubsubFactory; this.topic = topic; this.timestampAttribute = timestampAttribute; this.idAttribute = idAttribute; - this.publishBatchSize = publishBatchSize; this.publishBatchBytes = publishBatchBytes; this.pubsubRootUrl = pubsubRootUrl; } @@ -316,27 +336,45 @@ public void startBundle(StartBundleContext c) throws Exception { } @ProcessElement + @SuppressWarnings("ReferenceEquality") public void processElement(ProcessContext c) throws Exception { - List pubsubMessages = new ArrayList<>(publishBatchSize); - int bytes = 0; + // TODO(sjvanrossum): Refactor the write transform so this map can be indexed with topic + + // ordering key and have bundle scoped lifetime. + // NOTE: A single publish request may only write to one ordering key. + // See https://cloud.google.com/pubsub/docs/publisher#using-ordering-keys for details. + Map orderingKeyBatches = new HashMap<>(); + @MonotonicNonNull String currentOrderingKey = null; + @Nullable OutgoingData currentBatch = null; for (OutgoingMessage message : c.element()) { - if (!pubsubMessages.isEmpty() - && bytes + message.getMessage().getData().size() > publishBatchBytes) { + String messageOrderingKey = message.getMessage().getOrderingKey(); + if (currentOrderingKey == null || !currentOrderingKey.equals(messageOrderingKey)) { + currentOrderingKey = messageOrderingKey; + currentBatch = orderingKeyBatches.get(currentOrderingKey); + } + if (currentBatch == null) { + currentBatch = new OutgoingData(); + orderingKeyBatches.put(currentOrderingKey, currentBatch); + } else if (currentBatch.bytes + message.getMessage().getData().size() > publishBatchBytes) { + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 + // Break large (in bytes) batches into smaller. // (We've already broken by batch size using the trigger below, though that may // run slightly over the actual PUBLISH_BATCH_SIZE. We'll consider that ok since // the hard limit from Pubsub is by bytes rather than number of messages.) // BLOCKS until published. - publishBatch(pubsubMessages, bytes); - pubsubMessages.clear(); - bytes = 0; + publishBatch(currentBatch.messages, currentBatch.bytes); + currentBatch.messages.clear(); + currentBatch.bytes = 0; } - pubsubMessages.add(message); - bytes += message.getMessage().getData().size(); + currentBatch.messages.add(message); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 + currentBatch.bytes += message.getMessage().getData().size(); } - if (!pubsubMessages.isEmpty()) { - // BLOCKS until published. - publishBatch(pubsubMessages, bytes); + for (OutgoingData batch : orderingKeyBatches.values()) { + if (!batch.messages.isEmpty()) { + // BLOCKS until published. + publishBatch(batch.messages, batch.bytes); + } } } @@ -389,6 +427,12 @@ public void populateDisplayData(DisplayData.Builder builder) { */ private final int numShards; + /** + * Publish messages with an ordering key. Currently unsupported with DataflowRunner's Pubsub sink + * override. + */ + private final boolean publishBatchWithOrderingKey; + /** Maximum number of messages per publish. */ private final int publishBatchSize; @@ -413,6 +457,7 @@ public void populateDisplayData(DisplayData.Builder builder) { String timestampAttribute, String idAttribute, int numShards, + boolean publishBatchWithOrderingKey, int publishBatchSize, int publishBatchBytes, Duration maxLatency, @@ -423,6 +468,7 @@ public void populateDisplayData(DisplayData.Builder builder) { this.timestampAttribute = timestampAttribute; this.idAttribute = idAttribute; this.numShards = numShards; + this.publishBatchWithOrderingKey = publishBatchWithOrderingKey; this.publishBatchSize = publishBatchSize; this.publishBatchBytes = publishBatchBytes; this.maxLatency = maxLatency; @@ -435,13 +481,15 @@ public PubsubUnboundedSink( ValueProvider topic, String timestampAttribute, String idAttribute, - int numShards) { + int numShards, + boolean publishBatchWithOrderingKey) { this( pubsubFactory, topic, timestampAttribute, idAttribute, numShards, + publishBatchWithOrderingKey, DEFAULT_PUBLISH_BATCH_SIZE, DEFAULT_PUBLISH_BATCH_BYTES, DEFAULT_MAX_LATENCY, @@ -455,6 +503,7 @@ public PubsubUnboundedSink( String timestampAttribute, String idAttribute, int numShards, + boolean publishBatchWithOrderingKey, String pubsubRootUrl) { this( pubsubFactory, @@ -462,6 +511,7 @@ public PubsubUnboundedSink( timestampAttribute, idAttribute, numShards, + publishBatchWithOrderingKey, DEFAULT_PUBLISH_BATCH_SIZE, DEFAULT_PUBLISH_BATCH_BYTES, DEFAULT_MAX_LATENCY, @@ -475,6 +525,7 @@ public PubsubUnboundedSink( String timestampAttribute, String idAttribute, int numShards, + boolean publishBatchWithOrderingKey, int publishBatchSize, int publishBatchBytes) { this( @@ -483,6 +534,7 @@ public PubsubUnboundedSink( timestampAttribute, idAttribute, numShards, + publishBatchWithOrderingKey, publishBatchSize, publishBatchBytes, DEFAULT_MAX_LATENCY, @@ -496,6 +548,7 @@ public PubsubUnboundedSink( String timestampAttribute, String idAttribute, int numShards, + boolean publishBatchWithOrderingKey, int publishBatchSize, int publishBatchBytes, String pubsubRootUrl) { @@ -505,6 +558,7 @@ public PubsubUnboundedSink( timestampAttribute, idAttribute, numShards, + publishBatchWithOrderingKey, publishBatchSize, publishBatchBytes, DEFAULT_MAX_LATENCY, @@ -531,6 +585,10 @@ public PubsubUnboundedSink( return idAttribute; } + public boolean getPublishBatchWithOrderingKey() { + return publishBatchWithOrderingKey; + } + @Override public PDone expand(PCollection input) { if (topic != null) { @@ -601,7 +659,6 @@ public PDone expand(PCollection> input) { outer.topic, outer.timestampAttribute, outer.idAttribute, - outer.publishBatchSize, outer.publishBatchBytes, outer.pubsubRootUrl))); return PDone.in(input.getPipeline()); @@ -654,7 +711,6 @@ public PDone expand(PCollection input) { outer.topic, outer.timestampAttribute, outer.idAttribute, - outer.publishBatchSize, outer.publishBatchBytes, outer.pubsubRootUrl))); return PDone.in(input.getPipeline()); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFnTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFnTest.java index 494189d43f36..a125a7b67e69 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFnTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFnTest.java @@ -44,6 +44,19 @@ public void testValidatePubsubMessageSizeOnlyPayload() throws SizeLimitExceededE assertEquals(data.length, messageSize); } + @Test + public void testValidatePubsubMessageSizePayloadAndOrderingKey() + throws SizeLimitExceededException { + byte[] data = new byte[1024]; + String orderingKey = "key"; + PubsubMessage message = new PubsubMessage(data, null, null, orderingKey); + + int messageSize = + PreparePubsubWriteDoFn.validatePubsubMessageSize(message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE); + + assertEquals(data.length + orderingKey.getBytes(StandardCharsets.UTF_8).length, messageSize); + } + @Test public void testValidatePubsubMessageSizePayloadAndAttributes() throws SizeLimitExceededException { @@ -76,6 +89,19 @@ public void testValidatePubsubMessageSizePayloadTooLarge() { message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE)); } + @Test + public void testValidatePubsubMessageSizePayloadPlusOrderingKeyTooLarge() { + byte[] data = new byte[(10 << 20)]; + String orderingKey = "key"; + PubsubMessage message = new PubsubMessage(data, null, null, orderingKey); + + assertThrows( + SizeLimitExceededException.class, + () -> + PreparePubsubWriteDoFn.validatePubsubMessageSize( + message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE)); + } + @Test public void testValidatePubsubMessageSizePayloadPlusAttributesTooLarge() { byte[] data = new byte[(10 << 20)]; @@ -121,6 +147,19 @@ public void testValidatePubsubMessageSizeAttributeValueTooLarge() { message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE)); } + @Test + public void testValidatePubsubMessageSizeOrderingKeyTooLarge() { + byte[] data = new byte[1024]; + String orderingKey = RandomStringUtils.randomAscii(1025); + PubsubMessage message = new PubsubMessage(data, null, null, orderingKey); + + assertThrows( + SizeLimitExceededException.class, + () -> + PreparePubsubWriteDoFn.validatePubsubMessageSize( + message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE)); + } + @Test public void testValidatePubsubMessagePayloadTooLarge() { byte[] data = new byte[(10 << 20) + 1]; diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubSubWritePayloadTranslationTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubSubWritePayloadTranslationTest.java index 45fbab0576fb..75a6173591a1 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubSubWritePayloadTranslationTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubSubWritePayloadTranslationTest.java @@ -67,6 +67,7 @@ public void testTranslateSinkWithTopic() throws Exception { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, 0, + false, 0, 0, Duration.ZERO, @@ -104,6 +105,7 @@ public void testTranslateDynamicSink() throws Exception { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, 0, + false, 0, 0, Duration.ZERO, @@ -143,6 +145,7 @@ public void testTranslateSinkWithTopicOverridden() throws Exception { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, 0, + false, 0, 0, Duration.ZERO, diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoderTest.java new file mode 100644 index 000000000000..0776b1cb100f --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoderTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.gcp.pubsub; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; + +import java.nio.charset.StandardCharsets; +import java.util.Map; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.testing.CoderProperties; +import org.apache.beam.sdk.util.SerializableUtils; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.junit.Test; + +public class PubsubMessageSchemaCoderTest { + private static final String DATA = "testData"; + private static final String TOPIC = "testTopic"; + private static final String MESSAGE_ID = "testMessageId"; + private static final Map ATTRIBUTES = + new ImmutableMap.Builder().put("1", "hello").build(); + private static final String ORDERING_KEY = "key123"; + private static final Coder TEST_CODER = PubsubMessageSchemaCoder.getSchemaCoder(); + private static final PubsubMessage TEST_VALUE = + new PubsubMessage(DATA.getBytes(StandardCharsets.UTF_8), ATTRIBUTES, MESSAGE_ID, ORDERING_KEY) + .withTopic(TOPIC); + private static final PubsubMessage TEST_MINIMAL_VALUE = + new PubsubMessage(DATA.getBytes(StandardCharsets.UTF_8), null); + + @Test + public void testValueEncodable() { + SerializableUtils.ensureSerializableByCoder(TEST_CODER, TEST_VALUE, "error"); + SerializableUtils.ensureSerializableByCoder(TEST_CODER, TEST_MINIMAL_VALUE, "error"); + } + + @Test + public void testCoderDecodeEncodeEqual() throws Exception { + CoderProperties.structuralValueDecodeEncodeEqual(TEST_CODER, TEST_VALUE); + CoderProperties.structuralValueDecodeEncodeEqual(TEST_CODER, TEST_MINIMAL_VALUE); + } + + @Test + public void testEncodedTypeDescriptor() { + TypeDescriptor typeDescriptor = new TypeDescriptor() {}; + assertThat(TEST_CODER.getEncodedTypeDescriptor(), equalTo(typeDescriptor)); + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java index be68083bb28c..49b7f88f9e25 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java @@ -17,14 +17,18 @@ */ package org.apache.beam.sdk.io.gcp.pubsub; +import static org.junit.Assert.assertThrows; + import com.google.protobuf.ByteString; import java.io.IOException; import java.io.Serializable; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.beam.sdk.Pipeline.PipelineExecutionException; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.OutgoingMessage; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubTestClient.PubsubTestClientFactory; @@ -35,10 +39,12 @@ import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.hash.Hashing; +import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; import org.joda.time.Instant; import org.junit.Rule; @@ -53,26 +59,41 @@ public class PubsubUnboundedSinkTest implements Serializable { private static final String DATA = "testData"; private static final ImmutableMap ATTRIBUTES = ImmutableMap.builder().put("a", "b").put("c", "d").build(); + private static final SerializableFunction ORDERING_KEY_FN = + e -> String.valueOf(e.length()); private static final long TIMESTAMP = 1234L; private static final String TIMESTAMP_ATTRIBUTE = "timestamp"; private static final String ID_ATTRIBUTE = "id"; private static final int NUM_SHARDS = 10; private static class Stamp extends DoFn { - private final Map attributes; + private final @Nullable Map attributes; + private final SerializableFunction orderingKeyFn; private Stamp() { - this(ImmutableMap.of()); + this(ImmutableMap.of(), e -> null); + } + + private Stamp(SerializableFunction orderingKeyFn) { + this(ImmutableMap.of(), orderingKeyFn); } - private Stamp(Map attributes) { + private Stamp(@Nullable Map attributes) { + this(attributes, e -> null); + } + + private Stamp( + @Nullable Map attributes, + SerializableFunction orderingKeyFn) { this.attributes = attributes; + this.orderingKeyFn = orderingKeyFn; } @ProcessElement public void processElement(ProcessContext c) { c.outputWithTimestamp( - new PubsubMessage(c.element().getBytes(StandardCharsets.UTF_8), attributes), + new PubsubMessage(c.element().getBytes(StandardCharsets.UTF_8), attributes) + .withOrderingKey(orderingKeyFn.apply(c.element())), new Instant(TIMESTAMP)); } } @@ -120,6 +141,7 @@ public void sendOneMessage() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, batchSize, batchBytes, Duration.standardSeconds(2), @@ -152,13 +174,14 @@ public void sendOneMessageWithoutAttributes() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, 1 /* batchSize */, 1 /* batchBytes */, Duration.standardSeconds(2), RecordIdMethod.DETERMINISTIC, null); p.apply(Create.of(ImmutableList.of(DATA))) - .apply(ParDo.of(new Stamp(null /* attributes */))) + .apply(ParDo.of((new Stamp((@Nullable Map) null /* attributes */)))) .apply(sink); p.run(); } @@ -207,6 +230,7 @@ public void testDynamicTopics() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, 1 /* batchSize */, 1 /* batchBytes */, Duration.standardSeconds(2), @@ -258,6 +282,7 @@ public void sendMoreThanOneBatchByNumMessages() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, batchSize, batchBytes, Duration.standardSeconds(2), @@ -303,6 +328,7 @@ public void sendMoreThanOneBatchByByteSize() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, batchSize, batchBytes, Duration.standardSeconds(2), @@ -315,6 +341,142 @@ public void sendMoreThanOneBatchByByteSize() throws IOException { // message does not match the expected publish message. } + @Test + public void sendOneMessageWithWrongCoderForOrderingKey() throws IOException { + List outgoing = + ImmutableList.of( + OutgoingMessage.of( + com.google.pubsub.v1.PubsubMessage.newBuilder() + .setData(ByteString.copyFromUtf8(DATA)) + .putAllAttributes(ATTRIBUTES) + .setOrderingKey(ORDERING_KEY_FN.apply(DATA)) + .build(), + TIMESTAMP, + getRecordId(DATA), + null)); + assertThrows( + PipelineExecutionException.class, + () -> { + try (PubsubTestClientFactory factory = + PubsubTestClient.createFactoryForPublish(TOPIC, outgoing, ImmutableList.of())) { + PubsubUnboundedSink sink = + new PubsubUnboundedSink( + factory, + StaticValueProvider.of(TOPIC), + TIMESTAMP_ATTRIBUTE, + ID_ATTRIBUTE, + NUM_SHARDS, + true, + 1 /* batchSize */, + 1 /* batchBytes */, + Duration.standardSeconds(2), + RecordIdMethod.DETERMINISTIC, + null); + p.apply(Create.of(DATA)) + .apply(ParDo.of(new Stamp(ATTRIBUTES, ORDERING_KEY_FN))) + .apply(sink); + p.run(); + } + }); + // The PubsubTestClientFactory will assert fail on close if the actual published + // message does not match the expected publish message. + } + + @Test + public void sendOneMessagePerOrderingKey() throws IOException { + List outgoing = + ImmutableList.of( + OutgoingMessage.of( + com.google.pubsub.v1.PubsubMessage.newBuilder() + .setData(ByteString.copyFromUtf8(DATA)) + .putAllAttributes(ATTRIBUTES) + .setOrderingKey(ORDERING_KEY_FN.apply(DATA)) + .build(), + TIMESTAMP, + getRecordId(DATA), + null), + OutgoingMessage.of( + com.google.pubsub.v1.PubsubMessage.newBuilder() + .setData(ByteString.copyFromUtf8(DATA + DATA)) + .putAllAttributes(ATTRIBUTES) + .setOrderingKey(ORDERING_KEY_FN.apply(DATA + DATA)) + .build(), + TIMESTAMP, + getRecordId(DATA + DATA), + null)); + try (PubsubTestClientFactory factory = + PubsubTestClient.createFactoryForPublish(TOPIC, outgoing, ImmutableList.of())) { + PubsubUnboundedSink sink = + new PubsubUnboundedSink( + factory, + StaticValueProvider.of(TOPIC), + TIMESTAMP_ATTRIBUTE, + ID_ATTRIBUTE, + NUM_SHARDS, + true, + 1 /* batchSize */, + 1 /* batchBytes */, + Duration.standardSeconds(2), + RecordIdMethod.DETERMINISTIC, + null); + p.apply(Create.of(DATA, DATA + DATA)) + .apply(ParDo.of(new Stamp(ATTRIBUTES, ORDERING_KEY_FN))) + .setCoder(PubsubMessageSchemaCoder.getSchemaCoder()) + .apply(sink); + p.run(); + } + // The PubsubTestClientFactory will assert fail on close if the actual published + // message does not match the expected publish message. + } + + @Test + public void sendMoreThanOneBatchByOrderingKey() throws IOException { + List outgoing = new ArrayList<>(); + List data = new ArrayList<>(); + int batchSize = 2; + int batchBytes = 1000; + for (int i = 0; i < batchSize * 10; i++) { + String str = String.valueOf(i); + outgoing.add( + OutgoingMessage.of( + com.google.pubsub.v1.PubsubMessage.newBuilder() + .setData(ByteString.copyFromUtf8(str)) + .setOrderingKey(ORDERING_KEY_FN.apply(str)) + .build(), + TIMESTAMP, + getRecordId(str), + null)); + data.add(str); + } + + // Randomly shuffle test input to highlight potential issues caused by order dependent logic. + Collections.shuffle(data); + + try (PubsubTestClientFactory factory = + PubsubTestClient.createFactoryForPublish(TOPIC, outgoing, ImmutableList.of())) { + PubsubUnboundedSink sink = + new PubsubUnboundedSink( + factory, + StaticValueProvider.of(TOPIC), + TIMESTAMP_ATTRIBUTE, + ID_ATTRIBUTE, + NUM_SHARDS, + true, + batchSize, + batchBytes, + Duration.standardSeconds(2), + RecordIdMethod.DETERMINISTIC, + null); + p.apply(Create.of(data)) + .apply(ParDo.of(new Stamp(ORDERING_KEY_FN))) + .setCoder(PubsubMessageSchemaCoder.getSchemaCoder()) + .apply(sink); + p.run(); + } + // The PubsubTestClientFactory will assert fail on close if the actual published + // message does not match the expected publish message. + } + // TODO: We would like to test that failed Pubsub publish calls cause the already assigned // (and random) record ids to be reused. However that can't be done without the test runnner // supporting retrying bundles. diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java index 1898c4ae4af0..45c85183d536 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java @@ -17,18 +17,27 @@ */ package org.apache.beam.sdk.io.gcp.pubsub; +import static org.junit.Assert.assertEquals; + +import com.google.protobuf.UnsafeByteOperations; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Objects; import org.apache.beam.sdk.io.GenerateSequence; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.IncomingMessage; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.SubscriptionPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.commons.lang3.RandomStringUtils; +import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; import org.junit.After; import org.junit.Before; @@ -46,11 +55,12 @@ public class PubsubWriteIT { private PubsubClient pubsubClient; private TopicPath testTopic; + private String project; @Before public void setup() throws IOException { PubsubOptions options = TestPipeline.testingPipelineOptions().as(PubsubOptions.class); - String project = options.getProject(); + project = options.getProject(); pubsubClient = PubsubGrpcClient.FACTORY.newClient(null, null, options); testTopic = PubsubClient.topicPathFromName(project, "pubsub-write-" + Instant.now().getMillis()); @@ -102,4 +112,79 @@ public void testBoundedWriteMessageWithAttributes() { .apply(PubsubIO.writeMessages().to(testTopic.getPath())); pipeline.run(); } + + @Test + public void testBoundedWriteMessageWithAttributesAndOrderingKey() throws IOException { + TopicPath testTopicPath = + PubsubClient.topicPathFromName( + project, "pubsub-write-ordering-key-" + Instant.now().getMillis()); + pubsubClient.createTopic(testTopicPath); + SubscriptionPath testSubscriptionPath = + pubsubClient.createRandomSubscription( + PubsubClient.projectPathFromId(project), testTopicPath, 10); + + byte[] payload = new byte[] {-16, -97, -89, -86}; // U+1F9EA + + // Messages with and without ordering keys can be mixed together in a collection. The writer + // ensures that a publish request only sends message batches with the same ordering key, + // otherwise the Pub/Sub service will reject the request. + // Outgoing messages may specify either null or empty string to represent messages without + // ordering keys, but Protobuf treats strings as primitives so we explicitly use empty string in + // this test for the round trip assertion. + Map outgoingMessages = new HashMap<>(); + outgoingMessages.put( + "0", + new PubsubMessage(payload, ImmutableMap.of("id", "0")) + .withOrderingKey("")); // No ordering key + outgoingMessages.put( + "1", + new PubsubMessage(payload, ImmutableMap.of("id", "1")) + .withOrderingKey("12")); // Repeated ordering key + outgoingMessages.put( + "2", + new PubsubMessage(payload, ImmutableMap.of("id", "2")) + .withOrderingKey("12")); // Repeated ordering key + outgoingMessages.put( + "3", + new PubsubMessage(payload, ImmutableMap.of("id", "3")) + .withOrderingKey("3")); // Distinct ordering key + + pipeline + .apply( + Create.of(ImmutableList.copyOf(outgoingMessages.values())) + .withCoder(PubsubMessageSchemaCoder.getSchemaCoder())) + .apply(PubsubIO.writeMessages().withOrderingKey().to(testTopicPath.getPath())); + pipeline.run().waitUntilFinish(); + + // sometimes the first pull comes up short. try 4 pulls to avoid flaky false-negatives + int numPulls = 0; + while (!outgoingMessages.isEmpty()) { + boolean emptyOrDuplicatePull = true; + List incomingMessages = + pubsubClient.pull(Instant.now().getMillis(), testSubscriptionPath, 4, true); + + for (IncomingMessage incomingMessage : incomingMessages) { + com.google.pubsub.v1.PubsubMessage message = incomingMessage.message(); + @Nullable + PubsubMessage outgoingMessage = + outgoingMessages.remove(message.getAttributesMap().get("id")); + if (outgoingMessage != null) { + emptyOrDuplicatePull = false; + assertEquals( + UnsafeByteOperations.unsafeWrap(outgoingMessage.getPayload()), message.getData()); + assertEquals(outgoingMessage.getAttributeMap(), message.getAttributesMap()); + assertEquals(outgoingMessage.getOrderingKey(), message.getOrderingKey()); + } + } + + if (emptyOrDuplicatePull && ++numPulls > 4) { + throw new RuntimeException( + String.format( + "Pulled %s times from PubSub but retrieved no expected elements.", numPulls)); + } + } + + pubsubClient.deleteSubscription(testSubscriptionPath); + pubsubClient.deleteTopic(testTopicPath); + } }