Support for retrying messages within replicator processor
The Kafka consumer for event replication now relies only on the actual topic
it consumes from, plus a DLQ for messages that cannot be processed due to
bugs in the replication stack. We no longer use a retry queue for the
configured consumer.

Replication message processing now retries transient errors in a loop
indefinitely. When the processing logic returns RetryTaskError, the message
is retried a few times before being moved to the DLQ.

Also added a number of new metrics to help debug replication-related issues.
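A minimal sketch of the retry behavior described above, in Go. The helper names, retry limit, and backoff are illustrative assumptions, not the actual implementation:

package replicator

import "time"

// retryTaskError stands in for the generated shared.RetryTaskError type.
type retryTaskError struct{ Message string }

func (e *retryTaskError) Error() string { return e.Message }

const (
	maxRetryCount = 3                      // illustrative limit before the DLQ
	retryInterval = 100 * time.Millisecond // illustrative backoff
)

// processWithRetry retries transient errors forever, but gives up on
// RetryTaskError after a few attempts and hands the message to the DLQ.
func processWithRetry(process func() error, sendToDLQ func()) {
	retryCount := 0
	for {
		err := process()
		if err == nil {
			return // processed successfully
		}
		if _, ok := err.(*retryTaskError); ok {
			retryCount++
			if retryCount > maxRetryCount {
				sendToDLQ()
				return
			}
		}
		time.Sleep(retryInterval) // back off, then try again
	}
}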
samarabbas committed Jun 7, 2018
1 parent 7154288 commit 2fa2455
Showing 12 changed files with 381 additions and 77 deletions.
48 changes: 46 additions & 2 deletions .gen/go/history/historyservice_replicateevents.go

Generated file; diff not rendered.

4 changes: 2 additions & 2 deletions .gen/go/history/idl.go

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions .gen/go/shared/idl.go

Large diffs are not rendered by default.

110 changes: 110 additions & 0 deletions .gen/go/shared/types.go

Generated file; diff not rendered.
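The regenerated types presumably include a Go struct for the new RetryTaskError exception. A rough approximation of its shape (the real generated file also carries Thrift wire-format methods such as ToWire/FromWire):

package shared

// Approximate shape of the generated exception type.
type RetryTaskError struct {
	Message string `json:"message,required"`
}

// Error satisfies the error interface, so callers can match the type in a
// switch, as handler.go does further down.
func (v *RetryTaskError) Error() string {
	return v.Message
}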

6 changes: 0 additions & 6 deletions common/messaging/kafkaClient.go
@@ -41,7 +41,6 @@ func (c *kafkaClient) NewConsumer(currentCluster, sourceCluster, consumerName st
 	sourceTopics := c.config.getTopicsForCadenceCluster(sourceCluster)
 
 	topicKafkaCluster := c.config.getKafkaClusterForTopic(sourceTopics.Topic)
-	retryTopicKafkaCluster := c.config.getKafkaClusterForTopic(currentTopics.RetryTopic)
 	dqlTopicKafkaCluster := c.config.getKafkaClusterForTopic(currentTopics.DLQTopic)
 	topicList := kafka.ConsumerTopicList{
 		kafka.ConsumerTopic{
@@ -50,11 +49,6 @@ func (c *kafkaClient) NewConsumer(currentCluster, sourceCluster, consumerName st
 			Cluster:    topicKafkaCluster,
 			BrokerList: c.config.getBrokersForKafkaCluster(topicKafkaCluster),
 		},
-		RetryQ: kafka.Topic{
-			Name:       currentTopics.RetryTopic,
-			Cluster:    retryTopicKafkaCluster,
-			BrokerList: c.config.getBrokersForKafkaCluster(retryTopicKafkaCluster),
-		},
 		DLQ: kafka.Topic{
 			Name:       currentTopics.DLQTopic,
 			Cluster:    dqlTopicKafkaCluster,
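With the retry topic removed, the only place an unprocessable message can go is the DLQ configured above. A hedged sketch of how that is exercised, assuming the uber-go/kafka-client Consumer and Message interfaces (Messages, Ack, Nack):

package replicator

import "github.com/uber-go/kafka-client/kafka"

// consumeLoop sketches the consumer side: a message that cannot be
// processed is Nack'ed, which (assumption) routes it to the DLQ topic
// configured in NewConsumer; everything else is Ack'ed.
func consumeLoop(consumer kafka.Consumer, process func(kafka.Message) error) {
	for msg := range consumer.Messages() {
		if err := process(msg); err != nil {
			msg.Nack() // give up; the message lands on the DLQ topic
			continue
		}
		msg.Ack() // processed successfully
	}
}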
19 changes: 18 additions & 1 deletion common/metrics/defs.go
@@ -394,6 +394,8 @@ const (
 	ReplicatorQueueProcessorScope
 	// ReplicatorTaskHistoryScope is the scope used for history task processing by replicator queue processor
 	ReplicatorTaskHistoryScope
+	// ReplicateHistoryEventsScope is the scope used by historyReplicator API for applying events
+	ReplicateHistoryEventsScope
 
 	NumHistoryScopes
 )
@@ -426,6 +428,10 @@ const (
 const (
 	// ReplicatorScope is the scope used by all metric emitted by replicator
 	ReplicatorScope = iota + NumCommonScopes
+	// DomainReplicationTaskScope is the scope used by domain task replication processing
+	DomainReplicationTaskScope
+	// HistoryReplicationTaskScope is the scope used by history task replication processing
+	HistoryReplicationTaskScope
 
 	NumWorkerScopes
 )
@@ -575,6 +581,7 @@ var ScopeDefs = map[ServiceIdx]map[int]scopeDefinition{
 		HistoryEventNotificationScope: {operation: "HistoryEventNotification"},
 		ReplicatorQueueProcessorScope: {operation: "ReplicatorQueueProcessor"},
 		ReplicatorTaskHistoryScope: {operation: "ReplicatorTaskHistory"},
+		ReplicateHistoryEventsScope: {operation: "ReplicateHistoryEvents"},
 	},
 	// Matching Scope Names
 	Matching: {
@@ -590,7 +597,9 @@ var ScopeDefs = map[ServiceIdx]map[int]scopeDefinition{
 	},
 	// Worker Scope Names
 	Worker: {
-		ReplicatorScope: {operation: "Replicator"},
+		ReplicatorScope: {operation: "Replicator"},
+		DomainReplicationTaskScope: {operation: "DomainReplicationTask"},
+		HistoryReplicationTaskScope: {operation: "HistoryReplicationTask"},
 	},
 }
 
@@ -609,6 +618,7 @@ const (
 	CadenceErrQueryFailedCounter
 	CadenceErrLimitExceededCounter
 	CadenceErrContextTimeoutCounter
+	CadenceErrRetryTaskCounter
 	PersistenceRequests
 	PersistenceFailures
 	PersistenceLatency
@@ -672,6 +682,9 @@ const (
 	HistoryEventNotificationFanoutLatency
 	HistoryEventNotificationInFlightMessageGauge
 	HistoryEventNotificationFailDeliveryCount
+	StaleReplicationEventsCounter
+	BufferedReplicationTaskCounter
+	HistoryConflictsCounter
 )
 
 // Matching metrics enum
@@ -710,6 +723,7 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
 		CadenceErrQueryFailedCounter: {metricName: "cadence.errors.query-failed", metricType: Counter},
 		CadenceErrLimitExceededCounter: {metricName: "cadence.errors.limit-exceeded", metricType: Counter},
 		CadenceErrContextTimeoutCounter: {metricName: "cadence.errors.context-timeout", metricType: Counter},
+		CadenceErrRetryTaskCounter: {metricName: "cadence.errors.retry-task", metricType: Counter},
 		PersistenceRequests: {metricName: "persistence.requests", metricType: Counter},
 		PersistenceFailures: {metricName: "persistence.errors", metricType: Counter},
 		PersistenceLatency: {metricName: "persistence.latency", metricType: Timer},
@@ -768,6 +782,9 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
 		HistoryEventNotificationFanoutLatency: {metricName: "history-event-notification-fanout-latency", metricType: Timer},
 		HistoryEventNotificationInFlightMessageGauge: {metricName: "history-event-notification-inflight-message-gauge", metricType: Gauge},
 		HistoryEventNotificationFailDeliveryCount: {metricName: "history-event-notification-fail-delivery-count", metricType: Counter},
+		StaleReplicationEventsCounter: {metricName: "stale-replication-events", metricType: Counter},
+		BufferedReplicationTaskCounter: {metricName: "buffered-replication-tasks", metricType: Counter},
+		HistoryConflictsCounter: {metricName: "history-conflicts", metricType: Counter},
 	},
 	Matching: {
 		PollSuccessCounter: {metricName: "poll.success"},
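The new worker-side scopes and counters would be emitted from the replication task processor along these lines (a sketch; the actual call sites live in the processor code, which is not part of this excerpt):

package replicator

import "github.com/uber/cadence/common/metrics"

// recordRetryTaskError counts a RetryTaskError against the new history
// replication task scope, using the scope and metric defined above.
func recordRetryTaskError(client metrics.Client) {
	client.IncCounter(metrics.HistoryReplicationTaskScope, metrics.CadenceErrRetryTaskCounter)
}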
1 change: 1 addition & 0 deletions idl/github.com/uber/cadence/history.thrift
@@ -536,6 +536,7 @@ service HistoryService {
       3: shared.EntityNotExistsError entityNotExistError,
       4: ShardOwnershipLostError shardOwnershipLostError,
       5: shared.LimitExceededError limitExceededError,
+      6: shared.RetryTaskError retryTaskError,
     )
 
   /**
4 changes: 4 additions & 0 deletions idl/github.com/uber/cadence/shared.thrift
@@ -69,6 +69,10 @@ exception AccessDeniedError {
   1: required string message
 }
 
+exception RetryTaskError {
+  1: required string message
+}
+
 enum WorkflowIdReusePolicy {
   /*
    * allow start a workflow execution using the same workflow ID,
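On the history side, replication logic can now surface this exception when a task cannot be applied yet. A sketch of returning it; the function, condition, and message are illustrative, not the actual check:

package history

import "github.com/uber/cadence/.gen/go/shared"

// applyReplicationEvents signals the caller to retry the task later when
// the incoming events are ahead of what this cluster has applied.
func applyReplicationEvents(firstEventID, expectedNextEventID int64) error {
	if firstEventID > expectedNextEventID {
		return &shared.RetryTaskError{Message: "replication events arrived out of order"}
	}
	// ... apply the events ...
	return nil
}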
2 changes: 2 additions & 0 deletions service/history/handler.go
@@ -923,6 +923,8 @@ func (h *Handler) updateErrorMetric(scope int, err error) {
 		h.metricsClient.IncCounter(scope, metrics.CadenceErrCancellationAlreadyRequestedCounter)
 	case *gen.LimitExceededError:
 		h.metricsClient.IncCounter(scope, metrics.CadenceErrLimitExceededCounter)
+	case *gen.RetryTaskError:
+		h.metricsClient.IncCounter(scope, metrics.CadenceErrRetryTaskCounter)
 	case *yarpcerrors.Status:
 		if err.Code() == yarpcerrors.CodeDeadlineExceeded {
 			h.metricsClient.IncCounter(scope, metrics.CadenceErrContextTimeoutCounter)