Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: add failure counter for triple/presig/signature #804

Merged
merged 1 commit into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions chain-signatures/node/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,51 @@ pub(crate) static FAILED_SEND_ENCRYPTED_LATENCY: Lazy<HistogramVec> = Lazy::new(
.unwrap()
});

pub(crate) static NUM_TOTAL_HISTORICAL_SIGNATURE_GENERATORS: Lazy<IntGaugeVec> = Lazy::new(|| {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we rename this to SIGNATURE_GENERATOR, NEW_SIGNATURE_GENERATOR, or something similar?
We should not expose how it is represented on Grafana in code. It can be reused, it can be per hour, not historical. It can be combined with other data and not be TOTAL.

I would rename all new and existing metrics so they represent what they are, and not how they are shown on in Grafana.

try_create_int_gauge_vec(
"multichain_num_total_historical_signature_generators",
"number of all signature generators historically on the node",
&["node_account_id"],
)
.unwrap()
});

pub(crate) static TRIPLE_GENERATOR_FAILURES: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec(
"multichain_triple_generator_failures",
"total triple generator failures",
&["node_account_id"],
)
.unwrap()
});

pub(crate) static SIGNATURE_GENERATOR_FAILURES: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec(
"multichain_signature_generator_failures",
"total signature generator failures",
&["node_account_id"],
)
.unwrap()
});

pub(crate) static PRESIGNATURE_GENERATOR_FAILURES: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec(
"multichain_presignature_generator_failures",
"total presignature generator failures",
&["node_account_id"],
)
.unwrap()
});

pub(crate) static SIGNATURE_FAILURES: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec(
"multichain_signature_failures",
"total signature failures",
&["node_account_id"],
)
.unwrap()
});

pub fn try_create_int_gauge_vec(name: &str, help: &str, labels: &[&str]) -> Result<IntGaugeVec> {
check_metric_multichain_prefix(name)?;
let opts = Opts::new(name, help);
Expand Down
2 changes: 2 additions & 0 deletions chain-signatures/node/src/protocol/consensus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ impl ConsensusProtocol for StartedState {
me,
contract_state.public_key,
epoch,
ctx.my_account_id(),
),
)),
messages: Default::default(),
Expand Down Expand Up @@ -398,6 +399,7 @@ impl ConsensusProtocol for WaitingForConsensusState {
me,
self.public_key,
self.epoch,
ctx.my_account_id(),
))),
messages: self.messages,
}))
Expand Down
7 changes: 1 addition & 6 deletions chain-signatures/node/src/protocol/cryptography.rs
Original file line number Diff line number Diff line change
Expand Up @@ -456,12 +456,7 @@ impl CryptographicProtocol for RunningState {
messages.push(info.clone(), MpcMessage::Signature(msg));
}
signature_manager
.publish(
ctx.rpc_client(),
ctx.signer(),
ctx.mpc_contract_id(),
&my_account_id,
)
.publish(ctx.rpc_client(), ctx.signer(), ctx.mpc_contract_id())
.await;
drop(signature_manager);
let failures = messages
Expand Down
3 changes: 3 additions & 0 deletions chain-signatures/node/src/protocol/presignature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,9 @@ impl PresignatureManager {
let action = match generator.poke() {
Ok(action) => action,
Err(e) => {
crate::metrics::PRESIGNATURE_GENERATOR_FAILURES
.with_label_values(&[self.my_account_id.as_str()])
.inc();
self.gc.insert(*id, Instant::now());
self.introduced.remove(id);
errors.push(e);
Expand Down
31 changes: 26 additions & 5 deletions chain-signatures/node/src/protocol/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ pub struct SignatureManager {
me: Participant,
public_key: PublicKey,
epoch: u64,
my_account_id: AccountId,
}

pub const MAX_RETRY: u8 = 10;
Expand Down Expand Up @@ -267,7 +268,12 @@ impl ToPublish {
}

impl SignatureManager {
pub fn new(me: Participant, public_key: PublicKey, epoch: u64) -> Self {
pub fn new(
me: Participant,
public_key: PublicKey,
epoch: u64,
my_account_id: &AccountId,
) -> Self {
Self {
generators: HashMap::new(),
failed: VecDeque::new(),
Expand All @@ -276,6 +282,7 @@ impl SignatureManager {
me,
public_key,
epoch,
my_account_id: my_account_id.clone(),
}
}

Expand Down Expand Up @@ -354,6 +361,9 @@ impl SignatureManager {
req,
cfg,
)?;
crate::metrics::NUM_TOTAL_HISTORICAL_SIGNATURE_GENERATORS
.with_label_values(&[self.my_account_id.as_str()])
.inc();
self.generators.insert(receipt_id, generator);
Ok(())
}
Expand Down Expand Up @@ -393,6 +403,9 @@ impl SignatureManager {
},
cfg,
)?;
crate::metrics::NUM_TOTAL_HISTORICAL_SIGNATURE_GENERATORS
.with_label_values(&[self.my_account_id.as_str()])
.inc();
self.generators.insert(receipt_id, generator);
Ok(())
}
Expand Down Expand Up @@ -462,6 +475,9 @@ impl SignatureManager {
}
};
let generator = entry.insert(generator);
crate::metrics::NUM_TOTAL_HISTORICAL_SIGNATURE_GENERATORS
.with_label_values(&[self.my_account_id.as_str()])
.inc();
Ok(&mut generator.protocol)
}
Entry::Occupied(entry) => Ok(&mut entry.into_mut().protocol),
Expand All @@ -482,6 +498,9 @@ impl SignatureManager {
if generator.proposer == self.me {
if generator.sign_request_timestamp.elapsed() < generator.timeout_total {
tracing::warn!(?err, "signature failed to be produced; pushing request back into failed queue");
crate::metrics::SIGNATURE_GENERATOR_FAILURES
.with_label_values(&[self.my_account_id.as_str()])
.inc();
// only retry the signature generation if it was initially proposed by us. We do not
// want any nodes to be proposing the same signature multiple times.
self.failed.push_back((
Expand All @@ -496,6 +515,9 @@ impl SignatureManager {
));
} else {
self.completed.insert(*receipt_id, Instant::now());
crate::metrics::SIGNATURE_FAILURES
.with_label_values(&[self.my_account_id.as_str()])
.inc();
tracing::warn!(?err, "signature failed to be produced; trashing request");
}
}
Expand Down Expand Up @@ -663,7 +685,6 @@ impl SignatureManager {
rpc_client: &near_fetch::Client,
signer: &T,
mpc_contract_id: &AccountId,
my_account_id: &AccountId,
) {
let mut to_retry: Vec<ToPublish> = Vec::new();

Expand Down Expand Up @@ -720,14 +741,14 @@ impl SignatureManager {
};

crate::metrics::NUM_SIGN_SUCCESS
.with_label_values(&[my_account_id.as_str()])
.with_label_values(&[self.my_account_id.as_str()])
.inc();
crate::metrics::SIGN_LATENCY
.with_label_values(&[my_account_id.as_str()])
.with_label_values(&[self.my_account_id.as_str()])
.observe(time_added.elapsed().as_secs_f64());
if time_added.elapsed().as_secs() <= 30 {
crate::metrics::NUM_SIGN_SUCCESS_30S
.with_label_values(&[my_account_id.as_str()])
.with_label_values(&[self.my_account_id.as_str()])
.inc();
}
}
Expand Down
3 changes: 3 additions & 0 deletions chain-signatures/node/src/protocol/triple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,9 @@ impl TripleManager {
Ok(action) => action,
Err(e) => {
errors.push(e);
crate::metrics::TRIPLE_GENERATOR_FAILURES
.with_label_values(&[self.my_account_id.as_str()])
.inc();
self.gc.insert(*id, Instant::now());
self.ongoing.remove(id);
self.introduced.remove(id);
Expand Down
Loading