Skip to content

Commit

Permalink
feat: add actor info to diagnose & dump diagnose on CI failure (#19787)
Browse files Browse the repository at this point in the history
Signed-off-by: xxchan <xxchan22f@gmail.com>
  • Loading branch information
xxchan authored Dec 13, 2024
1 parent 5a85b55 commit 6b800b4
Show file tree
Hide file tree
Showing 14 changed files with 122 additions and 24 deletions.
15 changes: 14 additions & 1 deletion Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,19 @@ script = '''
watch -n 1 "${TMUX} list-windows -t risedev | grep -v active | cut -d'(' -f1"
'''

[tasks.diagnose]
category = "Misc"
description = "Dump diagnose info"
dependencies = ["check-and-load-risedev-env-file"]
script = '''
#!/usr/bin/env bash
set -e
file_name=${PREFIX_LOG}/diagnose-$(date -u +%Y-%m-%dT%H:%M:%SZ).txt
curl -s ${RISEDEV_RW_META_DASHBOARD_ADDR}/api/monitor/diagnose/ > ${file_name}
echo "Diagnose info has been dumped to ${file_name}"
'''


[tasks.del]
alias = "delete"

Expand Down Expand Up @@ -1327,7 +1340,7 @@ echo "All processes has exited."

[tasks.slt]
category = "RiseDev - Test - SQLLogicTest"
install_crate = { min_version = "0.21.0", crate_name = "sqllogictest-bin", binary = "sqllogictest", test_arg = [
install_crate = { min_version = "0.23.1", crate_name = "sqllogictest-bin", binary = "sqllogictest", test_arg = [
"--help",
], install_command = "binstall" }
dependencies = ["check-and-load-risedev-env-file"]
Expand Down
2 changes: 1 addition & 1 deletion ci/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
RUN curl -L --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/cargo-bins/cargo-binstall/main/install-from-binstall-release.sh | bash
RUN cargo binstall -y --locked --no-symlinks cargo-llvm-cov cargo-nextest cargo-hakari cargo-sort cargo-cache cargo-audit \
cargo-make@0.37.9 \
sqllogictest-bin@0.21.0 \
sqllogictest-bin@0.23.1 \
sccache@0.7.4 \
&& cargo cache -a \
&& rm -rf "/root/.cargo/registry/index" \
Expand Down
2 changes: 1 addition & 1 deletion ci/build-ci-image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ cat ../rust-toolchain
# shellcheck disable=SC2155

# REMEMBER TO ALSO UPDATE ci/docker-compose.yml
export BUILD_ENV_VERSION=v20241030
export BUILD_ENV_VERSION=v20241213

export BUILD_TAG="public.ecr.aws/w1p7b4n3/rw-build-env:${BUILD_ENV_VERSION}"

Expand Down
12 changes: 6 additions & 6 deletions ci/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ services:
retries: 5

source-test-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
depends_on:
- mysql
- sqlserver-server
Expand All @@ -84,7 +84,7 @@ services:
- ..:/risingwave

sink-test-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
depends_on:
- mysql
- db
Expand All @@ -106,7 +106,7 @@ services:
- ..:/risingwave

rw-build-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
volumes:
- ..:/risingwave

Expand All @@ -119,14 +119,14 @@ services:
- ..:/risingwave

iceberg-engine-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
depends_on:
- db
volumes:
- ..:/risingwave

ci-flamegraph-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
# NOTE(kwannoel): This is used in order to permit
# syscalls for `nperf` (perf_event_open),
# so it can do CPU profiling.
Expand All @@ -137,7 +137,7 @@ services:
- ..:/risingwave

regress-test-env:
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241030
image: public.ecr.aws/w1p7b4n3/rw-build-env:v20241213
depends_on:
db:
condition: service_healthy
Expand Down
8 changes: 7 additions & 1 deletion ci/scripts/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ export RW_SECRET_STORE_PRIVATE_KEY_HEX="0123456789abcdef0123456789abcdef"
export RUST_MIN_STACK=4194304

unset LANG

function dump_diagnose_info() {
./risedev diagnose || true
}
trap dump_diagnose_info EXIT

if [ -n "${BUILDKITE_COMMIT:-}" ]; then
export GIT_SHA=$BUILDKITE_COMMIT
fi
Expand Down Expand Up @@ -148,4 +154,4 @@ check_link_info() {
echo "libssl should not be dynamically linked"
exit 1
fi
}
}
4 changes: 2 additions & 2 deletions ci/scripts/e2e-source-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ apt-get -y install jq
echo "--- e2e, inline test"
RUST_LOG="debug,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info,risingwave_meta=info" \
risedev ci-start ci-inline-source-test
risedev slt './e2e_test/source_inline/**/*.slt' -j16
risedev slt './e2e_test/source_inline/**/*.slt' --keep-db-on-failure -j16
risedev slt './e2e_test/source_inline/**/*.slt.serial'
echo "--- Kill cluster"
risedev ci-kill
Expand Down Expand Up @@ -172,4 +172,4 @@ sleep 20
risedev slt "e2e_test/webhook/webhook_source_recovery.slt"

risedev ci-kill
echo "--- cluster killed "
echo "--- cluster killed "
4 changes: 2 additions & 2 deletions ci/scripts/e2e-test-parallel-for-opendal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ host_args=(-h localhost -p 4565 -h localhost -p 4566 -h localhost -p 4567)
echo "--- e2e, ci-3cn-3fe-opendal-fs-backend, streaming"
RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \
risedev ci-start ci-3cn-3fe-opendal-fs-backend
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-opendal-fs-backend-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-opendal-fs-backend-${profile}" --label "parallel"

echo "--- Kill cluster Streaming"
risedev ci-kill
Expand All @@ -42,7 +42,7 @@ echo "--- e2e, ci-3cn-3fe-opendal-fs-backend, batch"
RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \
risedev ci-start ci-3cn-3fe-opendal-fs-backend
sqllogictest "${host_args[@]}" -d dev './e2e_test/ddl/**/*.slt' --junit "parallel-opendal-fs-backend-ddl-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' -j 16 --junit "parallel-opendal-fs-backend-batch-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' --keep-db-on-failure -j 16 --junit "parallel-opendal-fs-backend-batch-${profile}" --label "parallel"

echo "--- Kill cluster Batch"
risedev ci-kill
Expand Down
4 changes: 2 additions & 2 deletions ci/scripts/e2e-test-parallel-in-memory.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ host_args=(-h localhost -p 4565 -h localhost -p 4566 -h localhost -p 4567)
echo "--- e2e, ci-3cn-3fe-in-memory, streaming"
risedev ci-start ci-3cn-3fe-in-memory
sqllogictest --version
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-in-memory-streaming-${profile}" --label "in-memory" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-in-memory-streaming-${profile}" --label "in-memory" --label "parallel"

echo "--- Kill cluster"
risedev ci-kill

echo "--- e2e, ci-3cn-3fe-in-memory, batch"
risedev ci-start ci-3cn-3fe-in-memory
sqllogictest "${host_args[@]}" -d dev './e2e_test/ddl/**/*.slt' --junit "parallel-in-memory-batch-ddl-${profile}" --label "in-memory" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/batch/**/*.slt' -j 16 --junit "parallel-in-memory-batch-${profile}" --label "in-memory" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/batch/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-in-memory-batch-${profile}" --label "in-memory" --label "parallel"

echo "--- Kill cluster"
risedev ci-kill
6 changes: 3 additions & 3 deletions ci/scripts/e2e-test-parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=i
echo "--- e2e, ci-3streaming-2serving-3fe, streaming"
RUST_LOG=$RUST_LOG \
risedev ci-start ci-3streaming-2serving-3fe
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' -j 16 --junit "parallel-streaming-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/streaming/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-streaming-${profile}" --label "parallel"

kill_cluster

Expand All @@ -47,13 +47,13 @@ RUST_LOG=$RUST_LOG \
risedev ci-start ci-3streaming-2serving-3fe
# Exclude files that contain ALTER SYSTEM commands
find ./e2e_test/ddl -name "*.slt" -type f -exec grep -L "ALTER SYSTEM" {} \; | xargs -r sqllogictest "${host_args[@]}" -d dev --junit "parallel-batch-ddl-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' -j 16 --junit "parallel-batch-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/visibility_mode/*.slt' --keep-db-on-failure -j 16 --junit "parallel-batch-${profile}" --label "parallel"

kill_cluster

echo "--- e2e, ci-3streaming-2serving-3fe, generated"
RUST_LOG=$RUST_LOG \
risedev ci-start ci-3streaming-2serving-3fe
sqllogictest "${host_args[@]}" -d dev './e2e_test/generated/**/*.slt' -j 16 --junit "parallel-generated-${profile}" --label "parallel"
sqllogictest "${host_args[@]}" -d dev './e2e_test/generated/**/*.slt' --keep-db-on-failure -j 16 --junit "parallel-generated-${profile}" --label "parallel"

kill_cluster
2 changes: 1 addition & 1 deletion e2e_test/source_inline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Other tests can be run in parallel.

```bash
# run all parallel tests
risedev slt './e2e_test/source_inline/**/*.slt' -j16
risedev slt './e2e_test/source_inline/**/*.slt' --keep-db-on-failure -j16
# run all serial tests
risedev slt './e2e_test/source_inline/**/*.slt.serial'
```
Expand Down
4 changes: 2 additions & 2 deletions e2e_test/source_inline/kafka/protobuf/alter_source_shared.slt
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,5 @@ SELECT COUNT(*), MAX(age), MIN(age), SUM(age) FROM mv_user_more;
30 104 0 1020


# statement ok
# DROP SOURCE src_user CASCADE;
statement ok
DROP SOURCE src_user CASCADE;
22 changes: 21 additions & 1 deletion src/meta/src/controller/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ use risingwave_meta_model::prelude::{Actor, Fragment, Sink, StreamingJob};
use risingwave_meta_model::{
actor, actor_dispatcher, fragment, object, sink, source, streaming_job, table, ActorId,
ActorUpstreamActors, ConnectorSplits, DatabaseId, ExprContext, FragmentId, I32Array, JobStatus,
ObjectId, SinkId, SourceId, StreamNode, StreamingParallelism, TableId, VnodeBitmap, WorkerId,
ObjectId, SchemaId, SinkId, SourceId, StreamNode, StreamingParallelism, TableId, VnodeBitmap,
WorkerId,
};
use risingwave_meta_model_migration::{Alias, SelectStatement};
use risingwave_pb::common::PbActorLocation;
Expand Down Expand Up @@ -882,6 +883,25 @@ impl CatalogController {
Ok(actor_locations)
}

pub async fn list_actor_info(
&self,
) -> MetaResult<Vec<(ActorId, FragmentId, ObjectId, SchemaId, ObjectType)>> {
let inner = self.inner.read().await;
let actor_locations: Vec<(ActorId, FragmentId, ObjectId, SchemaId, ObjectType)> =
Actor::find()
.join(JoinType::LeftJoin, actor::Relation::Fragment.def())
.join(JoinType::LeftJoin, fragment::Relation::Object.def())
.select_only()
.columns([actor::Column::ActorId, actor::Column::FragmentId])
.column_as(object::Column::Oid, "job_id")
.column_as(object::Column::SchemaId, "schema_id")
.column_as(object::Column::ObjType, "type")
.into_tuple()
.all(&inner.db)
.await?;
Ok(actor_locations)
}

pub async fn list_source_actors(&self) -> MetaResult<Vec<(ActorId, FragmentId)>> {
let inner = self.inner.read().await;

Expand Down
53 changes: 52 additions & 1 deletion src/meta/src/manager/diagnose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
// limitations under the License.

use std::cmp::{Ordering, Reverse};
use std::collections::{BTreeMap, BinaryHeap};
use std::collections::{BTreeMap, BinaryHeap, HashMap};
use std::fmt::Write;
use std::sync::Arc;

Expand Down Expand Up @@ -677,6 +677,7 @@ impl DiagnoseCommand {
("INDEX", indexes),
("SINK", sinks),
];
let mut obj_id_to_name = HashMap::new();
for (title, items) in catalogs {
use comfy_table::{Row, Table};
let mut table = Table::new();
Expand All @@ -689,6 +690,7 @@ impl DiagnoseCommand {
row
});
for (id, (name, schema_id, definition)) in items {
obj_id_to_name.insert(id, name.clone());
let mut row = Row::new();
let may_redact =
redact_all_sql_options(&definition).unwrap_or_else(|| "[REDACTED]".into());
Expand All @@ -702,6 +704,55 @@ impl DiagnoseCommand {
let _ = writeln!(s, "{title}");
let _ = writeln!(s, "{table}");
}

let actors = self
.metadata_manager
.catalog_controller
.list_actor_info()
.await?
.into_iter()
.map(|(actor_id, fragment_id, job_id, schema_id, obj_type)| {
(
actor_id,
(
fragment_id,
job_id,
schema_id,
obj_type,
obj_id_to_name
.get(&(job_id as _))
.cloned()
.unwrap_or_default(),
),
)
})
.collect::<BTreeMap<_, _>>();

use comfy_table::{Row, Table};
let mut table = Table::new();
table.set_header({
let mut row = Row::new();
row.add_cell("id".into());
row.add_cell("fragment_id".into());
row.add_cell("job_id".into());
row.add_cell("schema_id".into());
row.add_cell("type".into());
row.add_cell("name".into());
row
});
for (actor_id, (fragment_id, job_id, schema_id, ddl_type, name)) in actors {
let mut row = Row::new();
row.add_cell(actor_id.into());
row.add_cell(fragment_id.into());
row.add_cell(job_id.into());
row.add_cell(schema_id.into());
row.add_cell(ddl_type.as_str().into());
row.add_cell(name.into());
table.add_row(row);
}
let _ = writeln!(s);
let _ = writeln!(s, "ACTOR");
let _ = writeln!(s, "{table}");
Ok(())
}
}
Expand Down
8 changes: 8 additions & 0 deletions src/risedevtool/src/risedev_env.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,14 @@ pub fn generate_risedev_env(services: &Vec<ServiceConfig>) -> String {
)
.unwrap();
}
ServiceConfig::MetaNode(meta_node_config) => {
writeln!(
env,
r#"RISEDEV_RW_META_DASHBOARD_ADDR="http://{}:{}""#,
meta_node_config.address, meta_node_config.dashboard_port
)
.unwrap();
}
_ => {}
}
}
Expand Down

0 comments on commit 6b800b4

Please sign in to comment.