Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci(framework) Replace kill with check_and_kill function #4794

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions e2e/test_exec_api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,17 @@ case "$3" in
;;
esac

#######################################
# Best-effort termination of one or more processes using kill's default
# signal (SIGTERM).
# Arguments:
#   $@ - PIDs to signal. Every argument is processed (callers pass several,
#        e.g. check_and_kill "$cl1_pid" "$cl2_pid"), and a single argument
#        may itself hold several whitespace-separated PIDs.
# Outputs:
#   For each PID, an "Attempting to kill" line followed by a success or
#   failure line, all on stdout.
# Returns:
#   0 always; a PID that cannot be signalled is reported, not treated as fatal.
#######################################
check_and_kill() {
  local arg pid
  for arg in "$@"; do
    # Deliberately unquoted so an argument containing several PIDs separated
    # by spaces or newlines is split into individual PIDs.
    # shellcheck disable=SC2086
    for pid in $arg; do
      echo "Attempting to kill process ID: $pid"
      # kill's own diagnostics are silenced; the else branch reports failure.
      if kill "$pid" 2>/dev/null; then
        echo "Process $pid successfully killed."
      else
        echo "Failed to kill process $pid or it may have already terminated."
      fi
    done
  done
  return 0
}

# Create and install Flower app
flwr new e2e-tmp-test --framework numpy --username flwrlabs
Expand Down Expand Up @@ -113,9 +124,10 @@ while [ "$found_success" = false ] && [ $elapsed -lt $timeout ]; do
echo "Training worked correctly!"
found_success=true
if [ "$3" = "deployment-engine" ]; then
kill $cl1_pid; kill $cl2_pid;
check_and_kill "$cl1_pid" "$cl2_pid"
fi
sleep 1; kill $sl_pid;
sleep 1
check_and_kill "$sl_pid"
exit 0;
else
echo "Waiting for training ... ($elapsed seconds elapsed)"
Expand All @@ -128,8 +140,8 @@ done
if [ "$found_success" = false ]; then
echo "Training had an issue and timed out."
if [ "$3" = "deployment-engine" ]; then
kill $cl1_pid; kill $cl2_pid;
check_and_kill "$cl1_pid" "$cl2_pid"
fi
kill $sl_pid;
check_and_kill "$sl_pid"
exit 1;
fi
24 changes: 12 additions & 12 deletions e2e/test_reconnection.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,63 +45,63 @@ sed -i '/^\[tool\.flwr\.federations\.e2e\]/,/^$/d' pyproject.toml
echo -e $"\n[tool.flwr.federations.e2e]\naddress = \"127.0.0.1:9093\"\ninsecure = true" >> pyproject.toml
sleep 1

echo "Starting SuperLink"
timeout 10m flower-superlink --insecure $db_arg $rest_arg &
sl_pids=$(pgrep -f "flower-superlink")
echo "Starting SuperLink"
sleep 3

echo "Starting first client"
timeout 10m flower-supernode --insecure $rest_arg --superlink $server_address \
--clientappio-api-address="localhost:9094" &
cl1_pid=$!
echo "Starting first client"
sleep 3

echo "Starting second client"
timeout 10m flower-supernode --insecure $rest_arg --superlink $server_address \
--clientappio-api-address="localhost:9095" &
cl2_pid=$!
echo "Starting second client"
sleep 3

# Kill superlink, this should send the clients into their retry loops
check_and_kill "$sl_pids"
echo "Killing Superlink"
check_and_kill "$sl_pids"
sleep 3

# Restart superlink, the clients should now be able to reconnect to it
echo "Restarting Superlink"
timeout 10m flower-superlink --insecure $db_arg $rest_arg 2>&1 | tee flwr_output.log &
sl_pids=$(pgrep -f "flower-superlink")
echo "Restarting Superlink"
sleep 20

# Kill first client (cl1_pid), this should send a DeleteNode message to the Superlink
kill $cl1_pid
echo "Killing second client"
check_and_kill "$cl1_pid"
sleep 5

# Starting new client, this is so we have enough clients to execute `flwr run`
echo "Starting new client"
timeout 10m flower-supernode --insecure $rest_arg --superlink $server_address \
--clientappio-api-address "localhost:9094" &
cl1_pid=$!
echo "Starting new client"
sleep 5

# We execute `flwr run` to begin the training
timeout 2m flwr run "." e2e &
echo "Executing flwr run to start training"
timeout 2m flwr run "." e2e &
sleep 10

# Kill first client as soon as the training starts, the flwr-serverapp should just
# receive a failure in this case and continue the rounds when enough clients are
# connected
kill $cl1_pid
echo "Killing first client"
check_and_kill "$cl1_pid"
sleep 3

# Restart first client so enough clients are connected to continue the FL rounds
echo "Starting new client"
timeout 5m flower-supernode --insecure $rest_arg --superlink $server_address \
--clientappio-api-address "localhost:9094" &
cl1_pid=$!
echo "Starting new client"
sleep 5

# Initialize a flag to track if training is successful
Expand All @@ -114,7 +114,7 @@ while [ "$found_success" = false ] && [ $elapsed -lt $timeout ]; do
if grep -q "Run finished" flwr_output.log; then
echo "Training worked correctly!"
found_success=true
kill $cl1_pid; kill $cl2_pid
check_and_kill "$cl1_pid" "$cl2_pid"
sleep 3
check_and_kill "$sl_pids"
else
Expand All @@ -127,7 +127,7 @@ done

if [ "$found_success" = false ]; then
echo "Training had an issue and timed out."
kill $cl1_pid; kill $cl2_pid
check_and_kill "$cl1_pid" "$cl2_pid"
sleep 3
check_and_kill "$sl_pids"
fi
22 changes: 18 additions & 4 deletions e2e/test_superlink.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ case "$2" in
;;
esac

#######################################
# Best-effort termination of one or more processes using kill's default
# signal (SIGTERM).
# Arguments:
#   $@ - PIDs to signal. Every argument is processed (callers pass several,
#        e.g. check_and_kill "$cl1_pid" "$cl2_pid"), and a single argument
#        may itself hold several whitespace-separated PIDs.
# Outputs:
#   For each PID, an "Attempting to kill" line followed by a success or
#   failure line, all on stdout.
# Returns:
#   0 always; a PID that cannot be signalled is reported, not treated as fatal.
#######################################
check_and_kill() {
  local arg pid
  for arg in "$@"; do
    # Deliberately unquoted so an argument containing several PIDs separated
    # by spaces or newlines is split into individual PIDs.
    # shellcheck disable=SC2086
    for pid in $arg; do
      echo "Attempting to kill process ID: $pid"
      # kill's own diagnostics are silenced; the else branch reports failure.
      if kill "$pid" 2>/dev/null; then
        echo "Process $pid successfully killed."
      else
        echo "Failed to kill process $pid or it may have already terminated."
      fi
    done
  done
  return 0
}

# Install Flower app
pip install -e . --no-deps

Expand Down Expand Up @@ -104,8 +116,9 @@ while [ "$found_success" = false ] && [ $elapsed -lt $timeout ]; do
if grep -q "Run finished" flwr_output.log; then
echo "Training worked correctly!"
found_success=true
kill $cl1_pid; kill $cl2_pid;
sleep 1; kill $sl_pid;
check_and_kill "$cl1_pid" "$cl2_pid"
sleep 1
check_and_kill "$sl_pid"
else
echo "Waiting for training ... ($elapsed seconds elapsed)"
fi
Expand All @@ -116,6 +129,7 @@ done

if [ "$found_success" = false ]; then
echo "Training had an issue and timed out."
kill $cl1_pid; kill $cl2_pid;
kill $sl_pid;
check_and_kill "$cl1_pid" "$cl2_pid"
sleep 1
check_and_kill "$sl_pid"
fi
Loading