diff --git a/demo/nvflare/horizontal/README.md b/demo/nvflare/horizontal/README.md index 93ea3794c349..744e909156df 100644 --- a/demo/nvflare/horizontal/README.md +++ b/demo/nvflare/horizontal/README.md @@ -43,9 +43,38 @@ In the admin CLI, run the following command: submit_job horizontal-xgboost ``` +Make a note of the job id: +```console +Submitted job: 28309e77-a7c5-45e6-b2bc-c2e3655122d8 +``` + +On both workers, you should see train and eval losses printed: +```console +[10:45:41] [0] eval-logloss:0.22646 train-logloss:0.23316 +[10:45:41] [1] eval-logloss:0.13776 train-logloss:0.13654 +[10:45:41] [2] eval-logloss:0.08036 train-logloss:0.08243 +[10:45:41] [3] eval-logloss:0.05830 train-logloss:0.05645 +[10:45:41] [4] eval-logloss:0.03825 train-logloss:0.04148 +[10:45:41] [5] eval-logloss:0.02660 train-logloss:0.02958 +[10:45:41] [6] eval-logloss:0.01386 train-logloss:0.01918 +[10:45:41] [7] eval-logloss:0.01018 train-logloss:0.01331 +[10:45:41] [8] eval-logloss:0.00847 train-logloss:0.01112 +[10:45:41] [9] eval-logloss:0.00691 train-logloss:0.00662 +[10:45:41] [10] eval-logloss:0.00543 train-logloss:0.00503 +[10:45:41] [11] eval-logloss:0.00445 train-logloss:0.00420 +[10:45:41] [12] eval-logloss:0.00336 train-logloss:0.00355 +[10:45:41] [13] eval-logloss:0.00277 train-logloss:0.00280 +[10:45:41] [14] eval-logloss:0.00252 train-logloss:0.00244 +[10:45:41] [15] eval-logloss:0.00177 train-logloss:0.00193 +[10:45:41] [16] eval-logloss:0.00156 train-logloss:0.00161 +[10:45:41] [17] eval-logloss:0.00135 train-logloss:0.00142 +[10:45:41] [18] eval-logloss:0.00123 train-logloss:0.00125 +[10:45:41] [19] eval-logloss:0.00106 train-logloss:0.00107 +``` + Once the training finishes, the model file should be written into -`/tmp/nvlfare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json` -respectively. 
+`/tmp/nvflare/poc/site-1/${job_id}/test.model.json` and `/tmp/nvflare/poc/site-2/${job_id}/test.model.json` +respectively, where `job_id` is the UUID printed out when we ran `submit_job`. Finally, shutdown everything from the admin CLI, using `admin` as password: ```shell diff --git a/demo/nvflare/horizontal/custom/trainer.py b/demo/nvflare/horizontal/custom/trainer.py index 4c6dedc90140..f65f800f0240 100644 --- a/demo/nvflare/horizontal/custom/trainer.py +++ b/demo/nvflare/horizontal/custom/trainer.py @@ -63,8 +63,8 @@ def _do_training(self, fl_ctx: FLContext): } with xgb.collective.CommunicatorContext(**communicator_env): # Load file, file will not be sharded in federated mode. - dtrain = xgb.DMatrix('agaricus.txt.train') - dtest = xgb.DMatrix('agaricus.txt.test') + dtrain = xgb.DMatrix('agaricus.txt.train?format=libsvm') + dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm') # Specify parameters via map, definition are same as c++ version param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} diff --git a/demo/nvflare/horizontal/prepare_data.sh b/demo/nvflare/horizontal/prepare_data.sh index 6a32008f8116..eed1390b5280 100755 --- a/demo/nvflare/horizontal/prepare_data.sh +++ b/demo/nvflare/horizontal/prepare_data.sh @@ -2,7 +2,7 @@ set -e -rm -fr ./agaricus* ./*.pem ./poc +rm -fr ./agaricus* ./*.pem /tmp/nvflare world_size=2 @@ -11,15 +11,15 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost" # Split train and test files manually to simulate a federated environment. 
-split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.train agaricus.txt.train-site- -split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.test agaricus.txt.test-site- +split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.train agaricus.txt.train-site- +split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test agaricus.txt.test-site- nvflare poc -n 2 --prepare mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/ -for id in $(eval echo "{1..$world_size}"); do - cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$id"/ - cp agaricus.txt.train-site-"$id" /tmp/nvflare/poc/site-"$id"/agaricus.txt.train - cp agaricus.txt.test-site-"$id" /tmp/nvflare/poc/site-"$id"/agaricus.txt.test +for (( site=1; site<=world_size; site++ )); do + cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/ + cp agaricus.txt.train-site-"$site" /tmp/nvflare/poc/site-"$site"/agaricus.txt.train + cp agaricus.txt.test-site-"$site" /tmp/nvflare/poc/site-"$site"/agaricus.txt.test done