From 392994d3cb1e0269ca7bb4d9dc3a908e7662a587 Mon Sep 17 00:00:00 2001 From: chesterxgchen Date: Mon, 18 May 2026 17:01:16 -0700 Subject: [PATCH 1/3] [2.8] Add end-to-end NVFlare CLI tutorial --- docs/example_applications_algorithms.rst | 2 +- docs/examples/tutorial_notebooks.rst | 2 +- docs/release_notes/flare_240.rst | 2 +- docs/release_notes/flare_280.rst | 2 +- docs/tutorials.rst | 2 +- examples/README.md | 2 +- examples/hello-world/hello-pt/README.md | 8 +- examples/hello-world/hello-pt/client.py | 29 +- examples/hello-world/hello-pt/job.py | 35 +- examples/tutorials/README.md | 4 +- examples/tutorials/job_cli.ipynb | 1513 ----------------- examples/tutorials/nvflare_cli.ipynb | 535 ++++++ .../system_architecture.ipynb | 4 +- nvflare/tool/job/job_cli.py | 5 +- tests/unit_test/tool/job/job_stats_test.py | 21 +- 15 files changed, 626 insertions(+), 1540 deletions(-) delete mode 100644 examples/tutorials/job_cli.ipynb create mode 100644 examples/tutorials/nvflare_cli.ipynb diff --git a/docs/example_applications_algorithms.rst b/docs/example_applications_algorithms.rst index ebac170897..50d6750165 100644 --- a/docs/example_applications_algorithms.rst +++ b/docs/example_applications_algorithms.rst @@ -37,7 +37,7 @@ Can be run from :github_nvflare_link:`hello_world notebook ` - Shows how to use the :ref:`fl_simulator` to run a local simulation of an NVFLARE deployment to test and debug an application without provisioning a real FL project. * :github_nvflare_link:`Hello FLARE API ` - Goes through the different commands of the :ref:`flare_api` to show the syntax and usage of each. * :github_nvflare_link:`NVFLARE in POC Mode ` - Shows how to use :ref:`POC mode ` to test the features of a full FLARE deployment on a single machine. - * :github_nvflare_link:`Job CLI Tutorial ` - Walks through the different commands of the Job CLI and showcases syntax and example usages. 
+ * :github_nvflare_link:`NVFlare CLI Tutorial ` - Walks through the current ``nvflare`` command groups for local setup, recipes, jobs, systems, studies, provisioning, and deployment. * :github_nvflare_link:`Job Recipe ` - Introduces Job Recipes to simplify federated learning job creation and execution with a high-level API. * :github_nvflare_link:`FLARE Logging ` - Covers how to configure logging in FLARE for different use cases and modes. diff --git a/docs/examples/tutorial_notebooks.rst b/docs/examples/tutorial_notebooks.rst index 381fe10dc5..9e5f4fbc25 100644 --- a/docs/examples/tutorial_notebooks.rst +++ b/docs/examples/tutorial_notebooks.rst @@ -9,6 +9,6 @@ Tutorial notebooks on GitHub: - :github_nvflare_link:`FL Simulator Notebook (GitHub) ` - :github_nvflare_link:`Hello FLARE API Notebook (GitHub) ` - :github_nvflare_link:`NVFLARE POC Mode in detail Notebook (GitHub) ` -- :github_nvflare_link:`Job CLI Notebook (GitHub) ` +- :github_nvflare_link:`NVFlare CLI Notebook (GitHub) ` - :github_nvflare_link:`Job Recipe Notebook (GitHub) ` - :github_nvflare_link:`FLARE Logging Notebook (GitHub) ` diff --git a/docs/release_notes/flare_240.rst b/docs/release_notes/flare_240.rst index 88b4a9a062..0c3952ff48 100644 --- a/docs/release_notes/flare_240.rst +++ b/docs/release_notes/flare_240.rst @@ -64,7 +64,7 @@ Furthermore, the Job CLI also offers users a convenient method for submitting jo ``nvflare job list_templates|create|submit|show_variables`` Also explore the continuously growing :github_nvflare_link:`Job Template directory ` we have created for commonly used configurations. -For more in-depth information on Job Templates and the Job CLI, refer to the :ref:`job_cli` documentation and :github_nvflare_link:`tutorials `. +For more in-depth information on Job Templates and the Job CLI, refer to the :ref:`job_cli` documentation and :github_nvflare_link:`CLI tutorials `. 
ModelLearner ~~~~~~~~~~~~ diff --git a/docs/release_notes/flare_280.rst b/docs/release_notes/flare_280.rst index 289cdcd11f..cb35766919 100644 --- a/docs/release_notes/flare_280.rst +++ b/docs/release_notes/flare_280.rst @@ -76,7 +76,7 @@ For details, see :ref:`nvflare_cli`, :ref:`job_cli`, :ref:`system_command`, :ref:`config_command`, and :ref:`recipe_command`. For a hands-on CLI workflow, see the -:github_nvflare_link:`Job CLI tutorial `. +:github_nvflare_link:`NVFlare CLI tutorial `. Deployment and Provisioning =========================== diff --git a/docs/tutorials.rst b/docs/tutorials.rst index e31a3ac146..e2eb43f5aa 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -25,7 +25,7 @@ Feature Tutorials - `Simulator CLI & Python API `_ - `FLARE Python API: Job Submission & Monitoring `_ - `Logging: Configuration & Customization `_ -- `Job CLI: Job Submission & Templates `_ +- `NVFlare CLI: Setup, Jobs, Systems, and Deployment `_ - `Job Recipe: Simplified job creation `_ Self-Paced Learning diff --git a/examples/README.md b/examples/README.md index 0861999d11..769290b0ab 100644 --- a/examples/README.md +++ b/examples/README.md @@ -89,7 +89,7 @@ When you open a notebook, select the kernel `nvflare_example` using the dropdown | [Intro to the FL Simulator](./tutorials/flare_simulator.ipynb) | Shows how to use the FLARE Simulator to run a local simulation. | | [Hello FLARE API](./tutorials/flare_api.ipynb) | Goes through the different commands of the FLARE API. | | [NVFLARE in POC Mode](./tutorials/setup_poc.ipynb) | Shows how to use POC mode. | -| [Job CLI](./tutorials/job_cli.ipynb) | Walks through the different commands of the Job CLI. | +| [NVFlare CLI](./tutorials/nvflare_cli.ipynb) | Walks through the current `nvflare` command groups for local setup, recipes, jobs, systems, studies, provisioning, and deployment. 
| | [Job Recipe](./tutorials/job_recipe.ipynb) | Introduces Job Recipes to simplify federated learning job creation and execution with a high-level API. | | [Logging Tutorial](./tutorials/logging.ipynb) | Shows how to use the logging configuration for different modules. | diff --git a/examples/hello-world/hello-pt/README.md b/examples/hello-world/hello-pt/README.md index b686f5502e..2d8146189a 100644 --- a/examples/hello-world/hello-pt/README.md +++ b/examples/hello-world/hello-pt/README.md @@ -42,6 +42,12 @@ You can download the CIFAR10 dataset from the Internet via torchvision’s datas You can split the datasets for different clients, so that each client has its own dataset. Here, for simplicity's sake, we will be using the same dataset on each client. +For quick smoke tests or offline environments, the job can use synthetic CIFAR-shaped data: + +``` +python job.py --synthetic_data --train_size 128 --test_size 64 --num_rounds 2 --epochs 1 +``` + ## Model In PyTorch, neural networks are implemented by defining a class (e.g., SimpleNetwork) that extends `nn.Module`. 
The network’s architecture is set up in the __init__ method, while the forward method determines how input data flows @@ -130,7 +136,7 @@ recipe = FedAvgRecipe( num_rounds=num_rounds, model=SimpleNetwork(), train_script="client.py", - train_args=f"--batch_size {batch_size}", + train_args=f"--batch_size {batch_size} --epochs {epochs}", ) env = SimEnv(num_clients=n_clients, num_threads=n_clients) diff --git a/examples/hello-world/hello-pt/client.py b/examples/hello-world/hello-pt/client.py index 8f86799b53..ec36704151 100644 --- a/examples/hello-world/hello-pt/client.py +++ b/examples/hello-world/hello-pt/client.py @@ -45,14 +45,19 @@ def evaluate(net, data_loader, device): total += labels.size(0) correct += (predicted == labels).sum().item() - print(f"Accuracy of the network on the 10000 test images: {100 * correct // total} %") - return 100 * correct // total + accuracy = 100 * correct // total + print(f"Accuracy of the network on {total} test images: {accuracy} %") + return accuracy def main(): parser = argparse.ArgumentParser() parser.add_argument("--epochs", type=int, default=2) parser.add_argument("--batch_size", type=int, default=16) + parser.add_argument("--num_workers", type=int, default=2) + parser.add_argument("--synthetic_data", action="store_true") + parser.add_argument("--train_size", type=int, default=50000) + parser.add_argument("--test_size", type=int, default=10000) args = parser.parse_args() batch_size = args.batch_size epochs = args.epochs @@ -70,11 +75,23 @@ def main(): ) # Load datasets - train_set = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=True, download=True, transform=transform) - train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=2) + if args.synthetic_data: + train_set = torchvision.datasets.FakeData( + size=args.train_size, image_size=(3, 32, 32), num_classes=10, transform=transform + ) + test_set = torchvision.datasets.FakeData( + size=args.test_size, image_size=(3, 32, 
32), num_classes=10, transform=transform + ) + else: + train_set = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=True, download=True, transform=transform) + test_set = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=False, download=True, transform=transform) - test_set = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=False, download=True, transform=transform) - test_loader = torch.utils.data.DataLoader(test_set, batch_size=args.batch_size, shuffle=False, num_workers=2) + train_loader = torch.utils.data.DataLoader( + train_set, batch_size=batch_size, shuffle=True, num_workers=args.num_workers + ) + test_loader = torch.utils.data.DataLoader( + test_set, batch_size=batch_size, shuffle=False, num_workers=args.num_workers + ) # (3) initializes NVFlare client API flare.init() diff --git a/examples/hello-world/hello-pt/job.py b/examples/hello-world/hello-pt/job.py index 7af360c7a8..0bd78e1b9d 100644 --- a/examples/hello-world/hello-pt/job.py +++ b/examples/hello-world/hello-pt/job.py @@ -29,8 +29,15 @@ def define_parser(): parser.add_argument("--n_clients", type=int, default=2) parser.add_argument("--num_rounds", type=int, default=2) parser.add_argument("--batch_size", type=int, default=16) + parser.add_argument("--epochs", type=int, default=2) + parser.add_argument("--num_workers", type=int, default=2) + parser.add_argument("--synthetic_data", action="store_true") + parser.add_argument("--train_size", type=int, default=50000) + parser.add_argument("--test_size", type=int, default=10000) parser.add_argument("--train_script", type=str, default="client.py") parser.add_argument("--cross_site_eval", action="store_true") + parser.add_argument("--export_config", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--enable_log_streaming", action=argparse.BooleanOptionalAction, default=False) parser.add_argument( "--launch_external_process", action="store_true", @@ -52,6 +59,10 @@ def main(): n_clients = args.n_clients num_rounds = 
args.num_rounds batch_size = args.batch_size + epochs = args.epochs + train_args = f"--batch_size {batch_size} --epochs {epochs} --num_workers {args.num_workers}" + if args.synthetic_data: + train_args += f" --synthetic_data --train_size {args.train_size} --test_size {args.test_size}" recipe = FedAvgRecipe( name="hello-pt", @@ -62,7 +73,7 @@ def main(): # Alternative: model={"class_path": "model.SimpleNetwork", "args": {}}, # For pre-trained weights: initial_ckpt="/server/path/to/pretrained.pt", train_script=args.train_script, - train_args=f"--batch_size {batch_size}", + train_args=train_args, launch_external_process=args.launch_external_process, client_memory_gc_rounds=args.client_memory_gc_rounds, ) @@ -71,13 +82,21 @@ def main(): if args.cross_site_eval: add_cross_site_evaluation(recipe) - # Run FL simulation - env = SimEnv(num_clients=n_clients) - run = recipe.execute(env) - print() - print("Job Status is:", run.get_status()) - print("Result can be found in :", run.get_result()) - print() + if args.enable_log_streaming: + recipe.enable_log_streaming() + + if args.export_config: + job_dir = "/tmp/nvflare/jobs/job_config" + recipe.export(job_dir) + print(f"Job config exported to {job_dir}") + else: + # Run FL simulation + env = SimEnv(num_clients=n_clients) + run = recipe.execute(env) + print() + print("Job Status is:", run.get_status()) + print("Result can be found in :", run.get_result()) + print() if __name__ == "__main__": diff --git a/examples/tutorials/README.md b/examples/tutorials/README.md index 0fc962c1d4..5376c050cb 100644 --- a/examples/tutorials/README.md +++ b/examples/tutorials/README.md @@ -12,8 +12,8 @@ Please make sure you set up a virtual environment and install and configure Jupy * Goes through the different commands of the FLARE API. * [NVFLARE in POC Mode](./setup_poc.ipynb) * Shows how to use POC mode. -* [Job CLI](./job_cli.ipynb) - * Shows how to use Job CLI commands. 
+* [NVFlare CLI](./nvflare_cli.ipynb) + * Shows how to use the current `nvflare` command groups. * [Job Recipe](./job_recipe.ipynb) * Introduces Job Recipes to simplify federated learning job creation and execution with a high-level API. * [FLARE Logging](./logging.ipynb) diff --git a/examples/tutorials/job_cli.ipynb b/examples/tutorials/job_cli.ipynb deleted file mode 100644 index 4ba697082c..0000000000 --- a/examples/tutorials/job_cli.ipynb +++ /dev/null @@ -1,1513 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "5933f2eb-463a-4d01-a806-b6fc0fe9b4de", - "metadata": {}, - "source": [ - "# NVFLARE JOB CLI" - ] - }, - { - "cell_type": "markdown", - "id": "69ce4e61-da7f-4ba1-96f1-822c578e53a1", - "metadata": { - "tags": [] - }, - "source": [ - "In this notebook, we will go through the different commands of the Job CLI to show the syntax and usage of each.\n", - "Refer to the [Job CLI Documentation](https://nvflare.readthedocs.io/en/main/user_guide/nvflare_cli/job_cli.html) for more details.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "154531fd-2a94-4062-bbc6-76086b099093", - "metadata": { - "tags": [], - "toc-hr-collapsed": true - }, - "source": [ - "## Install NVIDIA FLARE\n", - "\n", - "For this notebook, we will install NVIDIA FLARE directly in the current environment.\n", - "\n", - "If you use the job CLI to submit job, you will need a running NVFLARE system with client and server. You can either run a local system via nvflare poc commands, or \n", - "use a running production system. 
\n", - "\n", - "To see how to setup a local system, please refer to the [setup_poc tutorial](setup_poc.ipynb).\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "5d2dbd79", - "metadata": {}, - "source": [ - "### Install NVFLARE using pip\n", - "\n", - "First, let's install NVFLARE 2.7.2 in the current environment:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ee556e6", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "!{sys.executable} -m pip install \"nvflare[CONFIG]~=2.7.2rc\" torch torchvision tensorboard --quiet" - ] - }, - { - "cell_type": "markdown", - "id": "7b73088e", - "metadata": {}, - "source": [ - "### Verify NVFLARE installation\n", - "\n", - "Let's verify that nvflare is installed and accessible:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b85417a", - "metadata": {}, - "outputs": [], - "source": [ - "!nvflare --version" - ] - }, - { - "cell_type": "markdown", - "id": "2956d010", - "metadata": {}, - "source": [ - "## Step-by-step walk-through: from creating a job to running a job\n", - "\n", - "Taking a CIFAR10 pytorch training code for a 2-client federated learning program (such as from the [hello-pt](https://github.com/NVIDIA/NVFlare/tree/main/examples/hello-world/hello-pt) example), we can use the standard Scatter and Gather (SAG) workflow pattern to demonstrate the features of the Job CLI. \n", - "\n", - "Now, we would like to see what are the available pre-configured job templates the user can use and modify. 
\n", - "\n", - "\n", - "### Check out the available nvflare job templates\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "6c635e02-7fe6-401c-82d2-e2cde1dc86c0", - "metadata": { - "tags": [] - }, - "source": [ - "#### List Job Templates and job templates directory\n", - "\n", - "The NVFLARE 2.4.0 release introduces job templates for the different types of job configurations.\n", - "\n", - "To list the available templates, you can use the ```nvflare job list_templates``` command:\n", - "\n", - "```\n", - "! nvflare job list_templates\n", - "```\n", - "\n", - "If you installed nvflare 2.4.x via `pip install nvflare`. The above command should show you available job templates (built-in default job templates). But if you cloned the github report of repository, and did not use the ```pip install nvflare```, the above command will expect you to provide the job_templates directory. When the job templates directory is not specified, the Job CLI will try to find the job_templates location with the following sequences of logic:\n", - "\n", - "* See if the NVFLARE_HOME environment variable is set. If NVFLARE_HOME is not empty, the Job CLI will look for the job_templates at:\n", - " \n", - " ```${NVFLARE_HOME}/job_templates```\n", - " \n", - "* If the NVFLARE_HOME env. variable is not set, the Job CLI will look for the `job_template` path of the config in the nvflare hidden directory \n", - "\n", - "```\n", - "cat ~/.nvflare/config.conf \n", - "\n", - "startup_kit {\n", - " path = \"/tmp/nvflare/poc1/example_project/prod_00\"\n", - "}\n", - "poc_workspace {\n", - " path = \"/tmp/nvflare/poc1\"\n", - "}\n", - "job_template {\n", - " path = \"../../job_templates\"\n", - "}\n", - "\n", - "```\n", - "once the `-d ` option is used, the `job_template` value in `~/.nvflare/config.conf` will be updated so you don't need to specify -d again. 
\n", - "\n", - "If you want to change the `job_template` path, you can directly edit this config file or use the `nvflare config` command:\n", - "\n", - "```\n", - "nvflare config -jt ../../job_templates. \n", - "\n", - "```\n", - "If the ~/.nvflare/config.conf is not defined yet, the command will look at the following location from installed NVFLARE package \n", - "```\n", - " job_templates_dir = os.path.join(nvflare.job.__file__, \"templates\")\n", - "```\n", - " \n", - "If the nvflare is installed, this directory exists, then it should find the built-in job templates. \n", - "\n", - "> Note: this directory may not exist in the follow case: \n", - "> * If you have done ```pip install nvflare```, but also installed the NVFLARE source code from github repo. the sys.path might point to your local NVFLARE repository when load nvflare.job module. In such a case, the above directory will not exist. As the job_templates is not located at nvflare/job/templates in the github repository. \n", - "\n", - "\n", - "If Job templates directory still not found, the command will raise exception for missing Job Template directory. \n", - "\n", - "\n", - "By now, you should understand that the ```nvflare job list_templates``` allows you to list built-in default job templates from the release, as well as provides your own job_templates to reflect the recent changes. \n", - "\n", - "For now, let's specify the job templates directory location\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "447619d5-d917-4d93-b806-6c673e216b88", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
nvflare job list_templates -d \"../../job_templates\"" - ] - }, - { - "cell_type": "markdown", - "id": "0e61acd2", - "metadata": {}, - "source": [ - "### Prepare Training Code\n", - "\n", - "For this tutorial, we have prepared training scripts in the `code/` directory:\n", - "- `client.py`: The federated learning training script using NVFLARE Client API\n", - "- `model.py`: The PyTorch model definition (SimpleNetwork for CIFAR10)\n", - "\n", - "When creating jobs with the `-sd` flag, these scripts will be copied to the job folder. The `-sd` flag should point to a directory containing only your training code to avoid copying unnecessary files." - ] - }, - { - "cell_type": "markdown", - "id": "9abba0f2-17c9-4e09-a34e-8543238e4039", - "metadata": {}, - "source": [ - "Where the option `-d \"\"` or `--job_template_dir \"\"` is the location of the job_templates. By doing so, we have also save our job_templates into the hidden configuration,so we don't do it again next time. Let's look at the config file. \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "06fec3a1-71a6-40f7-a10f-52ea6595ad96", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! cat ~/.nvflare/config.conf" - ] - }, - { - "cell_type": "markdown", - "id": "4663c2a0-3b36-4abb-ba8f-d1e77afea23a", - "metadata": {}, - "source": [ - "You can also manually preset the job_templates directory if you don't want to reply on the default one. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1bea478d-42fb-4223-8155-3c996699a052", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare config -jt ../../job_templates" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a3d793d-bc0c-458b-8a58-047cfde915f0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
cat ~/.nvflare/config.conf" - ] - }, - { - "cell_type": "markdown", - "id": "b9a7b708-40e9-4f4f-beec-b83d0e893a0b", - "metadata": {}, - "source": [ - "Now we can list the templates again without -d option" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49bab293-7fef-476f-8aa9-f2b2868a0fb3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job list_templates" - ] - }, - { - "cell_type": "markdown", - "id": "10720b0b-af4c-4d71-a751-2d5c301eb05a", - "metadata": { - "tags": [] - }, - "source": [ - "With a job template that fits your needs, you can use the job template name to create a new job folder.\n" - ] - }, - { - "cell_type": "markdown", - "id": "ecbd0cc7-f410-457e-a6bd-d4c999182850", - "metadata": {}, - "source": [ - "### Create a job folder\n", - "\n", - "Since the code for our example is written in pytorch and we would like to try the FedAvg algorithm using the Scatter & Gather (SAG) workflow, the job template **\"sag_pt\"** is what we are looking for. We will use this template to create our job folder. \n", - "\n", - "Create a job folder that contains the base job configuration from the template, which can then be modified as desired. First, create a job folder with the intent for it to be modified, without specifying any code.\n", - "\n", - "\n", - "#### First try\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b3c9750-070f-4655-9128-757ab136b30d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job create -j /tmp/nvflare/my_job -w sag_pt -force\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "8cf08b1e-ca7c-4902-b8d0-01968c3801be", - "metadata": { - "tags": [] - }, - "source": [ - "The above command creates a job folder at ```/tmp/nvflare/my_job``` with job template ```sag_pt```. \n", - "You can see that a few configuration files are created. 
Some of the configurations are open for you to overwrite.\n", - "\n", - "If you have the ```tree``` command installed ( ```sudo apt install tree``` on linux), you can use the ```tree``` command, otherwise, you can use \"ls -al\" to look at the job_folder structure:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e87410e3-c498-44ae-b7f7-2a51af237e41", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! tree /tmp/nvflare/my_job" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "393ae90b-640b-4dd7-a325-e2b026e7703b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! cat /tmp/nvflare/my_job/meta.conf" - ] - }, - { - "cell_type": "markdown", - "id": "9ba16495-eaa3-431c-84da-432635ec8e29", - "metadata": { - "tags": [] - }, - "source": [ - "Notice the app_name is \"my_job\". In `config_fed_client.conf` we can specify the data exchange path, the exchange format, and the way to transfer the model. Let's look at the server side configuration. \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66c4f50f-b715-42de-a217-f2feced182ba", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! cat /tmp/nvflare/my_job/app/config/config_fed_server.conf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24bebc20-5351-40a7-af7b-0a3c9bb271ff", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! cat /tmp/nvflare/my_job/app/config/config_fed_client.conf" - ] - }, - { - "cell_type": "markdown", - "id": "11767829-65e5-47e1-bae1-6c587c315100", - "metadata": {}, - "source": [ - "> Note that both client and server configurations are nicely commented with explainations. \n", - "> If you create the job with customizations such as using -f or configurations, the configuration files will be overwritten. As result, the comments in the configuration will be lost in the final files. 
\n", - "\n", - "### Show variables\n", - "\n", - "Now, you can see the job folder is auto-created with pre-defined configurations. To make sure this template works for your code and the variables can be updated. Let's check the variables again with the following command" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74adcc28-9f52-4f2b-b4b2-1bd9b053dd80", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job show_variables -j /tmp/nvflare/my_job" - ] - }, - { - "cell_type": "markdown", - "id": "f8ede943-4c76-42b2-81ad-8d9364637cfb", - "metadata": {}, - "source": [ - "You can see there are many variables you might want to change:\n", - "\n", - "* Change num_rounds to 1 to test out a fast run first.\n", - "* Use custom cifar10 code which was already written based on Flare 2.4.0 Client API.\n", - "\n", - "\n", - "**Note**\n", - "\n", - "the job template name: such as ```sag_pt```, you can also use directory path for the job template. You can try yourself.\n", - "\n", - "```\n", - "! nvflare job create -j /tmp/nvflare/my_job -w ../../job_templates/sag_pt -force\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "id": "8e991379-78e1-4e86-88fb-ab9a69d6822c", - "metadata": {}, - "source": [ - "\n", - "Let's do a second try, \n", - "\n", - "#### The second try" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69d32eb0-1384-4d49-9b6e-a369c53e7163", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
nvflare job create -j /tmp/nvflare/my_job -force -w sag_pt \\\n", - "-f config_fed_server.conf num_rounds=1 model_class_path=model.SimpleNetwork \\\n", - "-f config_fed_client.conf app_script=client.py \\\n", - "-sd ./code" - ] - }, - { - "cell_type": "markdown", - "id": "14aae893-0371-488e-9883-aa2d736a6807", - "metadata": {}, - "source": [ - "The above command creates a job folder at ```/tmp/nvflare/my_job``` with job template ```sag_pt``` again (`-force` to replace the existing job folder). \n", - "Now, `num_rounds` is set to 1, `{app_script}` is \"client.py\", and `model_class_path` is \"model.SimpleNetwork\". The python script will invoke ```python custom/{app_script}```, so the provided `client.py` will be called.\n", - "Now, take a look the code structure again: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e09239cb-39cb-4ccf-858e-619d2b1072a0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! tree /tmp/nvflare/my_job" - ] - }, - { - "cell_type": "markdown", - "id": "3103b02a-1ada-4201-b848-811d237be3fd", - "metadata": {}, - "source": [ - "Notice that the code we had written is copied to the job directory. " - ] - }, - { - "cell_type": "markdown", - "id": "025fa61b-2c30-4421-a74c-75f0725a86f8", - "metadata": { - "tags": [] - }, - "source": [ - "In config_fed_server.conf, we have ```PTFileModelPersistor```, a file-based persistor for pytorch. It requires the `model.SimpleNetwork` class for model initialization and also for saving the final model.\n", - "The \"model.py\" file with class \"SimpleNetwork\" matches the configuration we specified with `model_class_path=model.SimpleNetwork`. If your model file name and class name are different, you will need to update your configuration to match. \n", - "\n", - "We will leave the rest of values as default and try to run the job. 
" - ] - }, - { - "cell_type": "markdown", - "id": "fe97e7a0-38ae-4af7-b8be-73879a69a55f", - "metadata": {}, - "source": [ - "### Download the data\n", - "\n", - "Download the data first to avoid repeated downloading. You can use the download script:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70c829fa-099c-4f40-9571-7e7ca8f026d8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Download CIFAR10 data if needed\n", - "# You can use torchvision.datasets.CIFAR10 to download the data\n", - "# or use the download script from hello-pt example" - ] - }, - { - "cell_type": "markdown", - "id": "f9308ba5-ad4d-4f3f-9377-76fd6a253687", - "metadata": {}, - "source": [ - "### Run the Job in simulator \n", - "\n", - "You can first run the job with `nvflare simulator` to see if there are any issues:\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2451ab27-68cf-4ab3-ae7f-f276f21185a2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare simulator /tmp/nvflare/my_job -w /tmp/my_job" - ] - }, - { - "cell_type": "markdown", - "id": "6041e784-d81e-4d7f-a497-4e3dbe050068", - "metadata": {}, - "source": [ - "If this does not work for you, you may need to make additional changes based on the error messages.\n", - "\n", - "Assuming `nvflare simulator` works, you can try running locally with POC mode. For more realistic training, you can first recreate the job configuration with a larger number of rounds (num_rounds=100):\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bbca437-a0be-45da-a438-c387266998a0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
nvflare job create -j /tmp/nvflare/my_job -force -w sag_pt \\\n", - "-f config_fed_server.conf num_rounds=100 model_class_path=model.SimpleNetwork \\\n", - "-f config_fed_client.conf app_script=client.py \\\n", - "-sd ./code" - ] - }, - { - "cell_type": "markdown", - "id": "be6a878d-5518-4e41-9c3c-2f5098e7d018", - "metadata": {}, - "source": [ - "\n", - "### Set up and start POC mode\n", - "\n", - "From a terminal, run:\n", - "\n", - "```\n", - " nvflare poc prepare -n 2\n", - " nvflare poc start -ex admin@nvidia.com\n", - "```\n", - "This will prepare a workspace for POC with n = 2 clients. The second command starts the POC clients and server except for the FLARE Admin Console (user name = 'admin@nvidia.com'). Since we are going to the Job CLI for submit job, we don't need the admin console for now. Once the system has started, we are ready to move to the next step: submit job.\n" - ] - }, - { - "cell_type": "markdown", - "id": "e839570e-7522-4732-8034-d05a520fa4eb", - "metadata": { - "tags": [] - }, - "source": [ - "### Submit Job from CLI\n", - "\n", - "You can use the following command to directly submit job from the command line. \n", - "\n", - "Even through in `config_fed_server.conf`, num_rounds = 100, to start with a smaller number of rounds, you can set `num_rounds` in the `nvflare job submit` command without changing the value in the config. \n", - "\n", - "Also, to change the `train_timeout` to 300 seconds instead of 0 (which means no timeout), this arg is also in `config_fed_server.conf`, so you can include it with `num_rounds` after `-f config_fed_server.conf`.\n", - "\n", - "Finally, instead of relying on the default `dataset_path`, you can specify the `dataset_path` in the `nvflare job submit` command." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5012082-5904-4a24-9211-0e46bc75d6d2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
nvflare job submit -j /tmp/nvflare/my_job \\\n", - "-f config_fed_server.conf num_rounds=1 train_timeout=300 \\\n", - "-f config_fed_client.conf app_config=\"--dataset_path /tmp/nvflare/data/cifar10\" \\\n", - "-debug" - ] - }, - { - "cell_type": "markdown", - "id": "14e34ddc-7f4c-4c14-b287-bedad1b603b3", - "metadata": {}, - "source": [ - "You can go to the terminal to monitor the output log. \n", - "\n", - "> the CLI argument\n", - "> ```\n", - "> app_config=\"--dataset_path /tmp/nvflare/data/cifar10\"\n", - "> ```\n", - "> will be translated into \n", - "\n", - "> ```\n", - "> python custom/client.py --dataset_path \"/tmp/nvflare/data/cifar10\"\n", - "> ```\n", - "> in our case, `client.py` takes `--dataset_path` as an argument. \n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "7b5522b5-38a8-4f95-98bf-0cd15c336f16", - "metadata": {}, - "source": [ - "### Submit Job from CLI in production\n", - "\n", - "Before you try to submit to production, the Job CLI will need to know the location of the admin console startup kit directory. \n", - "In POC mode, this is set for the user automatically. In prodcuction, the user will need to set the path to the startup kit for the Job CLI. \n", - "\n", - "The startup kit path is stored in the `config.conf` file in the nvflare hidden directory at the user's home directory. First you can take a look at this file: \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52788f23-9e78-4396-ad84-c74ae9ef7937", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
cat ~/.nvflare/config.conf\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "9e0dcc33-ff07-4554-8d6f-34b327b4ef44", - "metadata": {}, - "source": [ - "You can directly edit the path in the file:\n", - "```\n", - " startup_kit {\n", - " path = /tmp/nvflare/poc/example_project/prod_00\n", - " }\n", - "```\n", - "Alternatively, you can use the following command:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1a57527-ee4e-4fc8-bfdf-e48fa6466527", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare config --startup_kit_dir /tmp/nvflare/poc/example_project/prod_00" - ] - }, - { - "cell_type": "markdown", - "id": "db24105a-a80c-4c4c-b98f-f354ae5baaa1", - "metadata": {}, - "source": [ - "or" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01ca3eca-2779-440d-86bc-9a23e1420e99", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare config -d /tmp/nvflare/poc/example_project/prod_00" - ] - }, - { - "cell_type": "markdown", - "id": "6cf52c39-cea7-4764-91b3-e061356089cc", - "metadata": {}, - "source": [ - "Once the startup kit directory path is set, you can do the job submit:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a106a7b0-371e-4504-aa26-a45a55a6347b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job submit -j /tmp/nvflare/my_job \\\n", - "-f config_fed_server.conf num_rounds=1 \\\n", - "-f config_fed_client.conf app_config=\"--dataset_path /tmp/nvflare/data/cifar10\"" - ] - }, - { - "cell_type": "markdown", - "id": "1c668def-55e3-477e-8e0a-f76c003b5c31", - "metadata": {}, - "source": [ - "## Troubleshooting with the `-debug` flag\n", - "\n", - "Since the ```nvflare job submit``` command does not overwrite the job folder configuration during submission, it has to use a temp job folder. 
\n", - "If you want to check the final configs submited to the server or simply want to see the stack trace of the exception, you can use the `-debug` flag. \n", - "\n", - "With the `-debug` flag, the ``` nvflare job submit ``` command will not delete the temp job folder once it has finished job submission, and it will also print the exception stack trace in case of failure. \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c676d8ab-6d35-4713-b551-8f5a40927a6e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job submit -j /tmp/nvflare/my_job \\\n", - "-f config_fed_server.conf num_rounds=1 train_timeout=300 \\\n", - "-f config_fed_client.conf app_config=\"--dataset_path /tmp/nvflare/data/cifar10\" \\\n", - "-debug" - ] - }, - { - "cell_type": "markdown", - "id": "e3640447-52d8-47f5-80f6-98702efc5b35", - "metadata": {}, - "source": [ - "You should see a statement like the following after the message that the job was submitted (the actual random folder name will vary): \n", - "\n", - "```\n", - "in debug mode, job configurations can be examined in temp job directory '/tmp/tmpdnusoyzj'\n", - "```\n", - "\n", - "You can check the job folder with `tree` or `ls -al` \n", - "> note: the temp folder name can be different on your machine" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a796f30-e99e-49e0-96ce-ad8809c27ce1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
tree '/tmp/tmpdnusoyzj'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b446bcda-f77a-4c2e-a548-c16822d28796", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!cat '/tmp/tmpdnusoyzj/app/config/config_fed_client.conf'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "703ed5d4-1cb2-426d-8efa-03ed2c3464e0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!cat '/tmp/tmpdnusoyzj/app/config/config_fed_server.conf'" - ] - }, - { - "cell_type": "markdown", - "id": "493538a7-fe6d-4fc2-a65e-873bf26a09ee", - "metadata": {}, - "source": [ - "You can see if the configs for server and clients are indeed the values specified." - ] - }, - { - "cell_type": "markdown", - "id": "ebaaa444-1f39-4832-ac1b-2d9b3710dbba", - "metadata": { - "tags": [] - }, - "source": [ - "## Troubleshooting - Client API timeout\n", - "\n", - "If the client API has not received training in 60 seconds, the job will be considered failed with a message like the following:\n", - "```\n", - "PTFilePipeLauncherExecutor - ERROR - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=db7940f1-d7b4-44e5-b509-dfed4adeb2ec]: received _PEER_GONE_ while waiting for result for train\n", - "```\n", - "\n", - "If you need to, you can increase the value for the timeout: \n", - "\n", - "```\n", - "heartbeat_timeout = 120\n", - "``` " - ] - }, - { - "cell_type": "markdown", - "id": "0ed9103e-9f45-4987-be8c-907ab86dfa1d", - "metadata": { - "tags": [] - }, - "source": [ - "## Cleanup\n", - "\n", - "Make sure you shut down the POC system when you are done:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80086770-7a84-42aa-b981-5b9eabb22b9e", - "metadata": {}, - "outputs": [], - "source": [ - "! 
nvflare poc stop" - ] - }, - { - "cell_type": "markdown", - "id": "767c4fa4-9695-4013-a08d-36a3ab2d7e13", - "metadata": { - "tags": [] - }, - "source": [ - "## Advanced Section\n", - "\n", - "With above sections, you should have understood how to create job with the job template, modify the configuration as needed (either via CLI or manually) and submit job. \n", - "Now, what if you would like to have \n", - "\n", - "* Different configurations on different clients\n", - " You could have different datasets on different sites, therefore, the epoches, batch size, learning rate, etc. can be different. \n", - "\n", - "* Deploy different code pieces to different sites \n", - " You don't need to deploy all the code to all places, only certain code is needed at certain locations. \n", - " \n", - "* add new arguments not in the job templates, modify specific config key using path\n", - "\n", - "* remove configuration\n", - "\n", - "* modify custom configurations\n", - "\n", - "\n", - "\n", - "\n", - "In this section, we will discuss how to do this. So far, we assumed all sites (server and client sites) had the same code and configuration, we deploy all the code + configs to all sites with the following meta.conf\n", - "\n", - "```\n", - "name = \"my_job\"\n", - "resource_spec {}\n", - "deploy_map {\n", - " app = [\n", - " \"@ALL\"\n", - " ]\n", - "}\n", - "min_clients = 2\n", - "mandatory_clients = []\n", - "\n", - "```\n", - "\n", - "Notice the **deploy_map** \n", - "```\n", - "deploy_map {\n", - " app = [\n", - " \"@ALL\"\n", - " ]\n", - "}\n", - "\n", - "```\n", - "### Set up job with different site-specific configurations \n", - "\n", - "We are saying that there is \"app\" is deployed to \"ALL\" sites. Let's look at a different example\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b1942b9-f189-4726-b1e8-1ccb3a9ece1d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
nvflare job create \\\n", - "-j /tmp/nvflare/my_job -w sag_pt_deploy_map " - ] - }, - { - "cell_type": "markdown", - "id": "d39a5466-fcc9-426e-b311-363087727eea", - "metadata": {}, - "source": [ - "Here we have three different apps : \"app_server\", \"app_1\" and \"app_2\". \n", - "We would like to change the following: \n", - "\n", - "* change number of training rounds to 2\n", - "* set the model class path to \"model.SimpleNetwork\"\n", - "* change default app_script from \"cifar10.py\" to \"client.py\" for both app_1 and app_2\n", - "* change the app_1 batch_size to 4, app_2 batch_size to 6" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8e2b701-99e6-4aa0-bfac-ab2054635b8f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job create \\\n", - "-j /tmp/nvflare/my_job -w sag_pt_deploy_map \\\n", - "-f app_server/config_fed_server.conf num_rounds=2 model_class_path=model.SimpleNetwork \\\n", - "-f app_1/config_fed_client.conf app_script=client.py app_config=\"--batch_size 4\" \\\n", - "-f app_2/config_fed_client.conf app_script=client.py app_config=\"--batch_size 6\" \\\n", - "-sd ./code \\\n", - "-force" - ] - }, - { - "cell_type": "markdown", - "id": "47fb01d8-71da-4ab3-a6a6-d66563bd6c8b", - "metadata": { - "tags": [] - }, - "source": [ - "Now let's look at the job folder structure. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8270bdfe-a3ab-4c54-92d9-f67ecc59d210", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!tree /tmp/nvflare/my_job" - ] - }, - { - "cell_type": "markdown", - "id": "050a88ce-084d-4da4-b160-69ac8918b7a8", - "metadata": {}, - "source": [ - "The job folder consists of three sub-folders, each representing one application: app_server, app_1, app_2. 
Now look at the meta.conf's deploy_map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4aaf60e-254d-4490-970b-fd530b09b84b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!cat /tmp/nvflare/my_job/meta.conf" - ] - }, - { - "cell_type": "markdown", - "id": "821cde68-38cc-48a0-8448-835693b2a131", - "metadata": {}, - "source": [ - "Notice, app_server is deployed to \"server\", \"app_1\" and \"app_2\" respectively. The app_1 and app_2 only need client configurations and app_server only need server configuration. Since the server is not doing the training job. we could **remove** ther client.py from the app_server app. and look at again" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e0c56d0-a3f6-452e-8b77-c9693c8edd79", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!rm /tmp/nvflare/my_job/app_server/custom/client.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0d3d9f1-40df-44b8-8fb7-52ab820358ca", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!tree /tmp/nvflare/my_job" - ] - }, - { - "cell_type": "markdown", - "id": "a65fe596-5abc-4e4b-9f83-67102ddca6cb", - "metadata": {}, - "source": [ - "Look at the job configuration variables " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d161dc09-804e-4fa8-a3ca-ad4b9efeccf8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job show_variables -j /tmp/nvflare/my_job" - ] - }, - { - "cell_type": "markdown", - "id": "3c7408ce-fb16-49dc-8df8-fb9d38ad881e", - "metadata": {}, - "source": [ - "This shows the same information we previously seen. Except it shows each app's configuration. 
Lets explain a bit mroe about the commnand syntax\n", - "\n", - "```\n", - " nvflare job create \\\n", - "-j /tmp/nvflare/my_job -w sag_pt_deploy_map \\\n", - "-f app_server/config_fed_server.conf num_rounds=2 model_class_path=model.SimpleNetwork \\\n", - "-f app_1/config_fed_client.conf app_script=client.py app_config=\"--batch_size 4\" \\\n", - "-f app_2/config_fed_client.conf app_script=client.py app_config=\"--batch_size 6\" \\\n", - "-sd ./code \\\n", - "-force\n", - "\n", - "```\n", - "\n", - "to specify app specific configuration, you use\n", - "\n", - "```-f app_server/config_fed_server.conf num_rounds=2 model_class_path=model.SimpleNetwork ```\n", - "\n", - "instead \n", - "\n", - "```\n", - "-f config_fed_server.conf num_rounds=2 model_class_path=model.SimpleNetwork \n", - "\n", - "```\n", - "\n", - "Here it tells the command that that only change the config for \"app_server\" app, without \"app_server/\" the command is considered to use the default \"app\" configuration. \n", - "\n", - "if the \"app_name\" is not previously defined in the job templates, the command will show error. For example\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "062dfee7-5f6d-431c-bf69-6003e220909f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job create \\\n", - "-j /tmp/nvflare/my_job -w sag_pt_deploy_map \\\n", - "-f app_server/config_fed_server.conf num_rounds=2 \\\n", - "-force" - ] - }, - { - "cell_type": "markdown", - "id": "b695d5e3-3dde-456a-a2db-54ad8c2ccb7d", - "metadata": {}, - "source": [ - "Once you have the job folder. 
You should be able to run the job as before\n" - ] - }, - { - "cell_type": "markdown", - "id": "49deee6f-eb9a-4086-80ff-2e96ab94ede6", - "metadata": {}, - "source": [ - "### Add arguments not originally specified in Job Template\n", - "\n", - "In some cases, we need add additional arguments not defined in the job templates, and we would like to the add to a specific args of certain component. This requires we specify the path to the component. \n", - "\n", - "We use the following notations to indicate the path\n", - "* for single component, we can use dot notation. such as ```model.args.number_classes=2```\n", - "* for component list, we use index notation. such as ```components[1].model.args.number_classes=2```\n", - "\n", - "In the 2nd case, ```components[1]``` indicates the 2nd component of the component list. The first component will be ```component[0]```\n", - "\n", - "Let's look at how do we use this to add or modify the job template. \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b544e5dc-74e5-46de-9efd-8c1b441a4efc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job create -j /tmp/nvflare/my_job -w sag_pt -force" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d616c49-9a50-46d6-bd78-113cd5ca63ab", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
cat /tmp/nvflare/my_job/app/config/config_fed_server.conf" - ] - }, - { - "cell_type": "markdown", - "id": "8317cfbd-1126-4e4c-b5e0-5ebe9b905713", - "metadata": {}, - "source": [ - "Here, we would like to modify the peristor component configuration from \n", - "```\n", - " components = [\n", - " {\n", - " # This is the persistence component used in above workflow.\n", - " # PTFileModelPersistor is a Pytorch persistor which save/read the model to/from file.\n", - "\n", - " id = \"persistor\"\n", - " path = \"nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor\"\n", - "\n", - " # the persitor class take model class as argument\n", - " # This imply that the model is initialized from the server-side.\n", - " # The initialized model will be broadcast to all the clients to start the training.\n", - " args.model.path = \"{model_class_path}\"\n", - " },\n", - "```\n", - "to: \n", - "\n", - "```\n", - " components = [\n", - " {\n", - " id = \"persistor\"\n", - " path = \"nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor\"\n", - " args {\n", - " model { \n", - " path = \"{model_class_path}\"\n", - " args {\n", - " in_channels = 165\n", - " hidden_channels = 256\n", - " num_classes = 2\n", - " num_layers = 3\n", - " }\n", - " }\n", - " }\n", - " },\n", - "```\n", - "Notice, that new models.args are new keys and values. This is Model Persistor is the 1st component. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4ca2e1a-093b-4c9d-b6b8-9d0ab9382b4c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
nvflare job create -j /tmp/nvflare/my_job -w sag_pt -force \\\n", - "-f config_fed_server.conf \\\n", - "components[0].args.model.args.in_channels=165 \\\n", - "components[0].args.model.args.hidden_channels=256 \\\n", - "components[0].args.model.args.num_classes=2 \\\n", - "components[0].args.model.args.num_layers=3" - ] - }, - { - "cell_type": "markdown", - "id": "603dfb03-740d-4df4-afa1-4074405702cd", - "metadata": {}, - "source": [ - "Now, look at the modified configuration again" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9d135b5-ceea-44bd-b9e1-04d9967defa1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! cat /tmp/nvflare/my_job/app/config/config_fed_server.conf" - ] - }, - { - "cell_type": "markdown", - "id": "2c316852-4a61-4ddc-9ae1-42300ecc65d1", - "metadata": {}, - "source": [ - "### Remove configuration\n", - "\n", - "In some cases, we have changed the local training code class contructor, the arguments from the job template's argments need to be removed. We will show you how to do that. \n", - "\n", - "Let's take a look an example: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b245cb9d-90c0-4250-98b6-c9ea6734af76", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job create -j /tmp/nvflare/jobs/my_job -w stats_df -force" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "625655dc-145c-4e7e-992f-8573c7be8529", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
cat /tmp/nvflare/jobs/my_job/app/config/config_fed_client.conf" - ] - }, - { - "cell_type": "markdown", - "id": "3d6425ad-c2d0-4f99-bcce-aa735a77c390", - "metadata": {}, - "source": [ - "Notice that the 1st component: \n", - "```\n", - " \"components\": [\n", - " {\n", - " \"id\": \"df_stats_generator\",\n", - " \"path\": \"df_statistics.DFStatistics\",\n", - " \"args\": {\n", - " \"data_path\": \"data.csv\"\n", - " }\n", - " },\n", - " \n", - " ...\n", - " ]\n", - "```\n", - "The df_stats_generator uses the class at \"df_statistics.DFStatistics\", withj input arguments as _data_path = \"data.csv\". What if I decided to write my local class called \"df_stats.MyStats\" where it only takes \"data_root_dir\". \n", - "\n", - "Now we need to do \n", - "1) change the path of \"df_stats_generator\" to the new class path\n", - "2) remove data_path configuration\n", - "3) add data_root_dir argument. \n", - "\n", - "To remove a configuration key, we can use the **** syntax, i.e add \"-\" at the end of key. The key must be exists and must be expressed in full path. such as\n", - " **\"components[0].args.data_path-\"**\n", - " \n", - "both\n", - " **\"components[0].args.data_path-\"** or \n", - " **\"components[0].args.data_path-=value \"** works, although value will be ignored. \n", - " \n", - " \n", - "> Limitation:\n", - "> **the configuration key removal must be against leaf node key. we can't remove parent key such as \"component[0].args\"**\n", - " \n", - "Let's try this out\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92733f42-10c9-457b-95e2-277a4faf4a89", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! 
nvflare job create -j /tmp/nvflare/jobs/my_job -w stats_df \\\n", - "-f config_fed_client.conf \\\n", - "components[0].path=\"df_stats.MyDFStats\" \\\n", - "components[0].args.data_path- \\\n", - "components[0].args.data_root_dir=\"/tmp/dataset\" -force\n" - ] - }, - { - "cell_type": "markdown", - "id": "74794afa-e863-449f-bb97-a681ffebc3ac", - "metadata": {}, - "source": [ - "Now let's look the configuration file again, notice the arguments have changed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c35b0851-b7cf-49b2-96aa-f188bd9ecfd8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! cat /tmp/nvflare/jobs/my_job/app/config/config_fed_client.conf" - ] - }, - { - "cell_type": "markdown", - "id": "2731e06d", - "metadata": {}, - "source": [ - "### Modify custom configuration\n", - "\n", - "Up to this point, we have been discussing how to create and modify NVFLARE-specific configurations, such as meta.conf, config_fed_client.conf, and config_fed_server.conf. What about custom configurations needed by the training code? The custom configuration files are located in the custom directory of the app. \n", - "\n", - "such as \n", - " app1/custom/my_config.yaml\n", - " app2/custom/my_config.yaml\n", - " app_server/custom/my_config.yaml\n", - " \n", - "In such cases, the configuration file name is arbitrary, and the file format can be any one of JSON, PYHOCON, or OmegaConf. We can still modify these files. Before jumping into the specifics, let's see what format the CLI offers for input files. We can specify the config file in one of the following ways: \n", - "\n", - "```\n", - " -f config_fed_client.conf\n", - " \n", - "```\n", - "\n", - "This implies that the input config file, is default to \"app/config/config_client.conf\". Or \n", - "\n", - "\n", - "```\n", - " -f app1/config_fed_client.conf\n", - " \n", - "```\n", - "This implies that the input config file, is default to \"app1/config/config_client.conf\". 
Or directly spell out the full path\n", - " \n", - " \n", - "```\n", - " -f app1/config/config_fed_client.conf\n", - " \n", - "```\n", - " \n", - "Similarly, we can use \n", - " \n", - "\n", - "```\n", - " -f app1/custom/my_config.yaml\n", - " \n", - "```\n", - "\n", - "Let's use an example to demonstrate this with job template \"sag_nemo\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5febd2e", - "metadata": {}, - "outputs": [], - "source": [ - "! nvflare job create -j /tmp/nvflare/my_job -w sag_nemo -force -sd ../../integration/nemo/examples/peft/nemo_nvflare" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2604462", - "metadata": {}, - "outputs": [], - "source": [ - "! tree /tmp/nvflare/my_job" - ] - }, - { - "cell_type": "markdown", - "id": "ce121ec6", - "metadata": {}, - "source": [ - "Notice we use the example script directory \"code\". CLI create job copied the custom folder to the job folder. And there are many custom configuration yaml files. Notice \"weight_decay\" is one of parameters io megatron_gpt_peft_tuning_config.ymal. Let's change it from 0.01 to 0.02 on app_server. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87be7ee7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job create -j /tmp/nvflare/my_job -w sag_nemo -force \\\n", - " -sd ../../integration/nemo/examples/peft/nemo_nvflare \\\n", - " -f app_server/custom/megatron_gpt_peft_tuning_config.yaml weight_decay=0.02" - ] - }, - { - "cell_type": "markdown", - "id": "bc7195b6", - "metadata": {}, - "source": [ - "Notice that the weight_decay value in the app_server's megatron_gpt_peft_tuning_config.yaml is updated to 0.02. We can also look at the file saved. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1ee9a5e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!cat /tmp/nvflare/my_job/app_server/custom//megatron_gpt_peft_tuning_config.yaml | grep weight_decay" - ] - }, - { - "cell_type": "markdown", - "id": "2a1deaf8-7ee0-4d64-98f9-5eec6102f436", - "metadata": { - "tags": [] - }, - "source": [ - "This works! Alternatively, you can specify the variable path, which can be used to update the exact variable in cases where there are duplicate variables (e.g., two weight_decay variables under different paths). Let's do it again " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce7a9b6a-a5ea-4e6f-9bb9-f3b164db3dcb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "! nvflare job create -j /tmp/nvflare/my_job -w sag_nemo -force \\\n", - " -sd ../../integration/nemo/examples/peft/nemo_nvflare \\\n", - " -f app_server/custom/megatron_gpt_peft_tuning_config.yaml model.optim.weight_decay=0.02" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "071aa7b4-5d2a-4d07-8a7c-c6e8abc76638", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!cat /tmp/nvflare/my_job/app_server/custom//megatron_gpt_peft_tuning_config.yaml | grep weight_decay" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/nvflare_cli.ipynb b/examples/tutorials/nvflare_cli.ipynb new file mode 100644 index 0000000000..58c306caa9 --- /dev/null +++ b/examples/tutorials/nvflare_cli.ipynb @@ -0,0 +1,535 @@ +{ + "cells": [ + { + 
"cell_type": "markdown", + "id": "nvflare-cli-title", + "metadata": {}, + "source": [ + "# NVFlare CLI Tutorial\n", + "\n", + "This notebook walks through a working end-to-end NVFlare CLI workflow against a running two-client POC system. It replaces the older Job CLI tutorial, which focused on deprecated job-template commands.\n", + "\n", + "The notebook itself does not start or stop POC services. Start POC from a terminal first, run the notebook commands, then stop POC from the terminal when finished.\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-terminal-prereq", + "metadata": {}, + "source": [ + "## Terminal Prerequisite\n", + "\n", + "Run these commands in a terminal before running the notebook cells:\n", + "\n", + "```bash\n", + "nvflare poc config --pw /tmp/nvflare_cli_tutorial\n", + "nvflare poc prepare -n 2 --force\n", + "nvflare poc start --timeout 60\n", + "```\n", + "\n", + "`poc prepare` creates the local startup kits, registers the Project Admin startup kit, and makes it active for `nvflare job` and `nvflare system` commands. After finishing the tutorial, stop the POC system from the terminal:\n", + "\n", + "```bash\n", + "nvflare poc stop\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-values-text", + "metadata": {}, + "source": [ + "## Notebook Values\n", + "\n", + "The job export and result download use `/tmp` so rerunning the tutorial does not write into the repository. 
A fresh submit token is generated for each notebook run.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-values", + "metadata": {}, + "outputs": [], + "source": [ + "from uuid import uuid4\n", + "\n", + "JOB_FOLDER = \"/tmp/nvflare/jobs/job_config/hello-pt\"\n", + "STUDY_NAME = f\"cli-tutorial-{uuid4().hex[:8]}\"\n", + "RESULT_DIR = f\"/tmp/nvflare_cli_tutorial/results-{uuid4().hex[:8]}\"\n", + "SUBMIT_TOKEN = f\"nvflare-cli-tutorial-hello-pt-{uuid4().hex}\"\n", + "ABORT_SUBMIT_TOKEN = f\"nvflare-cli-tutorial-abort-{uuid4().hex}\"\n", + "\n", + "print(f\"JOB_FOLDER={JOB_FOLDER}\")\n", + "print(f\"STUDY_NAME={STUDY_NAME}\")\n", + "print(f\"RESULT_DIR={RESULT_DIR}\")\n", + "print(f\"SUBMIT_TOKEN={SUBMIT_TOKEN}\")\n", + "print(f\"ABORT_SUBMIT_TOKEN={ABORT_SUBMIT_TOKEN}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-check-cli", + "metadata": {}, + "source": [ + "## Check the CLI Surface\n", + "\n", + "Use `--help` and `--schema` to verify the active command interface. In a source checkout, the version flag can reflect local Git tag state, so this tutorial checks the commands directly.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-help", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare --help\n", + "!nvflare job --help\n", + "!nvflare job submit --schema\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-recipes", + "metadata": {}, + "source": [ + "## Discover Recipes\n", + "\n", + "The current workflow is recipe-based: export or create a job folder, then submit it with the CLI. 
`recipe list` and `recipe show` are the replacement for the old job-template discovery flow.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-recipes-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare recipe list\n", + "!nvflare recipe show fedavg\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-config", + "metadata": {}, + "source": [ + "## Verify Startup-Kit Configuration\n", + "\n", + "These commands should show the Project Admin startup kit registered by `nvflare poc prepare`. If there is no active startup kit, return to the terminal prerequisite and run `poc prepare` again.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-config-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare config list\n", + "!nvflare config inspect\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-system", + "metadata": {}, + "source": [ + "## Verify the Running FL System\n", + "\n", + "Now confirm the server and both POC clients are reachable. This is the first meaningful check that the notebook is connected to a live FL system.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-system-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare system status\n", + "!nvflare system resources server\n", + "!nvflare system resources client site-1 site-2\n", + "!nvflare system version --site all\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-system-log", + "metadata": {}, + "source": [ + "## Configure System Logging\n", + "\n", + "`system log-config` changes the runtime log level or log mode for the server and client sites. 
This tutorial applies `INFO`, which is the normal operating level and is safe to run repeatedly.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-system-log-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare system log-config INFO --site all\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-study", + "metadata": {}, + "source": [ + "## Work with Study Scope\n", + "\n", + "A study scopes site membership, user access, and job history. This notebook creates a temporary named study with the two POC client sites, submits the job to that study, and removes the study after downloading the result. Production users follow the same pattern with collaboration-specific study names and site membership.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-study-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare study --help\n", + "!nvflare study list\n", + "!nvflare study register {STUDY_NAME} --site-org nvidia:site-1,site-2\n", + "!nvflare study show {STUDY_NAME}\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-export", + "metadata": {}, + "source": [ + "## Export a Real Job Folder\n", + "\n", + "This tutorial uses `examples/hello-world/hello-pt`, a PyTorch FedAvg job. The export command enables job log streaming so the later `job logs` cell can retrieve server and client logs. It also uses synthetic data so the tutorial can run without downloading CIFAR-10. 
The `cd` command keeps the export self-contained: `job.py` runs from its example directory so it can find `client.py` and `model.py`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-export-code", + "metadata": {}, + "outputs": [], + "source": [ + "!cd ../hello-world/hello-pt && python job.py --export_config --enable_log_streaming --synthetic_data --train_size 2048 --test_size 256 --num_rounds 2 --epochs 1 --batch_size 64 --num_workers 0\n", + "!find {JOB_FOLDER} -maxdepth 3 -type f | sort | head -30\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-submit", + "metadata": {}, + "source": [ + "## Submit the Job\n", + "\n", + "`job submit` returns immediately with a `job_id`. The submit token makes this operation safe to retry for the same job content and submitter.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-submit-code", + "metadata": {}, + "outputs": [], + "source": [ + "submit_result = !nvflare --format json job submit -j {JOB_FOLDER} --study {STUDY_NAME} --submit-token {SUBMIT_TOKEN}\n", + "print(\"\\n\".join(submit_result))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-job-id-code", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "submit_response = json.loads(\"\\n\".join(submit_result))\n", + "if submit_response.get(\"status\") != \"ok\":\n", + " raise RuntimeError(json.dumps(submit_response, indent=2))\n", + "\n", + "submit_data = submit_response.get(\"data\", {})\n", + "JOB_ID = submit_data.get(\"job_id\") or submit_data.get(\"existing_job_id\")\n", + "if not JOB_ID:\n", + " raise RuntimeError(json.dumps(submit_response, indent=2))\n", + "print(f\"JOB_ID={JOB_ID}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-job-log", + "metadata": {}, + "source": [ + "## Configure Job Logging\n", + "\n", + "`job log` is an alias for `job log-config`. 
It changes runtime logging for an active job in the selected study. Because this tutorial job is intentionally short, the command may find that the job has already completed; that is handled as a non-fatal timing case.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-job-log-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare job log {JOB_ID} INFO --study {STUDY_NAME} --site all\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-token-recovery", + "metadata": {}, + "source": [ + "## Recover the Job ID from the Submit Token\n", + "\n", + "If automation loses the original submit response, it can recover the submitted job by querying with the same submit token.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-token-recovery-code", + "metadata": {}, + "outputs": [], + "source": [ + "lookup_result = !nvflare --format json job list --study {STUDY_NAME} --submit-token {SUBMIT_TOKEN}\n", + "print(\"\\n\".join(lookup_result))\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-list-monitor", + "metadata": {}, + "source": [ + "## List, Monitor, and Wait\n", + "\n", + "Use `job list` for recent job history, `job stats` while the job is running, `job monitor` for interactive progress, and `job wait` for automation that needs a final command result. Each command below passes `--study {STUDY_NAME}` so the job lookup happens in the intended study. 
The tutorial job is intentionally small, so it can finish before the stats command samples it on fast systems or when cells are run slowly.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-list-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare job list --study {STUDY_NAME} -m 5\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-stats-code", + "metadata": {}, + "outputs": [], + "source": [ + "stats_result = !nvflare --format json job stats {JOB_ID} --study {STUDY_NAME} --site all\n", + "print(\"\\n\".join(stats_result))\n", + "\n", + "stats_response = json.loads(\"\\n\".join(stats_result))\n", + "if stats_response.get(\"status\") != \"ok\":\n", + " not_running = stats_response.get(\"error_code\") == \"JOB_NOT_RUNNING\" or (\n", + " stats_response.get(\"error_code\") == \"INTERNAL_ERROR\" and \"is not running\" in stats_response.get(\"message\", \"\")\n", + " )\n", + " if not_running:\n", + " print(\"The job completed before live stats were sampled; continue with monitor/wait for terminal status.\")\n", + " else:\n", + " raise RuntimeError(json.dumps(stats_response, indent=2))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-monitor-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare job monitor {JOB_ID} --study {STUDY_NAME} --timeout 120 --interval 5\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-wait-code", + "metadata": {}, + "outputs": [], + "source": [ + "wait_result = !nvflare --format json job wait {JOB_ID} --study {STUDY_NAME} --timeout 600 --interval 5\n", + "print(\"\\n\".join(wait_result))\n", + "\n", + "wait_response = json.loads(\"\\n\".join(wait_result))\n", + "if wait_response.get(\"status\") != \"ok\":\n", + " raise RuntimeError(json.dumps(wait_response, indent=2))\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-inspect-job", + "metadata": {}, + 
"source": [ + "## Inspect the Completed Job\n", + "\n", + "After the job reaches a terminal state, inspect its metadata and retrieve streamed server/client logs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-meta-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare job meta {JOB_ID} --study {STUDY_NAME}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-logs-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare job logs {JOB_ID} --study {STUDY_NAME} --sites all --tail 80\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-download", + "metadata": {}, + "source": [ + "## Download Job Results\n", + "\n", + "Download the server-side job artifacts and inspect the downloaded files.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-download-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare job download {JOB_ID} --study {STUDY_NAME} --output-dir {RESULT_DIR} --force\n", + "!find {RESULT_DIR} -maxdepth 3 -type f | sort | head -50\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-abort", + "metadata": {}, + "source": [ + "## Abort a Running Job\n", + "\n", + "Submit a second job in the same study and abort it. The abort demo uses a longer synthetic run so the job is still active when `job abort` is issued. `job wait` reports `JOB_ABORTED` for an aborted terminal state; that is the expected result here. 
The only Python below extracts the generated job id so the following CLI commands can reference it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-abort-code", + "metadata": {}, + "outputs": [], + "source": [ + "!cd ../hello-world/hello-pt && python job.py --export_config --enable_log_streaming --synthetic_data --train_size 8192 --test_size 512 --num_rounds 20 --epochs 1 --batch_size 64 --num_workers 0\n", + "\n", + "abort_submit_result = !nvflare --format json job submit -j {JOB_FOLDER} --study {STUDY_NAME} --submit-token {ABORT_SUBMIT_TOKEN}\n", + "print(\"\\n\".join(abort_submit_result))\n", + "\n", + "ABORT_JOB_ID = json.loads(\"\\n\".join(abort_submit_result))[\"data\"][\"job_id\"]\n", + "print(f\"ABORT_JOB_ID={ABORT_JOB_ID}\")\n", + "\n", + "!nvflare job abort {ABORT_JOB_ID} --study {STUDY_NAME} --force\n", + "!nvflare job wait {ABORT_JOB_ID} --study {STUDY_NAME} --timeout 120 --interval 5\n", + "!nvflare job meta {ABORT_JOB_ID} --study {STUDY_NAME}\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-cleanup", + "metadata": {}, + "source": [ + "## Clean Up the Study\n", + "\n", + "After downloading the result and aborting the second job, remove the remote job records and delete the temporary study created for this notebook. The downloaded files remain in `RESULT_DIR`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-cleanup-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare job delete {ABORT_JOB_ID} --study {STUDY_NAME} --force\n", + "!nvflare job delete {JOB_ID} --study {STUDY_NAME} --force\n", + "!nvflare study remove {STUDY_NAME}\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-more-commands", + "metadata": {}, + "source": [ + "## Other CLI Command Groups\n", + "\n", + "The end-to-end workflow above used `config`, `study`, `system`, `recipe`, and `job`. 
The same `nvflare` CLI also includes command groups for distributed provisioning, package assembly, and deployment preparation. The commands below are read-only help commands so they are safe to run in this notebook.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nvflare-cli-more-commands-code", + "metadata": {}, + "outputs": [], + "source": [ + "!nvflare cert --help\n", + "!nvflare package --help\n", + "!nvflare deploy prepare --help\n" + ] + }, + { + "cell_type": "markdown", + "id": "nvflare-cli-migration", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Quick Migration Reference\n", + "\n", + "| Older tutorial command | Current workflow |\n", + "| --- | --- |\n", + "| `nvflare job list-templates` | `nvflare recipe list` and `nvflare recipe show <recipe_name>` |\n", + "| `nvflare job create` | Create/export a job folder with Job Recipe API code or an example script, then submit it |\n", + "| `nvflare job show-variables` | Inspect the recipe, example code, or generated job folder directly |\n", + "| Repeating startup-kit paths in each command | POC: `nvflare poc prepare` registers the Project Admin kit. Production: register a real admin startup kit with `nvflare config add/use`. |\n", + "| Implicit default job scope | Pass `--study <study_name>` on `job submit/list/monitor/wait/meta/logs/stats/download/clone/abort/delete` when working outside the default study. 
|\n", + "| Admin Console-only job operations | `nvflare job submit/list/monitor/wait/meta/logs/stats/download/clone/abort/delete` |\n", + "| Admin Console-only system checks | `nvflare system status/resources/version/log-config/restart/shutdown` |\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/system_architecture.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/system_architecture.ipynb index cd0adfa14e..287501e5b2 100644 --- a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/system_architecture.ipynb +++ b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/system_architecture.ipynb @@ -351,7 +351,7 @@ "\n", "### Job Templates\n", "\n", - "Job templates provide predefined configurations that can be customized for specific needs. They simplify the process of creating job configurations by providing starting points for common scenarios. You can leverage existing [job templates](https://github.com/NVIDIA/NVFlare/tree/main/job_templates) which are a set of predefined configurations and use the [job CLI](https://github.com/NVIDIA/NVFlare/blob/main/examples/tutorials/job_cli.ipynb) to customize to your needs. 
\n", + "Job templates provide predefined configurations that can be customized for specific needs. They simplify the process of creating job configurations by providing starting points for common scenarios. You can leverage existing [job templates](https://github.com/NVIDIA/NVFlare/tree/main/job_templates) and see the [NVFlare CLI tutorial](https://github.com/NVIDIA/NVFlare/blob/main/examples/tutorials/nvflare_cli.ipynb) for the current command-line workflow. \n", "\n", "#### Template Structure\n", "\n", @@ -387,7 +387,7 @@ "\n", "You can use the NVIDIA FLARE job CLI to view and modify templates when creating job configurations. This provides a user-friendly way to customize templates for your specific needs.\n", "\n", - "For more information, see the [job CLI tutorial](../../../../job_cli.ipynb).\n", + "For more information, see the [NVFlare CLI tutorial](../../../../nvflare_cli.ipynb).\n", "\n", "## Summary\n", "\n", diff --git a/nvflare/tool/job/job_cli.py b/nvflare/tool/job/job_cli.py index b85d63c13f..3890b4e74d 100644 --- a/nvflare/tool/job/job_cli.py +++ b/nvflare/tool/job/job_cli.py @@ -1838,7 +1838,7 @@ def _is_terminal_job_status(status: str) -> bool: def cmd_job_stats(cmd_args): - from nvflare.fuel.flare_api.api_spec import AuthenticationError, JobNotFound, NoConnection + from nvflare.fuel.flare_api.api_spec import AuthenticationError, JobNotFound, JobNotRunning, NoConnection from nvflare.tool.cli_output import output_error, output_ok from nvflare.tool.cli_schema import handle_schema_flag @@ -1876,6 +1876,9 @@ def cmd_job_stats(cmd_args): hint=_job_not_found_hint(study), ) return + except JobNotRunning: + output_error("JOB_NOT_RUNNING", job_id=cmd_args.job_id) + return except AuthenticationError: raise except NoConnection as e: diff --git a/tests/unit_test/tool/job/job_stats_test.py b/tests/unit_test/tool/job/job_stats_test.py index c1a8e62f14..6dec4d83d4 100644 --- a/tests/unit_test/tool/job/job_stats_test.py +++ 
b/tests/unit_test/tool/job/job_stats_test.py @@ -18,7 +18,7 @@ import pytest -from nvflare.fuel.flare_api.api_spec import AuthenticationError, JobNotFound, NoConnection +from nvflare.fuel.flare_api.api_spec import AuthenticationError, JobNotFound, JobNotRunning, NoConnection from nvflare.tool import cli_output @@ -117,6 +117,25 @@ def test_stats_connection_failed_exits_2(self, capsys): assert envelope["error_code"] == "CONNECTION_FAILED" assert envelope["exit_code"] == 2 + def test_stats_job_not_running_exits_1(self, capsys): + """JobNotRunning maps to JOB_NOT_RUNNING, exit 1.""" + from nvflare.tool.job.job_cli import cmd_job_stats + + mock_sess = MagicMock() + mock_sess.show_stats.side_effect = JobNotRunning("job is not running") + + with patch("nvflare.tool.job.job_cli._session", side_effect=self._fake_session(mock_sess)): + with pytest.raises(SystemExit) as exc_info: + cmd_job_stats(_make_args(job_id="abc123")) + assert exc_info.value.code == 1 + + captured = capsys.readouterr() + envelope = json.loads(captured.out) + assert envelope["status"] == "error" + assert envelope["error_code"] == "JOB_NOT_RUNNING" + assert envelope["exit_code"] == 1 + assert "abc123" in envelope["message"] + def test_stats_authentication_error_propagates(self): from nvflare.tool.job.job_cli import cmd_job_stats From d023456d77eb3b59420df28373d87de566e11763 Mon Sep 17 00:00:00 2001 From: chesterxgchen Date: Tue, 19 May 2026 07:34:43 -0700 Subject: [PATCH 2/3] [2.8] Address CLI tutorial review comments --- examples/hello-world/hello-pt/client.py | 2 + examples/tutorials/nvflare_cli.ipynb | 10 +++- nvflare/tool/job/job_cli.py | 12 ++++- .../examples/hello_pt_client_test.py | 54 +++++++++++++++++++ tests/unit_test/tool/job/job_stats_test.py | 1 + 5 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 tests/unit_test/examples/hello_pt_client_test.py diff --git a/examples/hello-world/hello-pt/client.py b/examples/hello-world/hello-pt/client.py index ec36704151..f72511f093 100644 
--- a/examples/hello-world/hello-pt/client.py +++ b/examples/hello-world/hello-pt/client.py @@ -45,6 +45,8 @@ def evaluate(net, data_loader, device): total += labels.size(0) correct += (predicted == labels).sum().item() + if total == 0: + raise ValueError("Evaluation data_loader produced no samples; check data preparation and --test_size.") accuracy = 100 * correct // total print(f"Accuracy of the network on {total} test images: {accuracy} %") return accuracy diff --git a/examples/tutorials/nvflare_cli.ipynb b/examples/tutorials/nvflare_cli.ipynb index 58c306caa9..b838a779b4 100644 --- a/examples/tutorials/nvflare_cli.ipynb +++ b/examples/tutorials/nvflare_cli.ipynb @@ -331,6 +331,7 @@ "\n", "stats_response = json.loads(\"\\n\".join(stats_result))\n", "if stats_response.get(\"status\") != \"ok\":\n", + " # Older servers reported a completed job as an INTERNAL_ERROR with an unstructured message.\n", " not_running = stats_response.get(\"error_code\") == \"JOB_NOT_RUNNING\" or (\n", " stats_response.get(\"error_code\") == \"INTERNAL_ERROR\" and \"is not running\" in stats_response.get(\"message\", \"\")\n", " )\n", @@ -438,7 +439,14 @@ "abort_submit_result = !nvflare --format json job submit -j {JOB_FOLDER} --study {STUDY_NAME} --submit-token {ABORT_SUBMIT_TOKEN}\n", "print(\"\\n\".join(abort_submit_result))\n", "\n", - "ABORT_JOB_ID = json.loads(\"\\n\".join(abort_submit_result))[\"data\"][\"job_id\"]\n", + "abort_response = json.loads(\"\\n\".join(abort_submit_result))\n", + "if abort_response.get(\"status\") != \"ok\":\n", + " raise RuntimeError(json.dumps(abort_response, indent=2))\n", + "\n", + "abort_data = abort_response.get(\"data\", {})\n", + "ABORT_JOB_ID = abort_data.get(\"job_id\") or abort_data.get(\"existing_job_id\")\n", + "if not ABORT_JOB_ID:\n", + " raise RuntimeError(json.dumps(abort_response, indent=2))\n", "print(f\"ABORT_JOB_ID={ABORT_JOB_ID}\")\n", "\n", "!nvflare job abort {ABORT_JOB_ID} --study {STUDY_NAME} --force\n", diff --git 
a/nvflare/tool/job/job_cli.py b/nvflare/tool/job/job_cli.py index 3890b4e74d..232d760104 100644 --- a/nvflare/tool/job/job_cli.py +++ b/nvflare/tool/job/job_cli.py @@ -1516,7 +1516,11 @@ def cmd_job_abort(cmd_args): ) return except JobNotRunning: - output_error("JOB_NOT_RUNNING", job_id=cmd_args.job_id) + output_error( + "JOB_NOT_RUNNING", + job_id=cmd_args.job_id, + detail="abort is available only while the job is running", + ) return except AuthenticationError: raise @@ -1877,7 +1881,11 @@ def cmd_job_stats(cmd_args): ) return except JobNotRunning: - output_error("JOB_NOT_RUNNING", job_id=cmd_args.job_id) + output_error( + "JOB_NOT_RUNNING", + job_id=cmd_args.job_id, + detail="stats are available only while the job is running", + ) return except AuthenticationError: raise diff --git a/tests/unit_test/examples/hello_pt_client_test.py b/tests/unit_test/examples/hello_pt_client_test.py new file mode 100644 index 0000000000..ff2f97073a --- /dev/null +++ b/tests/unit_test/examples/hello_pt_client_test.py @@ -0,0 +1,54 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.util +import os +import sys + +import pytest + +HAS_PT_DEPS = all(importlib.util.find_spec(dep) is not None for dep in ("torch", "torchvision")) +pytestmark = pytest.mark.skipif(not HAS_PT_DEPS, reason="PyTorch example dependencies are not installed") + + +def _load_hello_pt_module(file_name: str, module_name: str): + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) + example_dir = os.path.join(repo_root, "examples", "hello-world", "hello-pt") + module_path = os.path.join(example_dir, file_name) + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + + original_model_module = sys.modules.pop("model", None) + sys.path.insert(0, example_dir) + try: + spec.loader.exec_module(module) + except RuntimeError as e: + if "torchvision" in str(e): + pytest.skip(f"PyTorch example dependency is unavailable: {e}") + raise + finally: + sys.path.pop(0) + if original_model_module is not None: + sys.modules["model"] = original_model_module + else: + sys.modules.pop("model", None) + return module + + +def test_hello_pt_evaluate_rejects_empty_data_loader(): + client_module = _load_hello_pt_module("client.py", "hello_pt_client") + + with pytest.raises(ValueError, match="Evaluation data_loader produced no samples"): + client_module.evaluate(net=None, data_loader=[], device="cpu") diff --git a/tests/unit_test/tool/job/job_stats_test.py b/tests/unit_test/tool/job/job_stats_test.py index 6dec4d83d4..c4bdf9d302 100644 --- a/tests/unit_test/tool/job/job_stats_test.py +++ b/tests/unit_test/tool/job/job_stats_test.py @@ -135,6 +135,7 @@ def test_stats_job_not_running_exits_1(self, capsys): assert envelope["error_code"] == "JOB_NOT_RUNNING" assert envelope["exit_code"] == 1 assert "abc123" in envelope["message"] + assert "stats are available only while the job is running" in envelope["message"] def 
test_stats_authentication_error_propagates(self): from nvflare.tool.job.job_cli import cmd_job_stats From 5a66aee60a2e74de0c3556096cbbef2a15d90595 Mon Sep 17 00:00:00 2001 From: chesterxgchen Date: Tue, 19 May 2026 07:46:53 -0700 Subject: [PATCH 3/3] [2.8] Use recipe export args in hello-pt docs --- docs/examples/hello_pt_job_api.rst | 60 +++++++++++++++++-------- examples/hello-world/hello-pt/README.md | 14 ++++++ examples/hello-world/hello-pt/job.py | 19 +++----- examples/tutorials/nvflare_cli.ipynb | 6 +-- 4 files changed, 64 insertions(+), 35 deletions(-) diff --git a/docs/examples/hello_pt_job_api.rst b/docs/examples/hello_pt_job_api.rst index 1738ae613f..74f5089eac 100644 --- a/docs/examples/hello_pt_job_api.rst +++ b/docs/examples/hello_pt_job_api.rst @@ -56,27 +56,46 @@ To run this example: .. code-block:: shell - $ python fedavg_script_runner_pt.py + $ python job.py -The script will create an NVFlare job in /tmp/nvflare/jobs/job_config/hello-pt_cifar10_fedavg -and run it using the FL Simulator. +The script creates an NVFlare job recipe and runs it using the FL Simulator. + +To export the job folder for submission to a running FL system, use the standard Recipe API export flags: + +.. code-block:: shell + + $ python job.py --export --export-dir /tmp/nvflare/jobs/job_config + +The exported job is written to ``/tmp/nvflare/jobs/job_config/hello-pt``. +You can combine the export flags with example-specific options, for example: + +.. code-block:: shell + + $ python job.py --export --export-dir /tmp/nvflare/jobs/job_config \ + --enable_log_streaming --synthetic_data --train_size 2048 --test_size 256 \ + --num_rounds 2 --epochs 1 --batch_size 64 --num_workers 0 NVIDIA FLARE Job API -------------------- -The ``fedavg_script_runner_pt.py`` script for this hello-pt example is very similar to the ``fedavg_script_runner_hello-numpy.py`` script -for the :doc:`Hello NumPy ` exercise. 
Other than changes to the names of the job and client script, the only difference -is a line to define the initial global model for the server: +The ``job.py`` script for this hello-pt example defines a :class:`FedAvgRecipe`. +The recipe combines the PyTorch model, client training script, and simulator/export behavior: .. code-block:: python - # Define the initial global model and send to server - job.to(SimpleNetwork(), "server") + recipe = FedAvgRecipe( + name="hello-pt", + min_clients=n_clients, + num_rounds=num_rounds, + model=SimpleNetwork(), + train_script="client.py", + train_args=train_args, + ) NVIDIA FLARE Client Training Script ------------------------------------ -The training script for this example, ``hello-pt_cifar10_fl.py``, is the main script that will be run on the clients. It contains the PyTorch specific +The training script for this example, ``client.py``, is the main script that will be run on the clients. It contains the PyTorch specific logic for training. Neural Network @@ -90,7 +109,7 @@ Let's see the simplified CIFAR10 model used in this example: - :github_nvflare_link:`model.py ` This ``SimpleNetwork`` class is your convolutional neural network to train with the CIFAR10 dataset. -This is not related to NVIDIA FLARE, so we implement it in a file called ``simple_network.py``. +This is not related to NVIDIA FLARE, so we implement it in a file called ``model.py``. Dataset & Setup ^^^^^^^^^^^^^^^^ @@ -101,7 +120,7 @@ the dataset we will be using on each client. Additionally, you need to set up the optimizer, loss function and transform to process the data. You can think of all of this code as part of your local training loop, as every deep learning training has a similar setup. -In the ``hello-pt_cifar10_fl.py`` script, we take care of all of this setup before the ``flare.init()``. +In the ``client.py`` script, we take care of all of this setup before the ``flare.init()``. 
Local Train ^^^^^^^^^^^ @@ -137,7 +156,7 @@ Now with the network and dataset setup, let's also implement the local training flare.send(output_model) -The code above is simplified from the ``hello-pt_cifar10_fl.py`` script to focus on the three essential methods of the NVFlare's Client API to +The code above is simplified from the ``client.py`` script to focus on the three essential methods of NVFlare's Client API to achieve the training workflow: - `init()`: Initializes NVFlare Client API environment. @@ -148,9 +167,9 @@ NVIDIA FLARE Server & Application --------------------------------- In this example, the server runs :class:`FedAvg` with the default settings. -If you export the job with the :func:`export` function, you will see the +If you export the job with ``python job.py --export --export-dir <export_dir>``, you will see the configurations for the server and each client. The server configuration is ``config_fed_server.json`` in the config folder -in app_server: +in the exported app folder: .. code-block:: json @@ -161,6 +180,7 @@ in app_server: "id": "controller", "path": "nvflare.app_common.workflows.fedavg.FedAvg", "args": { + "aggregation_weights": {}, "num_clients": 2, "num_rounds": 2 } @@ -185,6 +205,7 @@ in app_server: "path": "nvflare.app_opt.tracking.tb.tb_receiver.TBAnalyticsReceiver", "args": { "events": [ + "analytix_log_stats", "fed.analytix_log_stats" ] } @@ -194,13 +215,13 @@ in app_server: "path": "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor", "args": { "model": { - "path": "src.simple_network.SimpleNetwork", + "path": "model.SimpleNetwork", "args": {} } } }, { - "id": "model_locator", + "id": "locator", "path": "nvflare.app_opt.pt.file_model_locator.PTFileModelLocator", "args": { "pt_persistor_id": "persistor" @@ -213,8 +234,8 @@ in app_server: This is automatically created by the Job API. The server application configuration leverages NVIDIA FLARE built-in components. -Note that ``persistor`` points to ``PTFileModelPersistor``. 
This is automatically configured when the model SimpleNetwork is added -to the server with the :func:`to` function. The Job API detects that the model is a PyTorch model +Note that ``persistor`` points to ``PTFileModelPersistor``. This is automatically configured from the +``SimpleNetwork`` model supplied to the recipe. The Job API detects that the model is a PyTorch model and automatically configures :class:`PTFileModelPersistor` and :class:`PTFileModelLocator`. @@ -236,7 +257,8 @@ The client configuration is ``config_fed_client.json`` in the config folder of e "executor": { "path": "nvflare.app_opt.pt.in_process_client_api_executor.PTInProcessClientAPIExecutor", "args": { - "task_script_path": "src/hello-pt_cifar10_fl.py" + "task_script_path": "client.py", + "task_script_args": "--batch_size 16 --epochs 2 --num_workers 2" } } } diff --git a/examples/hello-world/hello-pt/README.md b/examples/hello-world/hello-pt/README.md index 2d8146189a..b5dc4fcecc 100644 --- a/examples/hello-world/hello-pt/README.md +++ b/examples/hello-world/hello-pt/README.md @@ -184,6 +184,20 @@ The cross-site evaluation results can be viewed with: cat /tmp/nvflare/simulation/hello-pt/server/simulate_job/cross_site_val/cross_val_results.json ``` +To export the job folder for submission to a running FL system, use the standard Recipe API export flags: + +``` +python job.py --export --export-dir /tmp/nvflare/jobs/job_config +``` + +The exported job is written to `/tmp/nvflare/jobs/job_config/hello-pt`. You can combine the export flags with the example-specific arguments, for example: + +``` +python job.py --export --export-dir /tmp/nvflare/jobs/job_config \ + --enable_log_streaming --synthetic_data --train_size 2048 --test_size 256 \ + --num_rounds 2 --epochs 1 --batch_size 64 --num_workers 0 +``` + > **Note:** Depending on the number of clients, you might run into errors if several clients try to download the data at the same time. 
It is suggested to pre-download the data to avoid such errors. ## Notebook diff --git a/examples/hello-world/hello-pt/job.py b/examples/hello-world/hello-pt/job.py index 0bd78e1b9d..14db722852 100644 --- a/examples/hello-world/hello-pt/job.py +++ b/examples/hello-world/hello-pt/job.py @@ -36,7 +36,6 @@ def define_parser(): parser.add_argument("--test_size", type=int, default=10000) parser.add_argument("--train_script", type=str, default="client.py") parser.add_argument("--cross_site_eval", action="store_true") - parser.add_argument("--export_config", action=argparse.BooleanOptionalAction, default=False) parser.add_argument("--enable_log_streaming", action=argparse.BooleanOptionalAction, default=False) parser.add_argument( "--launch_external_process", @@ -85,18 +84,12 @@ def main(): if args.enable_log_streaming: recipe.enable_log_streaming() - if args.export_config: - job_dir = "/tmp/nvflare/jobs/job_config" - recipe.export(job_dir) - print(f"Job config exported to {job_dir}") - else: - # Run FL simulation - env = SimEnv(num_clients=n_clients) - run = recipe.execute(env) - print() - print("Job Status is:", run.get_status()) - print("Result can be found in :", run.get_result()) - print() + env = SimEnv(num_clients=n_clients) + run = recipe.execute(env) + print() + print("Job Status is:", run.get_status()) + print("Result can be found in :", run.get_result()) + print() if __name__ == "__main__": diff --git a/examples/tutorials/nvflare_cli.ipynb b/examples/tutorials/nvflare_cli.ipynb index b838a779b4..6fe3e80e7b 100644 --- a/examples/tutorials/nvflare_cli.ipynb +++ b/examples/tutorials/nvflare_cli.ipynb @@ -203,7 +203,7 @@ "source": [ "## Export a Real Job Folder\n", "\n", - "This tutorial uses `examples/hello-world/hello-pt`, a PyTorch FedAvg job. The export command enables job log streaming so the later `job logs` cell can retrieve server and client logs. It also uses synthetic data so the tutorial can run without downloading CIFAR-10. 
The `cd` command keeps the export self-contained: `job.py` runs from its example directory so it can find `client.py` and `model.py`.\n" + "This tutorial uses `examples/hello-world/hello-pt`, a PyTorch FedAvg job. The recipe-level export command enables job log streaming so the later `job logs` cell can retrieve server and client logs. It also uses synthetic data so the tutorial can run without downloading CIFAR-10. The `cd` command keeps the export self-contained: `job.py` runs from its example directory so it can find `client.py` and `model.py`.\n" ] }, { @@ -213,7 +213,7 @@ "metadata": {}, "outputs": [], "source": [ - "!cd ../hello-world/hello-pt && python job.py --export_config --enable_log_streaming --synthetic_data --train_size 2048 --test_size 256 --num_rounds 2 --epochs 1 --batch_size 64 --num_workers 0\n", + "!cd ../hello-world/hello-pt && python job.py --export --export-dir /tmp/nvflare/jobs/job_config --enable_log_streaming --synthetic_data --train_size 2048 --test_size 256 --num_rounds 2 --epochs 1 --batch_size 64 --num_workers 0\n", "!find {JOB_FOLDER} -maxdepth 3 -type f | sort | head -30\n" ] }, @@ -434,7 +434,7 @@ "metadata": {}, "outputs": [], "source": [ - "!cd ../hello-world/hello-pt && python job.py --export_config --enable_log_streaming --synthetic_data --train_size 8192 --test_size 512 --num_rounds 20 --epochs 1 --batch_size 64 --num_workers 0\n", + "!cd ../hello-world/hello-pt && python job.py --export --export-dir /tmp/nvflare/jobs/job_config --enable_log_streaming --synthetic_data --train_size 8192 --test_size 512 --num_rounds 20 --epochs 1 --batch_size 64 --num_workers 0\n", "\n", "abort_submit_result = !nvflare --format json job submit -j {JOB_FOLDER} --study {STUDY_NAME} --submit-token {ABORT_SUBMIT_TOKEN}\n", "print(\"\\n\".join(abort_submit_result))\n",